2022-07-22-【迁移】从差异基因到RRA聚合.md 9.7 KB


title: 【迁移】从差异基因到RRA聚合 urlname: cong-cha-yi-ji-yin-dao-RRA-ju-he date: 2022-07-22 18:46:32 index_img: https://api.limour.top/randomImg?d=2022-07-22 18:46:32

tags: ['DEGs', 'RRA']

通过比对,我们得到了counts矩阵,接下来可以进行DEGs分析。此时如果我们有多组之间的对比,则可以使用RRA算法来聚合我们的结果。

RRA的安装

conda create -n tcga -c conda-forge r-base=4.1.2 -y
conda activate tcga
conda install -c conda-forge r-rvest=1.0.2 -y
conda install -c conda-forge r-xml=3.99_0.8 -y
conda install -c conda-forge r-rcpparmadillo=0.10.8.1.0 -y
conda install -c conda-forge r-bh=1.78.0_0 -y
conda install -c conda-forge r-biocmanager=1.30.16 -y
conda install -c bioconda bioconductor-summarizedexperiment=1.24.0 -y
conda install -c bioconda bioconductor-tcgabiolinks=2.22.1 -y
conda install -c bioconda bioconductor-deseq2=1.34.0 -y
conda install -c bioconda bioconductor-rhdf5=2.38.0 -y
conda install -c bioconda bioconductor-limma=3.50.1 -y
conda install -c bioconda bioconductor-apeglm=1.16.0 -y
conda install -c bioconda r-sleuth=0.30.0 -y
conda install -c bioconda r-wasabi=1.0.1 -y
conda install -c conda-forge r-irkernel=1.3 -y
conda install -c conda-forge r-ashr=2.2_54 -y
conda install -c conda-forge r-robustrankaggreg=1.1 -y
conda install -c conda-forge r-devtools=2.4.3 -y
IRkernel::installspec(name='tcga', displayname='r-tcga')
  • deseq2 数据要求:低生物学重复 & raw counts;假定负二项分布;适合高通量测序数据
  • sleuth 数据要求:Kallisto输出的结果
  • limma 数据要求:logCPM;假定正态分布;适合芯片数据
  • fpkm数据差异基因分析 :理论上是不能进行分析的,无计可施时可以参考
  • 高生物学重复请直接使用 wilcox.test 以避免大量假阳性
  • 多数据集结果整合:RobustRankAggreg

fpkm转tpm示例(基于 SummarizedExperiment 数据框架)

fpkmToTpm <- function(fpkm){
    exp(log(fpkm) - log(sum(fpkm)) + log(1e6))
}
f_fpkmToTpm <- function(l_e){
     apply(l_e,2,fpkmToTpm)
}
assay(sce, "TPM") <- f_fpkmToTpm(assay(sce, "HTSeq - FPKM"))

第一步,多组差异基因分析

library(DESeq2)
count_all <- read.csv("~/upload/zl_liu/star_data/yyz_01/yyz_01.csv",header=TRUE)
count_all
cts_b <- count_all[ ,c(-1,-2,-3)]
rownames(cts_b) <- count_all$ID
keep <- rowSums(cts_b) > ncol(cts_b)
cts_b[keep,]
f_DESeq2 <- function(cts_bb, rowInfo, ControlN, TreatN, rm.NA=T){
    cts_b <- cts_bb[,c(ControlN, TreatN)]
    conditions <- factor(c(rep("Control",length(ControlN)), rep("Treat",length(TreatN)))) 
    colData_b <- data.frame(row.names = colnames(cts_b), conditions)
    print(colData_b)
    dds <- DESeqDataSetFromMatrix(countData = cts_b,
                              colData = colData_b,
                              design = ~ conditions)
    dds <- DESeq(dds)
    res <- results(dds)
    rres <- cbind(rowInfo, data.frame(res))
    if(rm.NA){rres <- rres[!is.na(rres$padj),]}
    rres <- rres[order(rres$log2FoldChange, decreasing = T),]
    saveRDS(rres, paste('DEGs', paste(TreatN, collapse = '_'), 'vs.', paste(ControlN, collapse = '_'), 'DESeq2.rds',sep = '_'))
    rres
}
Ct1 <- c('X1.control.DMSO', 'X2.control.DMSO')
Tt1 <- c('X1.OE.DMSO', 'X2.OE.DMSO')
Ct2 <- c('X1.OE.DMSO', 'X2.OE.DMSO')
Tt2 <- c('X1.OE.Enz', 'X2.OE.Enz')
Ct3 <- c('X1.control.Enz', 'X2.control.Enz')
Tt3 <- c('X1.OE.Enz', 'X2.OE.Enz')
Ct4 <- c('X1.CAF.DMSO', 'X2.CAF.DMSO')
Tt4 <- c('X1.CAF.Enz', 'X2.CAF.Enz')
r1 <- f_DESeq2(cts_b[keep,], count_all[keep,c(1,2,3)], Ct1, Tt1)
r2 <- f_DESeq2(cts_b[keep,], count_all[keep,c(1,2,3)], Ct2, Tt2)
r3 <- f_DESeq2(cts_b[keep,], count_all[keep,c(1,2,3)], Ct3, Tt3)
r4 <- f_DESeq2(cts_b[keep,], count_all[keep,c(1,2,3)], Ct4, Tt4)

第二步,RRA聚合差异结果

require(RobustRankAggreg)
f_dflist_RRA <- function(dflist, nameN, N, orderN, decreasing=T){
    res <- list()
    for (name in names(dflist)){
        tmp <- dflist[[name]]
        if (nrow(tmp) == 1){
            res[[name]] <- tmp[1, nameN]
        }else if(nrow(tmp) >=2){
            res[[name]] <- tmp[order(tmp[[orderN]],decreasing = decreasing), nameN]
        }
    }
    if(length(res) < 1){ return(NULL)}
    aggregateRanks(glist = res, N = N)
}
rownames(count_all) <- count_all$ID
 
r <- list(r1=r1,r2=r2,r3=r3,r4=r4)
r <- lapply(r, FUN = function(x){subset(x, padj<0.05 & log2FoldChange > 1)})
r <- f_dflist_RRA(r, 'ID', sum(keep), 'log2FoldChange')
r <- cbind(count_all[r$Name,c(2,3)], r)
r
write.csv(r, 'RRA_up.csv')
 
r <- list(r1=r1,r2=r2,r3=r3,r4=r4)
r <- lapply(r, FUN = function(x){subset(x, padj<0.05 & log2FoldChange < -1)})
r <- f_dflist_RRA(r, 'ID', sum(keep), 'log2FoldChange', decreasing = F)
r <- cbind(count_all[r$Name,c(2,3)], r)
r
write.csv(r, 'RRA_down.csv')

附加:绘图

安装包

conda create -n rsf -c conda-forge r-sf=1.0_4
conda activate rsf
library(sf)
# install.packages("sf", version = "1.0-4")
install.packages("ggVennDiagram")
conda install -c conda-forge r-ggsci -y
conda install -c conda-forge r-irkernel -y
Rscript -e "IRkernel::installspec(name='ggVennDiagram', displayname='r-ggVennDiagram')"
conda install -c conda-forge r-venndiagram -y

数据准备

x <- list()
r <- list()
r$cell <- readRDS('../A_ref_A_fiig.1_A/DEG.rds')
r$tissue <- readRDS('../B_ref_A_fiig.1_A/DEG.rds')
names(r$tissue)[3] <- 'symbol'
r_up <- lapply(r, FUN = function(x){subset(x, log2FoldChange > 0)})
x$cell_up <- r_up$cell$symbol
x$RRA_up <- readRDS('r_up.rds')
x$RRA_up <- x$RRA_up$Name
x$tissue_up <- r_up$tissue$symbol
r_dn <- lapply(r, FUN = function(x){subset(x, log2FoldChange < 0)})
x$cell_down <- r_dn$cell$symbol
x$RRA_down <- readRDS('r_dn.rds')
x$RRA_down <- x$RRA_down$Name
x$tissue_down <- r_dn$tissue$symbol
summary(x)

绘图1

#载入所需的R包;
library(ggplot2)
library(ggsci)
library(sf)
library(ggVennDiagram)
color4 <- alpha("#99CC00",0.5)
ggVennDiagram(x[1:6], label_alpha=0) +
  scale_fill_gradient(low='white',high =color4)

绘图2

venn.plot <- venn.diagram(
    x = x[1:3],
    filename = NULL,
    cex = 2.5,
    cat.cex = 2.5,
    cat.dist = c(0.07, 0.07, 0.02),
    cat.pos = c(-20, 20, 20),
    alpha = 0.5,
    fill = c("#99CC00", "#c77cff", '#f8766d')
);
grid.draw(venn.plot)
venn.plot <- venn.diagram(
    x = x[4:6],
    filename = NULL,
    cex = 2.5,
    cat.cex = 2.5,
    cat.dist = c(0.07, 0.07, 0.02),
    cat.pos = c(-20, 20, 20),
    alpha = 0.5,
    fill = c("#99CC00", "#c77cff", '#f8766d')
);
grid.draw(venn.plot)

附加:单细胞数据

第一步 读取数据

library(Seurat)
library(plyr)
library(dplyr)
library(patchwork)
library(purrr)
f_metadata_removeNA <- function(sObject, lc_groupN){
    sObject@meta.data <- sObject@meta.data[colnames(sObject),]
    sObject <- subset(x = sObject, !!sym(lc_groupN)%in%f_br_cluster_f(sObject, lc_groupN))
    sObject
}
f_br_cluster_f <- function(sObject, lc_groupN){
    lc_filter <- unlist(unique(sObject[[lc_groupN]]))
    lc_filter <- lc_filter[!is.na(lc_filter)]
    lc_filter
}

scRNA_split = readRDS("~/zlliu/R_output/21.09.21.SingleR/scRNA.rds")
scRNA_split <- f_metadata_removeNA(scRNA_split, 'Region')

n_ExN <- c('L4 IT','L5 IT','L5 ET','IT','L6b','L5/6 IT Car3','L6 IT','L2/3 IT','L5/6 NP','L6 IT Car3','L6 CT')
n_InN <- c('Lamp5','Pvalb','Sst','Vip','Sncg','PAX6')
n_NoN <- c('Astro','Endo','Micro-PVM','OPC','Oligo','Pericyte','VLMC')
n_groups <- list(NoN=n_NoN, ExN=n_ExN, InN=n_InN)

f_listUpdateRe <- function(lc_obj, lc_bool, lc_item){
    lc_obj[lc_bool] <- rep(lc_item,times=sum(lc_bool))
    lc_obj
}
f_grouplabel <- function(lc_meta.data, lc_groups){
    res <- lc_meta.data[[1]]
    for(lc_g in names(lc_groups)){
        lc_bool = (res %in% lc_groups[[lc_g]])
        for(c_n in colnames(lc_meta.data)){
            lc_bool = lc_bool  (lc_meta.data[[c_n]] %in% lc_groups[[lc_g]])
        }
        res <- f_listUpdateRe(res, lc_bool, lc_g)
    }
    names(res) <- rownames(lc_meta.data)
    res
}

scRNA_split[['n_groups']] <- f_grouplabel(scRNA_split[[c("hM1_hmca_class")]], n_groups)
sc_Neuron <- subset(x = scRNA_split, n_groups %in% c("InN", "ExN"))

samples <- SplitObject(object = sc_Neuron, split.by = 'orig.ident')

第二步 计算差异基因

DEGs <- list()
for (name in names(samples)){
    tmp <- samples[[name]]
    Idents(tmp) <- 'Region'
    DEGs[[name]] = FindAllMarkers(tmp, only.pos = T)
}

第三步 整合数据

f_dflist_subset <- function(dflist, nameN=NULL, ...){
    res <- list()
    for (name in names(dflist)){
        res[[name]] <- subset(dflist[[name]], ...)
    }
    res
}
require(RobustRankAggreg)
f_dflist_RRA <- function(dflist, nameN, N, orderN, decreasing=T){
    res <- list()
    for (name in names(dflist)){
        tmp <- dflist[[name]]
        if (nrow(tmp) == 1){
            res[[name]] <- tmp[1, nameN]
        }else if(nrow(tmp) >=2){
            res[[name]] <- tmp[order(tmp[[orderN]],decreasing = decreasing), nameN]
        }
    }
    if(length(res) < 1){ return(NULL)}
    aggregateRanks(glist = res, N = N)
}
res <- data.frame()
for(name in unique(sc_Neuron@meta.data$Region)){
    tmp <- f_dflist_subset(DEGs, name, cluster==nameN)
    tmp <- f_dflist_RRA(tmp, 'gene', N=24223, 'avg_log2FC', decreasing = T)
    if(!is.null(tmp)){
        tmp[['cluster']] <- name
        res <- rbind(res, tmp)
    }
}

第四步 保存结果

require(openxlsx)
wb <- createWorkbook()
for (name in names(samples)){
    addWorksheet(wb = wb, sheetName = name)
    writeData(wb = wb, sheet = name, x = DEGs[[name]])
}
addWorksheet(wb = wb, sheetName = 'RRA')
writeData(wb = wb, sheet = 'RRA', x = res)
saveWorkbook(wb, "DEG_Brain_Regin_Neuron.xlsx", overwrite = TRUE)