--- title: 【迁移】从差异基因到RRA聚合 urlname: cong-cha-yi-ji-yin-dao-RRA-ju-he date: 2022-07-22 18:46:32 tags: ['DEGs', 'RRA'] --- 通过[比对](/STAR--yi-jian-jiao-ben),我们得到了counts矩阵,接下来可以进行DEGs分析。此时如果我们有多组之间的对比,则可以使用RRA算法来聚合我们的结果。 ## RRA的安装 ```bash conda create -n tcga -c conda-forge r-base=4.1.2 -y conda activate tcga conda install -c conda-forge r-rvest=1.0.2 -y conda install -c conda-forge r-xml=3.99_0.8 -y conda install -c conda-forge r-rcpparmadillo=0.10.8.1.0 -y conda install -c conda-forge r-bh=1.78.0_0 -y conda install -c conda-forge r-biocmanager=1.30.16 -y conda install -c bioconda bioconductor-summarizedexperiment=1.24.0 -y conda install -c bioconda bioconductor-tcgabiolinks=2.22.1 -y conda install -c bioconda bioconductor-deseq2=1.34.0 -y conda install -c bioconda bioconductor-rhdf5=2.38.0 -y conda install -c bioconda bioconductor-limma=3.50.1 -y conda install -c bioconda bioconductor-apeglm=1.16.0 -y conda install -c bioconda r-sleuth=0.30.0 -y conda install -c bioconda r-wasabi=1.0.1 -y conda install -c conda-forge r-irkernel=1.3 -y conda install -c conda-forge r-ashr=2.2_54 -y conda install -c conda-forge r-robustrankaggreg=1.1 -y conda install -c conda-forge r-devtools=2.4.3 -y IRkernel::installspec(name='tcga', displayname='r-tcga') ``` * deseq2 数据要求:低生物学重复 & raw counts;假定负二项分布;适合高通量测序数据 * sleuth 数据要求:**Kallisto**输出的结果 * limma 数据要求:logCPM;假定正态分布;适合芯片数据 * [fpkm数据差异基因分析](https://bioconductor.org/packages/release/bioc/vignettes/limma/inst/doc/usersguide.pdf) :理论上是不能进行分析的,无计可施时可以参考 * 高生物学重复请直接使用 **wilcox.test** 以避免大量假阳性 * 多数据集结果整合:RobustRankAggreg fpkm转tpm示例(基于 SummarizedExperiment 数据框架) ```r fpkmToTpm <- function(fpkm){ exp(log(fpkm) - log(sum(fpkm)) + log(1e6)) } f_fpkmToTpm <- function(l_e){ apply(l_e,2,fpkmToTpm) } assay(sce, "TPM") <- f_fpkmToTpm(assay(sce, "HTSeq - FPKM")) ``` ## 第一步,多组差异基因分析 ```R library(DESeq2) count_all <- read.csv("~/upload/zl_liu/star_data/yyz_01/yyz_01.csv",header=TRUE) count_all cts_b <- count_all[ ,c(-1,-2,-3)] rownames(cts_b) <- count_all$ID keep <- rowSums(cts_b) > ncol(cts_b) cts_b[keep,] f_DESeq2 <- function(cts_bb, rowInfo, ControlN, TreatN, rm.NA=T){ cts_b <- cts_bb[,c(ControlN, TreatN)] conditions <- factor(c(rep("Control",length(ControlN)), rep("Treat",length(TreatN)))) colData_b <- data.frame(row.names = colnames(cts_b), conditions) print(colData_b) dds <- DESeqDataSetFromMatrix(countData = cts_b, colData = colData_b, design = ~ conditions) dds <- DESeq(dds) res <- results(dds) rres <- cbind(rowInfo, data.frame(res)) if(rm.NA){rres <- rres[!is.na(rres$padj),]} rres <- rres[order(rres$log2FoldChange, decreasing = T),] saveRDS(rres, paste('DEGs', paste(TreatN, collapse = '_'), 'vs.', paste(ControlN, collapse = '_'), 'DESeq2.rds',sep = '_')) rres } Ct1 <- c('X1.control.DMSO', 'X2.control.DMSO') Tt1 <- c('X1.OE.DMSO', 'X2.OE.DMSO') Ct2 <- c('X1.OE.DMSO', 'X2.OE.DMSO') Tt2 <- c('X1.OE.Enz', 'X2.OE.Enz') Ct3 <- c('X1.control.Enz', 'X2.control.Enz') Tt3 <- c('X1.OE.Enz', 'X2.OE.Enz') Ct4 <- c('X1.CAF.DMSO', 'X2.CAF.DMSO') Tt4 <- c('X1.CAF.Enz', 'X2.CAF.Enz') r1 <- f_DESeq2(cts_b[keep,], count_all[keep,c(1,2,3)], Ct1, Tt1) r2 <- f_DESeq2(cts_b[keep,], count_all[keep,c(1,2,3)], Ct2, Tt2) r3 <- f_DESeq2(cts_b[keep,], count_all[keep,c(1,2,3)], Ct3, Tt3) r4 <- f_DESeq2(cts_b[keep,], count_all[keep,c(1,2,3)], Ct4, Tt4) ``` ## 第二步,RRA聚合差异结果 ```R require(RobustRankAggreg) f_dflist_RRA <- function(dflist, nameN, N, orderN, decreasing=T){ res <- list() for (name in names(dflist)){ tmp <- dflist[[name]] if (nrow(tmp) == 1){ res[[name]] <- tmp[1, nameN] }else if(nrow(tmp) >=2){ res[[name]] <- tmp[order(tmp[[orderN]],decreasing = decreasing), nameN] } } if(length(res) < 1){ return(NULL)} aggregateRanks(glist = res, N = N) } rownames(count_all) <- count_all$ID r <- list(r1=r1,r2=r2,r3=r3,r4=r4) r <- lapply(r, FUN = function(x){subset(x, padj<0.05 & log2FoldChange > 1)}) r <- f_dflist_RRA(r, 'ID', sum(keep), 'log2FoldChange') r <- cbind(count_all[r$Name,c(2,3)], r) r write.csv(r, 'RRA_up.csv') r <- list(r1=r1,r2=r2,r3=r3,r4=r4) r <- lapply(r, FUN = function(x){subset(x, padj<0.05 & log2FoldChange < -1)}) r <- f_dflist_RRA(r, 'ID', sum(keep), 'log2FoldChange', decreasing = F) r <- cbind(count_all[r$Name,c(2,3)], r) r write.csv(r, 'RRA_down.csv') ``` ## 附加:绘图 ### 安装包 ```bash conda create -n rsf -c conda-forge r-sf=1.0_4 conda activate rsf library(sf) # install.packages("sf", version = "1.0-4") install.packages("ggVennDiagram") conda install -c conda-forge r-ggsci -y conda install -c conda-forge r-irkernel -y Rscript -e "IRkernel::installspec(name='ggVennDiagram', displayname='r-ggVennDiagram')" conda install -c conda-forge r-venndiagram -y ``` ### 数据准备 ```R x <- list() r <- list() r$cell <- readRDS('../A_ref_A_fiig.1_A/DEG.rds') r$tissue <- readRDS('../B_ref_A_fiig.1_A/DEG.rds') names(r$tissue)[3] <- 'symbol' r_up <- lapply(r, FUN = function(x){subset(x, log2FoldChange > 0)}) x$cell_up <- r_up$cell$symbol x$RRA_up <- readRDS('r_up.rds') x$RRA_up <- x$RRA_up$Name x$tissue_up <- r_up$tissue$symbol r_dn <- lapply(r, FUN = function(x){subset(x, log2FoldChange < 0)}) x$cell_down <- r_dn$cell$symbol x$RRA_down <- readRDS('r_dn.rds') x$RRA_down <- x$RRA_down$Name x$tissue_down <- r_dn$tissue$symbol summary(x) ``` ### 绘图1 ```R #载入所需的R包; library(ggplot2) library(ggsci) library(sf) library(ggVennDiagram) color4 <- alpha("#99CC00",0.5) ggVennDiagram(x[1:6], label_alpha=0) + scale_fill_gradient(low='white',high =color4) ``` ### 绘图2 ```R venn.plot <- venn.diagram( x = x[1:3], filename = NULL, cex = 2.5, cat.cex = 2.5, cat.dist = c(0.07, 0.07, 0.02), cat.pos = c(-20, 20, 20), alpha = 0.5, fill = c("#99CC00", "#c77cff", '#f8766d') ); grid.draw(venn.plot) venn.plot <- venn.diagram( x = x[4:6], filename = NULL, cex = 2.5, cat.cex = 2.5, cat.dist = c(0.07, 0.07, 0.02), cat.pos = c(-20, 20, 20), alpha = 0.5, fill = c("#99CC00", "#c77cff", '#f8766d') ); grid.draw(venn.plot) ``` ## 附加:单细胞数据 ### 第一步 读取数据 ```r library(Seurat) library(plyr) library(dplyr) library(patchwork) library(purrr) f_metadata_removeNA <- function(sObject, lc_groupN){ sObject@meta.data <- sObject@meta.data[colnames(sObject),] sObject <- subset(x = sObject, !!sym(lc_groupN)%in%f_br_cluster_f(sObject, lc_groupN)) sObject } f_br_cluster_f <- function(sObject, lc_groupN){ lc_filter <- unlist(unique(sObject[[lc_groupN]])) lc_filter <- lc_filter[!is.na(lc_filter)] lc_filter } scRNA_split = readRDS("~/zlliu/R_output/21.09.21.SingleR/scRNA.rds") scRNA_split <- f_metadata_removeNA(scRNA_split, 'Region') n_ExN <- c('L4 IT','L5 IT','L5 ET','IT','L6b','L5/6 IT Car3','L6 IT','L2/3 IT','L5/6 NP','L6 IT Car3','L6 CT') n_InN <- c('Lamp5','Pvalb','Sst','Vip','Sncg','PAX6') n_NoN <- c('Astro','Endo','Micro-PVM','OPC','Oligo','Pericyte','VLMC') n_groups <- list(NoN=n_NoN, ExN=n_ExN, InN=n_InN) f_listUpdateRe <- function(lc_obj, lc_bool, lc_item){ lc_obj[lc_bool] <- rep(lc_item,times=sum(lc_bool)) lc_obj } f_grouplabel <- function(lc_meta.data, lc_groups){ res <- lc_meta.data[[1]] for(lc_g in names(lc_groups)){ lc_bool = (res %in% lc_groups[[lc_g]]) for(c_n in colnames(lc_meta.data)){ lc_bool = lc_bool (lc_meta.data[[c_n]] %in% lc_groups[[lc_g]]) } res <- f_listUpdateRe(res, lc_bool, lc_g) } names(res) <- rownames(lc_meta.data) res } scRNA_split[['n_groups']] <- f_grouplabel(scRNA_split[[c("hM1_hmca_class")]], n_groups) sc_Neuron <- subset(x = scRNA_split, n_groups %in% c("InN", "ExN")) samples <- SplitObject(object = sc_Neuron, split.by = 'orig.ident') ``` ### 第二步 计算差异基因 ```r DEGs <- list() for (name in names(samples)){ tmp <- samples[[name]] Idents(tmp) <- 'Region' DEGs[[name]] = FindAllMarkers(tmp, only.pos = T) } ``` ### 第三步 整合数据 ```r f_dflist_subset <- function(dflist, nameN=NULL, ...){ res <- list() for (name in names(dflist)){ res[[name]] <- subset(dflist[[name]], ...) } res } require(RobustRankAggreg) f_dflist_RRA <- function(dflist, nameN, N, orderN, decreasing=T){ res <- list() for (name in names(dflist)){ tmp <- dflist[[name]] if (nrow(tmp) == 1){ res[[name]] <- tmp[1, nameN] }else if(nrow(tmp) >=2){ res[[name]] <- tmp[order(tmp[[orderN]],decreasing = decreasing), nameN] } } if(length(res) < 1){ return(NULL)} aggregateRanks(glist = res, N = N) } ``` ```r res <- data.frame() for(name in unique(sc_Neuron@meta.data$Region)){ tmp <- f_dflist_subset(DEGs, name, cluster==nameN) tmp <- f_dflist_RRA(tmp, 'gene', N=24223, 'avg_log2FC', decreasing = T) if(!is.null(tmp)){ tmp[['cluster']] <- name res <- rbind(res, tmp) } } ``` ### 第四步 保存结果 ```r require(openxlsx) wb <- createWorkbook() for (name in names(samples)){ addWorksheet(wb = wb, sheetName = name) writeData(wb = wb, sheet = name, x = DEGs[[name]]) } addWorksheet(wb = wb, sheetName = 'RRA') writeData(wb = wb, sheet = 'RRA', x = res) saveWorkbook(wb, "DEG_Brain_Regin_Neuron.xlsx", overwrite = TRUE) ```