将多数据集转化成无依赖的数据格式,进行单变量co.md 4.6 KB


title: 将多数据集转化成无依赖的数据格式,进行单变量COXPH,并进行RRA整合 tags: [] id: '1953' categories:

  • - uncategorized date: 2022-08-08 09:53:15 ---

PCaDB

f_dedup_IQR

ano <- readRDS('../../DEG/PCaDB/PCaDB_Gene_Annotation.RDS')
tmp <- readRDS('../../DEG/PCaDB/CPC-Gene_eSet.RDS')
tmp
clinical <- tmp@phenoData@data
clinical <- clinical[!is.na(clinical$bcr_status),]
sum(clinical$bcr_status)
sum(clinical$time_to_bcr)
expr <- (exprs(tmp))
expr <- f_dedup_IQR(as.data.frame(expr), ano[rownames(expr),]$gene_name)
expr <- scale(t(expr))
expr <- as.data.frame(expr[,!is.na(colSums(expr))])
expr
clinical <- clinical[rownames(clinical) %in% rownames(expr),]
sum(clinical$bcr_status)
sum(clinical$time_to_bcr)
clinical
expr <- expr[rownames(clinical),]
expr
clinical$pfs_status <- clinical$bcr_status
clinical$pfs_time <-  clinical$time_to_bcr
data <- list()
data$data <- expr
data$meta  <- clinical
saveRDS(data, 'CPC-Gene.rds')

TCGA

f_counts2TMM

tmp <- load('../../DEG/TCGA/PRAD_tp.rda')
counts <- data@assays@data$unstranded
colnames(counts) <- data@colData$patient
counts <- counts[,f_rm_duplicated(colnames(counts))]
geneInfo <- as.data.frame(data@rowRanges)[c('gene_id','gene_type','gene_name')]
counts <- f_dedup_IQR(as.data.frame(counts), geneInfo$gene_name)
counts
counts <- f_counts2TMM(counts)
counts <- log2(counts + 1)
counts <- scale(t(counts))
counts <- as.data.frame(counts[,!is.na(colSums(counts))])
counts
clinical <- readRDS('../../DEG/TCGA/clinical/TCGA_PRAD_with_ICGC.rds')
clinical <- clinical[rownames(clinical) %in% rownames(counts),]
sum(clinical$new_dcf_status)
sum(clinical$new_dcf_time)
clinical
counts <- counts[rownames(clinical),]
counts
clinical$pfs_status <- clinical$new_dcf_status
clinical$pfs_time <- (clinical$new_dcf_time / 365)*12
data <- list()
data$data <- expr
data$meta  <- clinical
saveRDS(data, 'PRAD.rds')

train_sets

train_sets <- list()
train_sets$CPGEA <- readRDS('data/CPGEA.rds')
train_sets$PRAD <- readRDS('data/PRAD.rds')
train_sets$mCPRC <- readRDS('data/mCRPC.rds')
train_sets$DKFZ <- readRDS('data/DKFZ.rds')
train_sets$GSE54460 <- readRDS('data/GSE54460.rds')
saveRDS(train_sets, 'train_sets.rds')

单变量COXPH

library("survival")
library("survminer")
f_DEG_coxph_oneGene <- function(trainSet, geneN){
    if(!(geneN %in% colnames(trainSet$data))){
        return(data.frame(row.names = geneN, mean=NA, lower=NA, upper=NA, Pvalue=1, VarName=geneN))
    }
    df <- cbind(trainSet$data[geneN],trainSet$meta[c('pfs_status', 'pfs_time')])
    coxmf <- paste0("Surv(pfs_time, pfs_status==1)~", '`',geneN,'`')
#     print(coxmf)
    res.cox <- coxph(formula(coxmf), data =  df)
    tmp_res <- summary(res.cox)
    res <- as.data.frame(tmp_res$conf.int)
    res <- res[,c(1,3,4)]
    colnames(res) <- c('mean', 'lower', 'upper')
    res[['Pvalue']] <- tmp_res$coefficients[,'Pr(>z)']
    res[['VarName']] <- rownames(res)
    res
}
f_DEG_coxph <- function(trainSet, geneList){
    res <- NULL
    for(gene in geneList){
        res <- rbind(res,f_DEG_coxph_oneGene(trainSet, gene))
    }
    res
}
f_DEG_coxph_Sets <- function(trainSets, geneList){
    res <- list()
    for(Name in names(trainSets)){
        res[[Name]] <- f_DEG_coxph(trainSets[[Name]], geneList)
    }
    res
}
 
DEG <- readRDS('../DEG/TCGA/mCRPC_DEG.rds')
DEG <- subset(DEG, padj < 0.05 & abs(log2FoldChange)>2 & !grepl('pseudogene', gene_type))
train_sets <- readRDS('train_sets.rds')
tmp <- f_DEG_coxph_Sets(train_sets, unique(DEG$gene_name))
saveRDS(tmp, 'mCRPC.rds')

RRA聚合

tmp <- readRDS('mCRPC.rds')
require(RobustRankAggreg)
f_dflist_RRA <- function(dflist, nameN, N, orderN, decreasing=T){
    res <- list()
    for (name in names(dflist)){
        tmp <- dflist[[name]]
        if (nrow(tmp) == 1){
            res[[name]] <- tmp[1, nameN]
        }else if(nrow(tmp) >=2){
            res[[name]] <- tmp[order(tmp[[orderN]],decreasing = decreasing), nameN]
        }
    }
    if(length(res) < 1){ return(NULL)}
    aggregateRanks(glist = res, N = N)
}
tmp$CPGEA
r <- lapply(tmp, FUN = function(x){subset(x, Pvalue<0.05 & mean > 1)})
r <- f_dflist_RRA(r, 'VarName', nrow(tmp$CPGEA), 'mean')
r <- subset(r, Score<0.05)
r_up <- r
r <- lapply(tmp, FUN = function(x){subset(x, Pvalue<0.05 & mean < 1)})
r <- f_dflist_RRA(r, 'VarName', nrow(tmp$CPGEA), 'mean', decreasing = F)
r <- subset(r, Score<0.05)
r_down <- r
 
r_up$Score <- log1p(1/r_up$Score)
r_down$Score <- log1p(1/r_down$Score)
r_down$Score <- -r_down$Score
r <- rbind(r_up,r_down)
r <- r[order(r$Score, decreasing = T),]
r
saveRDS(r, 'mCRPC_RRA.rds')