之前通过TCGAbiolinks下载了蛋白质组(RPPA)数据,其中peptide_target一列给出的是抗体的名字,而不是基因名,因此在进行富集分析之前需要先做ID转换。我们先来构造一个抗体到基因的映射表;RPPA抗体与基因的映射关系可以从官网提供的抗体列表中获得(见下文)。
准备映射表
从官网下载映射表并手工整理成CSV格式,每个文件至少包含两列:Ab(抗体名)和Gene(基因名)。RPPA_Expanded_Ab_List_Updated、RPPA_Standard_Ab_List_Updated和the list of Updated Gene Names这三份列表都需要。
清洗合并映射表
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
# Build the antibody -> gene mapping table ----

# Canonical antibody key: lower-case, with the separators "-", "_", ".",
# " " and "/" removed.
# The previous pattern "(_-\\. /)" was a capture group that only matched the
# literal five-character sequence "_-. /", so no separator was ever stripped;
# a character class is what is needed here.
# NOTE(review): this changes the keys stored in `ad2g$Ab` / `ad2g_d$g2ad_c`.
# Anything matched against them later must go through normalize_ab() too.
normalize_ab <- function(x) {
  gsub("[-_. /]", "", tolower(x))
}

# Named list `gene symbol -> antibody names`, genes kept in first-seen order.
# Rows whose Gene is NA are not assigned to any gene.
group_ab_by_gene <- function(map) {
  split(map$Ab, factor(map$Gene, levels = unique(map$Gene)))
}

# Every CSV must provide at least the columns `Ab` and `Gene`, and all three
# must share the same columns so that rbind() can stack them.
ad2g1 <- read.csv("sp_Ab2Gene.CSV")
ad2g2 <- read.csv("sp_Ab2Gene_old.CSV")
ad2g3 <- read.csv("sp_Ab2Gene_new.CSV")

# Entries of the "old" table are only kept when `ad2g1` does not already
# list the same antibody, i.e. `ad2g1` takes precedence over `ad2g2`.
ad2g <- ad2g2[!(ad2g2$Ab %in% ad2g1$Ab), ]
ad2g <- unique(rbind(ad2g, ad2g1, ad2g3))

# The sorted table used to be computed and thrown away; keep it.
ad2g <- ad2g[order(ad2g$Gene), ]

ad2g_d <- list()
ad2g_d$gene <- unique(ad2g$Gene)  # every gene that has an antibody
ad2g_d$raw <- ad2g                # mapping with the original antibody names
ad2g_d$g2ad <- group_ab_by_gene(ad2g)

# From here on `ad2g$Ab` holds the normalized key, not the original name.
ad2g$Ab <- normalize_ab(ad2g$Ab)
# Different spellings of one antibody can collapse to the same key.
ad2g_d$g2ad_c <- lapply(group_ab_by_gene(ad2g), unique)

# Check for antibodies that are still unmapped ----

# NOTE(review): the .rdata file is expected to restore a data frame `r1`
# with a `peptide_target` column; nothing else visible here defines it.
load("PRAD_TCPA_DE.rdata")

# Compare on the same normalized key as the mapping table and show the rows
# of `r1` whose antibody has no gene assigned yet.
target_key <- normalize_ab(r1$peptide_target)
r1[!(target_key %in% ad2g$Ab), ]
|
对于上面列出的、尚未匹配到基因的抗体,通过检索查到其对应的基因,把这些映射关系补充到映射表中,然后重新运行上面的清洗合并步骤,直到没有遗漏为止。
更改富集数据库的SYMBOL为Antibody
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
# Re-key the KEGG gene sets from gene SYMBOL to antibody name ----
#
# Reads the SYMBOL-based term tables, replaces every gene by the antibodies
# mapped to it in `ad2g_d$g2ad_c`, drops genes that have no antibody, and
# saves the result as "KEGG_Ab.rds".
# Requires `ad2g_d` (with `$gene` and `$g2ad_c`) to exist already.
KEGG <- readRDS("../../../GSEA/kk_SYMBOL.rds")

# Keep only the term/gene pairs whose gene has at least one antibody.
mapped <- KEGG$TERM2GENE[KEGG$TERM2GENE$gene %in% ad2g_d$gene, ]

# One list element per remaining row: the antibodies of that row's gene.
# as.character() makes sure the lookup is by name even if `gene` is a factor.
ab_by_row <- unname(ad2g_d$g2ad_c[as.character(mapped$gene)])

# Expand in one step instead of rbind()-ing inside a `1:nrow()` loop; this
# also yields a well-formed empty table when no gene overlaps.
# The column keeps the name `gene` although it now holds antibody keys.
term2ab <- data.frame(
  gsid = rep(mapped$gsid, lengths(ab_by_row)),
  gene = as.character(unlist(ab_by_row, use.names = FALSE))
)
term2ab <- unique(term2ab)

KEGG_ab <- list(
  # Only terms that still have at least one member.
  TERM2NAME = KEGG$TERM2NAME[KEGG$TERM2NAME$gsid %in% term2ab$gsid, ],
  TERM2GENE = term2ab
)
saveRDS(KEGG_ab, "KEGG_Ab.rds")
|