title: cBioPortal的原始数据获取 tags: [] id: '1956' categories:
https://github.com/cBioPortal/datahub/tree/master/public
https://www.cbioportal.org/study/summary?id=prad_su2c_2019
wget -e "https_proxy=http://127.0.0.1:20809" https://github.com/cBioPortal/datahub/raw/master/public/prad_su2c_2019/data_clinical_patient.txt -O data_clinical_patient.txt
# Expression table read from data_mrna_seq_fpkm_capture.txt: tab-separated
# with a header row, quoting disabled (quote = ""), and C-style escapes
# inside fields interpreted (allowEscapes = TRUE).
d <- read.table(
  "data_mrna_seq_fpkm_capture.txt",
  sep = "\t",
  header = TRUE,
  quote = "",
  allowEscapes = TRUE
)
d

# Sample-level clinical table. Everything from a '#' to the end of a line is
# treated as a comment and skipped (presumably to drop the '#'-prefixed
# description rows at the top of the file -- confirm against the data).
meta <- read.table(
  "data_clinical_sample.txt",
  sep = "\t",
  header = TRUE,
  comment.char = "#"
)
meta

# Patient-level clinical table, parsed with the same settings as the sample
# table above.
clinical <- read.table(
  "data_clinical_patient.txt",
  sep = "\t",
  header = TRUE,
  comment.char = "#"
)
clinical
# Return the positions of the elements of `NameL` whose value occurs exactly
# once (default) or more than once (`reverse = TRUE`).
#
# Arguments:
#   NameL   Vector of names/IDs to inspect.
#   reverse FALSE: positions of values that are unique in `NameL`.
#           TRUE:  positions of every occurrence of a repeated value.
#
# Returns an integer vector of positions into `NameL`, in increasing order.
#
# Counting is done with table(), which leaves NA out by default, so NA
# elements are never reported in either mode.
f_rm_duplicated <- function(NameL, reverse = FALSE) {
  counts <- table(NameL)
  freq <- as.vector(counts)
  selected <- if (reverse) freq > 1 else freq == 1
  # names() of a one-dimensional table are the distinct values, as character.
  which(NameL %in% names(counts)[selected])
}
# Collapse rows that share a name so that every name appears once, and use
# the names as row names of the result.
#
# Arguments:
#   lc_exp Table of values (data frame) with one row per entry.
#   rowN   Index of the column holding the row names; that column is removed
#          from the result. Use 0 to take the names from rownames(lc_exp).
#
# Returns the value columns with names as row names: first the rows whose
# name occurs exactly once (in input order), then one row per repeated name
# (in sorted-name order). For a repeated name the row with the largest
# rowMeans() is kept; ties go to the first such row.
#
# Rows whose name is not reported by f_rm_duplicated() in either mode are
# dropped.
f_name_dedup <- function(lc_exp, rowN = 1) {
  if (rowN == 0) {
    values <- lc_exp
    ids <- rownames(lc_exp)
  } else {
    values <- lc_exp[-rowN]
    ids <- lc_exp[[rowN]]
  }
  # Work on plain strings: a factor would carry unused levels into the
  # grouping below and produce empty groups.
  ids <- as.character(ids)

  # Rows whose name is already unique. drop = FALSE keeps a one-column table
  # from collapsing to a bare vector, which cannot take row names.
  uniq_idx <- f_rm_duplicated(ids)
  uniq_rows <- values[uniq_idx, , drop = FALSE]
  rownames(uniq_rows) <- ids[uniq_idx]

  # Rows whose name is repeated.
  dup_idx <- f_rm_duplicated(ids, TRUE)
  dup_ids <- ids[dup_idx]
  dup_rows <- values[dup_idx, , drop = FALSE]
  rownames(dup_rows) <- NULL

  # Only compute row means when there is something to rank, so a table with
  # no repeated names is never pushed through rowMeans().
  row_means <- if (length(dup_idx) > 0L) rowMeans(dup_rows) else numeric(0)

  # For each repeated name pick the position (within dup_rows) of the row
  # with the highest mean.
  best <- vapply(
    split(seq_along(dup_ids), dup_ids),
    function(members) {
      top <- which.max(row_means[members])
      # which.max() returns nothing when every mean in the group is NA;
      # fall back to the group's first row instead of failing.
      if (length(top) == 0L) members[1L] else members[top]
    },
    integer(1)
  )
  best <- unname(best)

  dup_rows <- dup_rows[best, , drop = FALSE]
  rownames(dup_rows) <- dup_ids[best]

  rbind(uniq_rows, dup_rows)
}
# Keep only sample rows whose PATIENT_ID occurs exactly once. Patients with
# more than one sample row are removed entirely, not reduced to one row.
meta <- meta[f_rm_duplicated(meta$PATIENT_ID), , drop = FALSE]
rownames(meta) <- meta$PATIENT_ID
meta

# Index the patient table by PATIENT_ID as well so both tables can be
# subset with the same keys.
rownames(clinical) <- clinical$PATIENT_ID
clinical

# Patients present in both tables, in the order they appear in `clinical`.
mergeID <- intersect(rownames(clinical), rownames(meta))

# Put patient-level and sample-level columns side by side. Columns of `meta`
# that already exist in `clinical` (the PATIENT_ID key) are left out so the
# merged table has no duplicated column names.
df <- cbind(
  clinical[mergeID, , drop = FALSE],
  meta[mergeID, setdiff(colnames(meta), colnames(clinical)), drop = FALSE]
)
rownames(df) <- df$SAMPLE_ID
df

# Persist the merged annotation and the expression table.
saveRDS(df, 'meta.rds')
saveRDS(d, 'fpkm.rds')