tcgabiolinks下载甲基化数据.md 4.2 KB


title: TCGAbiolinks下载甲基化数据 tags: [] id: '2310' categories:

  • - 数据库 date: 2022-09-09 00:32:59 ---

安装补充包

  • conda activate tcga
  • # conda install -c bioconda bioconductor-sesamedata -y
  • conda install -c conda-forge r-magick -y
  • BiocManager::install("sesameData")
  • BiocManager::install("sesame")
  • BiocManager::install("doParallel")
  • BiocManager::install("ComplexHeatmap")
  • BiocManager::install("matlab")
  • conda install -c bioconda bioconductor-motifstack -y
  • # conda install -c bioconda bioconductor-bsgenome.hsapiens.ucsc.hg38 -y
  • BiocManager::install("BSgenome.Hsapiens.UCSC.hg38")
  • BiocManager::install("BSgenome.Hsapiens.UCSC.hg19")
  • # conda install -c bioconda bioconductor-genomicranges -y
  • conda install -c bioconda bioconductor-rgadem -y

下载数据

library(TCGAbiolinks)
query_met.hg38 <- GDCquery(
    project= "TCGA-PRAD", 
    data.category = "DNA Methylation", 
    data.type = "Methylation Beta Value",
    platform = "Illumina Human Methylation 450"
)
GDCdownload(query_met.hg38)
data.hg38 <- GDCprepare(query_met.hg38)
saveRDS(data.hg38, 'prad.meth.rds')

清洗数据

library(SummarizedExperiment)
# 删除包含 NA 值的探针
data.hg38 <- subset(data.hg38,subset = (rowSums(is.na(assay(data.hg38))) == 0))
# 去除重复样本
data.hg38 <- data.hg38[, substr(colnames(data.hg38), 14, 16) == "01A"]
group <- readRDS('../idea_2/fig3.2/fig5/tcga.predict.rds')
gRow <- intersect(data.hg38$patient, rownames(group))
group <- group[gRow,]
data.hg38 <- data.hg38[, substr(colnames(data.hg38), 1, 12) %in% gRow]
data.hg38$group <- group[substr(colnames(data.hg38), 1, 12), 'group']

计算整体差异

TCGAvisualize_meanMethylation(
  data.hg38,
  groupCol = "group",
  group.legend  = "Groups",
  filename = NULL,
  print.pvalue = TRUE
)

识别差异甲基化 CpG 位点

res <- TCGAanalyze_DMC(
  data.hg38,
  # colData 函数获取的矩阵中分组列名
  groupCol = "group",
  group1 = "High Risk",
  group2 = "Low Risk",
  p.cut = 0.05,
  diffmean.cut = 0.15,
  save = FALSE,
  legend = "State",
  plot.filename = "./PRAD_metvolcano.png",
  cores =  8
)

绘制热图

sig_met <- subset(res, status != "Not Significant")
res_data <- subset(data.hg38, subset = (rownames(data.hg38) %in% rownames(sig_met)))
library(circlize)
library(ComplexHeatmap)
group <- group[substr(colnames(data.hg38), 1, 12), ]
#定义注释信息
ha <- HeatmapAnnotation(`Risk group` = group$group,
                        `Risk Score` = group$Risk_Score,
                        col = list(
                            `Risk group` = c("Low Risk" = "#99CCFFAA", 'High Risk' = '#FF6666AA')
                        ),
#                         annotation_name_gp = gpar(fontsize = 14),
                        show_annotation_name = TRUE)
heatmap  <- Heatmap(
  assay(res_data),
  name = "DNA methylation",
  col = matlab::jet.colors(200),
  show_row_names = FALSE,
  cluster_rows = TRUE,
  cluster_columns = FALSE,
  show_column_names = FALSE,
  bottom_annotation = ha,
  column_title = "DNA Methylation",
  column_order = order(group$Risk_Score)
)
pdf("./prad_meth_heatmap.pdf",width = 6, height = 4);{
    draw(heatmap, annotation_legend_side =  "bottom")
};dev.off()

模体分析(需要hg19)

query_met.hg19 <- GDCquery(
    project= "TCGA-PRAD", 
    data.category = "DNA methylation",
    platform = "Illumina Human Methylation 450",
    legacy = TRUE # hg19
)
GDCdownload(query_met.hg19)
data.hg19 <- GDCprepare(query_met.hg19)
saveRDS(data.hg38, 'prad.meth.hg19.rds')
library(rGADEM)
library(GenomicRanges)
library(BSgenome.Hsapiens.UCSC.hg19)
library(motifStack)
probes <- rowRanges(res_data)
# 获取差异的探针并设置 200bp 大小的窗口
sequence <- GRanges(
  seqnames = as.character(seqnames(probes)),
  ranges = IRanges(start = start(ranges(probes)) - 100,
          end = start(ranges(probes)) + 100),
  strand = "*"
)
# 识别模体
gadem <- GADEM(sequence, verbose = FALSE, genome = Hsapiens)

因为hg19的数据还在下载,更多内容见官方教程