---
title: TCGAbiolinks下载甲基化数据
tags: []
id: '2310'
categories:
  - - 数据库
date: 2022-09-09 00:32:59
---

## 安装补充包

*   [conda activate tcga](https://occdn.limour.top/1655.html)
*   \# conda install -c bioconda bioconductor-sesamedata -y
*   conda install -c conda-forge r-magick -y
*   BiocManager::install("sesameData")
*   BiocManager::install("sesame")
*   BiocManager::install("doParallel")
*   BiocManager::install("ComplexHeatmap")
*   BiocManager::install("matlab")
*   conda install -c bioconda bioconductor-motifstack -y
*   \# conda install -c bioconda bioconductor-bsgenome.hsapiens.ucsc.hg38 -y
*   BiocManager::install("BSgenome.Hsapiens.UCSC.hg38")
*   BiocManager::install("BSgenome.Hsapiens.UCSC.hg19")
*   \# conda install -c bioconda bioconductor-genomicranges -y
*   conda install -c bioconda bioconductor-rgadem -y

## 下载数据

```R
library(TCGAbiolinks)
query_met.hg38 <- GDCquery(
    project= "TCGA-PRAD", 
    data.category = "DNA Methylation", 
    data.type = "Methylation Beta Value",
    platform = "Illumina Human Methylation 450"
)
GDCdownload(query_met.hg38)
data.hg38 <- GDCprepare(query_met.hg38)
saveRDS(data.hg38, 'prad.meth.rds')
```

## 清洗数据

```R
library(SummarizedExperiment)
# 删除包含 NA 值的探针
data.hg38 <- subset(data.hg38,subset = (rowSums(is.na(assay(data.hg38))) == 0))
# 去除重复样本
data.hg38 <- data.hg38[, substr(colnames(data.hg38), 14, 16) == "01A"]
group <- readRDS('../idea_2/fig3.2/fig5/tcga.predict.rds')
gRow <- intersect(data.hg38$patient, rownames(group))
group <- group[gRow,]
data.hg38 <- data.hg38[, substr(colnames(data.hg38), 1, 12) %in% gRow]
data.hg38$group <- group[substr(colnames(data.hg38), 1, 12), 'group']
```

## 计算整体差异

```R
TCGAvisualize_meanMethylation(
  data.hg38,
  groupCol = "group",
  group.legend  = "Groups",
  filename = NULL,
  print.pvalue = TRUE
)
```

## **识别差异甲基化** CpG **位点**

```R
res <- TCGAanalyze_DMC(
  data.hg38,
  # colData 函数获取的矩阵中分组列名
  groupCol = "group",
  group1 = "High Risk",
  group2 = "Low Risk",
  p.cut = 0.05,
  diffmean.cut = 0.15,
  save = FALSE,
  legend = "State",
  plot.filename = "./PRAD_metvolcano.png",
  cores =  8
)
```

### 绘制热图

```R
sig_met <- subset(res, status != "Not Significant")
res_data <- subset(data.hg38, subset = (rownames(data.hg38) %in% rownames(sig_met)))
library(circlize)
library(ComplexHeatmap)
group <- group[substr(colnames(data.hg38), 1, 12), ]
#定义注释信息
ha <- HeatmapAnnotation(`Risk group` = group$group,
                        `Risk Score` = group$Risk_Score,
                        col = list(
                            `Risk group` = c("Low Risk" = "#99CCFFAA", 'High Risk' = '#FF6666AA')
                        ),
#                         annotation_name_gp = gpar(fontsize = 14),
                        show_annotation_name = TRUE)
heatmap  <- Heatmap(
  assay(res_data),
  name = "DNA methylation",
  col = matlab::jet.colors(200),
  show_row_names = FALSE,
  cluster_rows = TRUE,
  cluster_columns = FALSE,
  show_column_names = FALSE,
  bottom_annotation = ha,
  column_title = "DNA Methylation",
  column_order = order(group$Risk_Score)
)
pdf("./prad_meth_heatmap.pdf",width = 6, height = 4);{
    draw(heatmap, annotation_legend_side =  "bottom")
};dev.off()
```

## **模体分析**（需要hg19）

```R
query_met.hg19 <- GDCquery(
    project= "TCGA-PRAD", 
    data.category = "DNA methylation",
    platform = "Illumina Human Methylation 450",
    legacy = TRUE # hg19
)
GDCdownload(query_met.hg19)
data.hg19 <- GDCprepare(query_met.hg19)
saveRDS(data.hg38, 'prad.meth.hg19.rds')
```

```R
library(rGADEM)
library(GenomicRanges)
library(BSgenome.Hsapiens.UCSC.hg19)
library(motifStack)
probes <- rowRanges(res_data)
# 获取差异的探针并设置 200bp 大小的窗口
sequence <- GRanges(
  seqnames = as.character(seqnames(probes)),
  ranges = IRanges(start = start(ranges(probes)) - 100,
          end = start(ranges(probes)) + 100),
  strand = "*"
)
# 识别模体
gadem <- GADEM(sequence, verbose = FALSE, genome = Hsapiens)
```

因为hg19的数据还在下载，更多内容见[官方教程](https://www.bioconductor.org/packages/release/workflows/vignettes/TCGAWorkflow/inst/doc/TCGAWorkflow.html)