pyscenic-一-转录因子分析.md 5.1 KB


title: pyscenic (一) 转录因子分析 tags: [] id: '1063' categories:

  • - 单细胞下游分析
  • - 生物信息学 date: 2021-10-15 01:50:00 ---

第一步 安装

pip3.8 install wheel -i https://pypi.tuna.tsinghua.edu.cn/simple
pip3.8 install pyscenic -i https://pypi.tuna.tsinghua.edu.cn/simple
pip3.8 install seaborn -i https://pypi.tuna.tsinghua.edu.cn/simple
pip3.8 install scanpy -i https://pypi.tuna.tsinghua.edu.cn/simple

第二步 跑示例教程

import os
import glob
import pickle
import pandas as pd
import numpy as np

from dask.diagnostics import ProgressBar

from arboreto.utils import load_tf_names
from arboreto.algo import grnboost2

from ctxcore.rnkdb import FeatherRankingDatabase as RankingDatabase
from pyscenic.utils import modules_from_adjacencies, load_motifs
from pyscenic.prune import prune2df, df2regulons
from pyscenic.aucell import aucell

# 配置数据路径
root_path = os.path.expanduser("~/zlliu/R_output/21.10.22.pyscenic")
 
# 配置结果保存路径
output_path = root_path
if not os.path.isdir(output_path):
    os.makedirs(output_path)

DATA_FOLDER="./tmp"
RESOURCES_FOLDER="./resources"
DATABASE_FOLDER = "."
DATABASES_GLOB = os.path.join(DATABASE_FOLDER, "mm9-*.mc9nr.feather")
MOTIF_ANNOTATIONS_FNAME = os.path.join(RESOURCES_FOLDER, "motifs-v9-nr.mgi-m0.001-o0.0.tbl")
MM_TFS_FNAME = os.path.join(RESOURCES_FOLDER, 'mm_tfs.txt')
SC_EXP_FNAME = os.path.join(RESOURCES_FOLDER, "GSE60361_C1-3005-Expression.txt")
REGULONS_FNAME = os.path.join(DATA_FOLDER, "regulons.p")
MOTIFS_FNAME = os.path.join(DATA_FOLDER, "motifs.csv")
!wget https://ftp.ncbi.nlm.nih.gov/geo/series/GSE60nnn/GSE60361/suppl/GSE60361_C1-3005-Expression.txt.gz
if not os.path.isdir(RESOURCES_FOLDER):
    os.makedirs(RESOURCES_FOLDER)
!gunzip -c ./GSE60361_C1-3005-Expression.txt.gz > $RESOURCES_FOLDER/GSE60361_C1-3005-Expression.txt
ex_matrix = pd.read_csv(SC_EXP_FNAME, sep='\t', header=0, index_col=0).T
ex_matrix.shape
!wget https://resources.aertslab.org/cistarget/motif2tf/motifs-v9-nr.hgnc-m0.001-o0.0.tbl
!wget https://resources.aertslab.org/cistarget/motif2tf/motifs-v9-nr.mgi-m0.001-o0.0.tbl
!wget https://raw.githubusercontent.com/aertslab/pySCENIC/master/resources/lambert2018.txt
BASEFOLDER_NAME = '.'

MOTIFS_HGNC_FNAME = os.path.join(BASEFOLDER_NAME, 'motifs-v9-nr.hgnc-m0.001-o0.0.tbl')
MOTIFS_MGI_FNAME = os.path.join(BASEFOLDER_NAME, 'motifs-v9-nr.mgi-m0.001-o0.0.tbl')
CURATED_TFS_HGNC_FNAME = os.path.join(BASEFOLDER_NAME, 'lambert2018.txt')

OUT_TFS_HGNC_FNAME = os.path.join(RESOURCES_FOLDER, 'hs_hgnc_tfs.txt')
OUT_TFS_HGNC_FNAME = os.path.join(RESOURCES_FOLDER, 'hs_hgnc_curated_tfs.txt')
OUT_TFS_MGI_FNAME = os.path.join(RESOURCES_FOLDER, 'mm_mgi_tfs.txt')

df_motifs_mgi = pd.read_csv(MOTIFS_MGI_FNAME, sep='\t')
mm_tfs = df_motifs_mgi.gene_name.unique()
with open(OUT_TFS_MGI_FNAME, 'wt') as f:
    f.write('\n'.join(mm_tfs) + '\n')
len(mm_tfs)

df_motifs_hgnc = pd.read_csv(MOTIFS_HGNC_FNAME, sep='\t')
hs_tfs = df_motifs_hgnc.gene_name.unique()
with open(OUT_TFS_HGNC_FNAME, 'wt') as f:
    f.write('\n'.join(hs_tfs) + '\n')
len(hs_tfs)

with open(CURATED_TFS_HGNC_FNAME, 'rt') as f:
    hs_curated_tfs = list(map(lambda s: s.strip(), f.readlines()))
len(hs_curated_tfs)

hs_curated_tfs_with_motif = list(set(hs_tfs).intersection(hs_curated_tfs))
len(hs_curated_tfs_with_motif)

with open(OUT_TFS_HGNC_FNAME, 'wt') as f:
    f.write('\n'.join(hs_curated_tfs_with_motif) + '\n')

tf_names = load_tf_names(MM_TFS_FNAME)
!wget https://resources.aertslab.org/cistarget/databases/mus_musculus/mm9/refseq_r45/mc9nr/gene_based/mm9-tss-centered-5kb-10species.mc9nr.feather
!wget https://resources.aertslab.org/cistarget/databases/mus_musculus/mm9/refseq_r45/mc9nr/gene_based/mm9-500bp-upstream-7species.mc9nr.feather
!wget https://resources.aertslab.org/cistarget/databases/mus_musculus/mm9/refseq_r45/mc9nr/gene_based/mm9-500bp-upstream-10species.mc9nr.feather
!wget https://resources.aertslab.org/cistarget/databases/mus_musculus/mm9/refseq_r45/mc9nr/gene_based/mm9-tss-centered-10kb-7species.mc9nr.feather
!wget https://resources.aertslab.org/cistarget/databases/mus_musculus/mm9/refseq_r45/mc9nr/gene_based/mm9-tss-centered-10kb-10species.mc9nr.feather
!wget https://resources.aertslab.org/cistarget/databases/mus_musculus/mm9/refseq_r45/mc9nr/gene_based/mm9-tss-centered-5kb-7species.mc9nr.feather
db_fnames = glob.glob(DATABASES_GLOB)
def name(fname):
    return os.path.splitext(os.path.basename(fname))[0]
dbs = [RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames]
dbs
adjacencies = grnboost2(ex_matrix, tf_names=tf_names, verbose=True)
with open('adjacencies.pydata','wb') as f:
    pickle.dump(adjacencies,f)
modules = list(modules_from_adjacencies(adjacencies, ex_matrix))
with open('modules.pydata','wb') as f:
    pickle.dump(modules,f)
with ProgressBar():
    df = prune2df(dbs, modules, MOTIF_ANNOTATIONS_FNAME)
with open('df.pydata','wb') as f:
    pickle.dump(df,f)
from gc import collect as gc
gc()
regulons = df2regulons(df)
with open('regulons.pydata','wb') as f:
    pickle.dump(regulons,f)
auc_mtx = aucell(ex_matrix, regulons, num_workers=4)
sns.clustermap(auc_mtx, figsize=(8,8))