官网 下载zipped data sets 和Male data ,unzip解压,基本概念见此 。
数据输入与清洗 1 2 3 4 5 6 7 8 9 library( WGCNA) enableWGCNAThreads( ) femData = read.csv( "LiverFemale3600.csv" ) femData datExpr0 = as.data.frame( t( femData[ , - c ( 1 : 8 ) ] ) ) colnames( datExpr0) = femData$ substanceBXH rownames( datExpr0) = colnames( femData) [ - c ( 1 : 8 ) ] datExpr0
上面的代码可以得到一个列名为基因,行名为样本的表达矩阵,表达量已经归一化。
1 2 3 4 5 6 7 8 9 10 11 12 gsg = goodSamplesGenes( datExpr0, verbose = 3 ) ; gsg$ allOKif ( ! gsg$ allOK) { if ( sum ( ! gsg$ goodGenes) > 0 ) printFlush( paste( "Removing genes:" , paste( names ( datExpr0) [ ! gsg$ goodGenes] , collapse = ", " ) ) ) ; if ( sum ( ! gsg$ goodSamples) > 0 ) printFlush( paste( "Removing samples:" , paste( rownames( datExpr0) [ ! gsg$ goodSamples] , collapse = ", " ) ) ) ; datExpr0 = datExpr0[ gsg$ goodSamples, gsg$ goodGenes] }
上面的代码用于删除存在缺失值的基因和样本
1 2 3 4 5 6 7 8 9 10 sampleTree = hclust( dist( datExpr0) , method = "average" ) plot( sampleTree, main = "Sample clustering to detect outliers" , sub= "" , xlab= "" , cex.lab = 1.5 , cex.axis = 1.5 , cex.main = 2 ) abline( h = 15 , col = "red" ) clust = cutreeStatic( sampleTree, cutHeight = 15 , minSize = 10 ) table( clust) keepSamples = ( clust== 1 ) datExpr = datExpr0[ keepSamples, ]
上面的代码用于删除离群样本
1 2 3 4 sampleTree = hclust( dist( datExpr0) , method = "ward.D2" ) p <- ggtree:: ggtree( ape:: as.phylo( sampleTree) , linetype = "dashed" , layout = "circular" ) p <- p + ggtree:: geom_tiplab( ) p
画一个更好看的聚类图
1 2 3 4 5 traitData = read.csv( "ClinicalTraits.csv" ) ; allTraits = traitData[ , - c ( 31 , 16 ) ] ; allTraits = allTraits[ , c ( 2 , 11 : 36 ) ] ; allTraits
读取性状数据
1 2 3 4 5 6 7 8 9 10 f_rm_colN <- function ( df, regex) { df[ , ! grepl( regex, colnames( df) ) ] } traitRows = match( rownames( datExpr) , allTraits$ Mice) datTraits = allTraits[ traitRows, ] rownames( datTraits) = datTraits$ Mice datTraits <- f_rm_colN( datTraits, 'Mice' ) datTraits collectGarbage( )
清洗性状数据
1 2 3 4 5 sampleTree = hclust( dist( datExpr) , method = "average" ) traitColors = numbers2colors( datTraits, signed = FALSE ) plotDendroAndColors( sampleTree, traitColors, groupLabels = colnames( datTraits) , main = "Sample dendrogram and trait heatmap" )
颜色越深,代表这个表型数据与这个样本的基因表达量关系越密切。
选择合适的β值 1 2 3 4 5 6 7 8 powers = c ( c ( 1 : 10 ) , seq( from = 12 , to= 20 , by= 2 ) ) sft = pickSoftThreshold( datExpr, powerVector = powers, verbose = 5 ) plot( sft$ fitIndices[ , 1 ] , - sign ( sft$ fitIndices[ , 3 ] ) * sft$ fitIndices[ , 2 ] , xlab= "Soft Threshold (power)" , ylab= "Scale Free Topology Model Fit,signed R^2" , type= "n" , main = paste( "Scale independence" ) ) ; text( sft$ fitIndices[ , 1 ] , - sign ( sft$ fitIndices[ , 3 ] ) * sft$ fitIndices[ , 2 ] , labels= powers, cex= 0.9 , col= "red" ) ; abline( h= 0.90 , col= "red" )
上面的代码用于绘制无标度拓扑拟合指数图,输入的数据为清洗好的表达矩阵。一般选择在0.9以上的,第一个达到0.9以上数值,作为β值。
1 2 3 4 plot( sft$ fitIndices[ , 1 ] , sft$ fitIndices[ , 5 ] , xlab= "Soft Threshold (power)" , ylab= "Mean Connectivity" , type= "n" , main = paste( "Mean connectivity" ) ) text( sft$ fitIndices[ , 1 ] , sft$ fitIndices[ , 5 ] , labels= powers, cex= 0.9 , col= "red" )
β值选接近平缓的第一个值,网络的连通性较好。sft$powerEstimate
可以获得推荐值
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 nSamples = nrow( datExpr) type = "unsigned" if ( is.na ( power) ) { power = ifelse( nSamples< 20 , ifelse( type == "unsigned" , 9 , 18 ) , ifelse( nSamples< 30 , ifelse( type == "unsigned" , 8 , 16 ) , ifelse( nSamples< 40 , ifelse( type == "unsigned" , 7 , 14 ) , ifelse( type == "unsigned" , 6 , 12 ) ) ) ) }
计算邻接矩阵 1 2 3 4 5 corFnc = "cor" corOptions = list ( use = "p" ) corFnc = "bicor" corOptions = list ( maxPOutliers = 0.05 , use = "p" ) A = adjacency( datExpr, type = "unsigned" , power = 6 , corFnc = corFnc, corOptions = corOptions) ;
计算拓扑重叠矩阵 和节点相异度矩阵 1 2 TOM = TOMsimilarity( A) dissTOM = 1 - TOM
相异度矩阵聚类分析以鉴定网络模块 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 geneTree = hclust( as.dist( dissTOM) , method = "average" ) ; plot( geneTree, xlab= "" , sub= "" , main = "Gene clustering on TOM-based dissimilarity" , labels = FALSE , hang = 0.04 ) ; minModuleSize = 30 ; dynamicMods = cutreeDynamic( dendro = geneTree, distM = dissTOM, deepSplit = 2 , pamRespectsDendro = FALSE , minClusterSize = minModuleSize) ; table( dynamicMods) dynamicColors = labels2colors( dynamicMods) table( dynamicColors) plotDendroAndColors( geneTree, dynamicColors, "Dynamic Tree Cut" , dendroLabels = FALSE , hang = 0.03 , addGuide = TRUE , guideHang = 0.05 , main = "Gene dendrogram and module colors" )
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 MEList = moduleEigengenes( datExpr, colors = dynamicColors) MEs = MEList$ eigengenes MEDiss = 1 - cor( MEs) ; METree = hclust( as.dist( MEDiss) , method = "average" ) ; plot( METree, main = "Clustering of module eigengenes" , xlab = "" , sub = "" ) MEDissThres = 0.25 abline( h= MEDissThres, col = "red" ) merge = mergeCloseModules( datExpr, dynamicColors, cutHeight = MEDissThres, verbose = 3 ) mergedColors = merge$ colors; mergedMEs = merge$ newMEs; plotDendroAndColors( geneTree, cbind( dynamicColors, mergedColors) , c ( "Dynamic Tree Cut" , "Merged dynamic" ) , dendroLabels = FALSE , hang = 0.03 , addGuide = TRUE , guideHang = 0.05 ) moduleColors = mergedColors colorOrder = c ( "grey" , standardColors( 50 ) ) ; moduleLabels = match( moduleColors, colorOrder) - 1 ; MEs = mergedMEs;
关联网络模块与性状 1 2 3 moduleTraitCor = cor( MEs, datTraits, use = "p" ) ; moduleTraitPvalue = corPvalueStudent( moduleTraitCor, nSamples) ;
1 2 3 4 5 6 7 8 9 10 11 12 13 14 textMatrix = paste( signif ( moduleTraitCor, 2 ) , "\n(" , signif ( moduleTraitPvalue, 1 ) , ")" , sep = "" ) ; labeledHeatmap( Matrix = moduleTraitCor, xLabels = names ( datTraits) , yLabels = names ( MEs) , ySymbols = names ( MEs) , colorLabels = FALSE , colors = blueWhiteRed( 50 ) , textMatrix = textMatrix, setStdMargins = FALSE , cex.text = 0.5 , zlim = c ( - 1 , 1 ) , main = paste( "Module-trait relationships" ) )
越红的模块越正相关,越蓝的模块越负相关。
关联网络模块与基因 1 2 3 4 geneModuleMembership = as.data.frame( cor( datExpr, MEs, use = "p" ) ) ; MMPvalue = as.data.frame( corPvalueStudent( as.matrix( geneModuleMembership) , nSamples) ) ;names ( geneModuleMembership) = paste( "MM" , substring( names ( MEs) , 3 ) , sep= "" ) ;names ( MMPvalue) = paste( "p.MM" , substring( names ( MEs) , 3 ) , sep= "" ) ;
关联基因与性状 1 2 3 4 5 weight = as.data.frame( datTraits$ weight_g) ; geneTraitSignificance = as.data.frame( cor( datExpr, weight, use = "p" ) ) ; GSPvalue = as.data.frame( corPvalueStudent( as.matrix( geneTraitSignificance) , nSamples) ) ;names ( geneTraitSignificance) = paste( "GS." , names ( weight) , sep= "" ) ;names ( GSPvalue) = paste( "p.GS." , names ( weight) , sep= "" ) ;
可视化基因与模块、性状的相关性 1 2 3 4 5 6 7 8 9 10 11 module = "blue" column = match( module, substring( names ( MEs) , 3 ) ) ; moduleGenes = moduleColors== module; par( mfrow = c ( 1 , 1 ) ) ; verboseScatterplot( abs ( geneModuleMembership[ moduleGenes, column] ) , abs ( geneTraitSignificance[ moduleGenes, 1 ] ) , xlab = paste( "Module Membership in" , module, "module" ) , ylab = "Gene significance for body weight" , main = paste( "Module membership vs. gene significance\n" ) , cex.main = 1.2 , cex.lab = 1.2 , cex.axis = 1.2 , col = module)
MM-GS图中的每一个点代表一个基因,横坐标值表示基因与模块的相关性,纵坐标值表示基因与表型性状的相关性,与性状高度显著相关的基因往往是与这个性状显著相关的模块中的重要元素。
获得基因注释结果 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 annot = read.csv( file = "GeneAnnotation.csv" ) ; probes2annot = match( names ( datExpr) , annot$ substanceBXH) ;sum ( is.na ( probes2annot) ) geneInfo0 = data.frame( substanceBXH = names ( datExpr) , geneSymbol = annot$ gene_symbol[ probes2annot] , LocusLinkID = annot$ LocusLinkID[ probes2annot] , moduleColor = moduleColors, geneTraitSignificance, GSPvalue) ; modOrder = order( - abs ( cor( MEs, weight, use = "p" ) ) ) ; for ( mod in 1 : ncol( geneModuleMembership) ) { oldNames = names ( geneInfo0) geneInfo0 = data.frame( geneInfo0, geneModuleMembership[ , modOrder[ mod] ] , MMPvalue[ , modOrder[ mod] ] ) ; names ( geneInfo0) = c ( oldNames, paste( "MM." , modNames[ modOrder[ mod] ] , sep= "" ) , paste( "p.MM." , modNames[ modOrder[ mod] ] , sep= "" ) ) } geneOrder = order( geneInfo0$ moduleColor, - abs ( geneInfo0$ GS.datTraits.weight_g) ) ; geneInfo = geneInfo0[ geneOrder, ] geneInfo
上述基因可以按模块拆分,然后进行富集分析
可视化 基因网络
TOMplot(dissTOM^7, geneTree, moduleColors, main = “Network heatmap plot, all genes”)
1 2 3 4 5 select = moduleGenes selectTOM = dissTOM[ select, select] ; selectTree = hclust( as.dist( selectTOM) , method = "average" ) selectColors = moduleColors[ select] ; TOMplot( selectTOM^ 7 , selectTree, selectColors, main = paste0( "Network heatmap plot, " , module, ' module gene' ) )
表征基因网络 1 2 3 4 5 6 7 8 names ( weight) = "weight" MET = orderMEs( cbind( MEs, weight) ) plotEigengeneNetworks( MET, "" , marDendro = c ( 0 , 4 , 1 , 2 ) , marHeatmap = c ( 3 , 4 , 1 , 2 ) , cex.lab = 0.8 , xLabelsAngle = 90 ) plotEigengeneNetworks( MET, "Eigengene dendrogram" , marDendro = c ( 0 , 4 , 2 , 0 ) , plotHeatmaps = FALSE ) plotEigengeneNetworks( MET, "Eigengene adjacency heatmap" , marHeatmap = c ( 3 , 4 , 2 , 2 ) , plotDendrograms = FALSE , xLabelsAngle = 90 )
从分层聚类图有一个叶子是weight,从中可以看出weight与哪些模块更接近
导出网络 Cytoscape 源自系统生物学,用于将生物分子交互网络与高通量基因表达数据和其他的分子状态信息整合在一起,可以用于大规模蛋白质-蛋白质相互作用、蛋白质-DNA和遗传交互作用的分析 。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 modules = c ( "blue" , "magenta" ) ; inModule = is.finite ( match( moduleColors, modules) ) ; modTOM = TOM[ inModule, inModule] ; modProbes = names ( datExpr) [ inModule] ;dimnames ( modTOM) = list ( modProbes, modProbes) modGenes = annot$ gene_symbol[ match( modProbes, annot$ substanceBXH) ] ; cyt = exportNetworkToCytoscape( modTOM, edgeFile = paste( "CytoscapeInput-edges-" , paste( modules, collapse= "-" ) , ".txt" , sep= "" ) , nodeFile = paste( "CytoscapeInput-nodes-" , paste( modules, collapse= "-" ) , ".txt" , sep= "" ) , weighted = TRUE , threshold = 0.02 , nodeNames = modProbes, altNodeNames = modGenes, nodeAttr = moduleColors[ inModule] )