R包WGCNA分析代码

WGCNA（加权基因共表达网络分析）R软件包，用于执行加权相关网络分析，包括网络构建、模块检测、基因选择、拓扑结构计算、数据模拟、可视化以及与外部软件的接口等功能。WGCNA旨在寻找协同表达的基因模块，并探索基因网络与关注的表型之间的关联，以及网络中的核心基因。推荐15个样品以上的数据。
### 1. 包的安装和加载
## 安装依赖包
if (!require("BiocManager", quietly = TRUE))
  install.packages("BiocManager")
BiocManager::install(c("GO.db", "preprocessCore", "impute"))

install.packages('WGCNA')

library(WGCNA)
packageVersion('WGCNA')
help(package='WGCNA')

### 2. 准备数据
## 读入基因表达数据,行：基因名，列：样品名
exprs_file <- "/Users/Datasets/GDC_TCGA_Rectal_Cancer_READ/gene_expression/TCGA-READ.htseq_fpkm-uq.tsv"
exprs_df <- read.table(exprs_file,sep="\t",head = TRUE,row.names=1)
exprs_df[1:3,1:5]
dim(exprs_df)
names(exprs_df)

## 转化为行为样本名，列为基因名的数据框
datExpr <- as.data.frame(t(as.matrix(exprs_df)))
datExpr[1:3,1:5]

## 读入表型数据
clin_file <- "/Users/Datasets/GDC_TCGA_Rectal_Cancer_READ/Phenotype/TCGA-READ.GDC_phenotype.tsv"
# Display the current working directory
datClin <- read.table(clin_file,sep="\t",head = TRUE,row.names=1)
datClin[1:3,1:5]
# 行名转化 TCGA.AG.A025.01A -> TCGA-AG-A025-01A
library(stringr)
rownames(datClin) <- str_replace_all(rownames(datClin), "-", ".")
samples <-  intersect(rownames(datExpr),rownames(datClin))

## 最终用于演示的基因表达数据
datExpr <- datExpr[samples,]
datExpr <- datExpr[,1:1000]
dim(datExpr) # 90个样本，1000个基因

## 最终用于演示的表型数据
features <- c('bmi.exposures',
              'height.exposures',
              'weight.exposures')

# features <- c('tumor_grade.diagnoses',
#               'tumor_stage.diagnoses',
#               'lymphatic_invasion')
datClin <- datClin[samples,features]
# names(datClin)
dim(datClin)
# 填充缺省值
datClin[which(datClin[,'bmi.exposures']==NA),] = 20.0
datClin[which(datClin[,'height.exposures']==NA),] = 175.5

### 3.离群点检查
table(is.na(datExpr))
# 检查是否有离群值，
gsg = goodSamplesGenes(datExpr, verbose = 3)
class(gsg) #[1] "list"
gsg$allOK
# 去除有离群值的行（sample）和列（gene）
if (!gsg$allOK)
{
  # Optionally, print the gene and sample names that were removed:
  if (sum(!gsg$goodGenes)>0)
    printFlush(paste("Removing genes:", paste(names(datExpr)[!gsg$goodGenes], collapse = ", ")));
  if (sum(!gsg$goodSamples)>0)
    printFlush(paste("Removing samples:", paste(rownames(datExpr)[!gsg$goodSamples], collapse = ", ")));
  # Remove the offending genes and samples from the data:
  datExpr = datExpr[gsg$goodSamples, gsg$goodGenes]
}
dim(datExpr)
# 聚类样本，观察是否有离群值或异常值。
sampleTree = hclust(dist(datExpr), method = "average")
sizeGrWindow(12,9) #视图
par(cex = 0.6);
par(mar = c(0,4,2,0))
plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5,
     cex.axis = 1.5, cex.main = 2)
# names(phenotype_df)

## 可视化表型数据与基因表达量数据的联系
sampleTree2 = hclust(dist(datExpr), method = "average")
traitColors = numbers2colors(datClin, signed = FALSE) # 数值到颜色映射
# traitColors = labels2colors(datClin) # 将标签转换为颜色
plotDendroAndColors(sampleTree2, traitColors,
                    groupLabels = names(datClin),
                    main = "Sample dendrogram and trait heatmap")

### 4. 挑选软阈值
## 挑选软阈值是构建网络拓扑分析的关键，
## 挑选合适的软阈值使得网络具备无标度拓扑结构
enableWGCNAThreads() # 开启多线程
powers = c(c(1:10), seq(from = 12, to=20, by=2))
# 软阈值的无标度拓扑分析
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
sft$powerEstimate  # [1] 3
# 可视化
sizeGrWindow(9, 5)
par(mfrow = c(1,2))
cex1 = 0.9
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
     main = paste("Scale independence"))

text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     labels=powers,cex=cex1,col="red");
abline(h=0.85,col="red")  # 查看位于0.85以上的点，可以改变高度值
# 一般选择在0.9以上的，第一个达到0.9以上数值。
# 如果没有，可以降低，但不要低于0.8

## 如果sft$powerEstimate显示的结果为NA，则手动挑选

## 平均连接度
plot(sft$fitIndices[,1], sft$fitIndices[,5],
     xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
     main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")

### 5. 共表达网络构建
# 自动化网络构建和模块检测
net = blockwiseModules(datExpr, power = 3,
                       TOMType = "unsigned", minModuleSize = 30,
                       reassignThreshold = 0, mergeCutHeight = 0.15,
                       numericLabels = TRUE, pamRespectsDendro = FALSE,
                       #saveTOMs = TRUE,
                       #saveTOMFileBase = "TCGA_READ_TOM",
                       verbose = 3)

# power = 3 为挑选的软阈值
#minModuleSize：模块中最少的基因数
#mergeCutHeight ：模块合并阈值，阈值越大，模块越少（重要）
#saveTOMs = TRUE,saveTOMFileBase = "femaleMouseTOM"保存TOM矩阵，名字为"femaleMouseTOM"
#net$colors 包含模块分配，net$MEs 包含模块的模块特征基因。

table(net$colors) # 模块数

# 模块的层次聚类树状图
sizeGrWindow(12, 9)
mergedColors = labels2colors(net$colors)
plotDendroAndColors(net$dendrograms[[1]], mergedColors[net$blockGenes[[1]]],
                    "Module colors",
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)

## 保存分配模块和模块包含的基因信息。
# moduleLabels = net$colors
# moduleColors = labels2colors(net$colors)
# MEs = net$MEs; # 主成分
# geneTree = net$dendrograms[[1]];
# save(MEs, moduleLabels, moduleColors, geneTree,
#      file = "TCGA_READ-auto.RData")

### 6.模块-表型数据关联
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)
# 重新计算带有颜色标签的模块
table(moduleColors)
# 计算数据集中各个基因模块的第一主成分
MEs0 = moduleEigengenes(datExpr, moduleColors)$eigengenes
# 排序，将相近的特征向量放在一起
MEs = orderMEs(MEs0)
# moduleTraitCor = cor(MEs, use = "p")
moduleTraitCor = cor(MEs, datClin, use = "p")
# datClin 必需是数值型
moduleTraitPvalue = corPvalueStudent(moduleTraitCor, nSamples)
# 通过相关值对每个关联进行颜色编码
sizeGrWindow(10,6)
# 展示模块与表型数据的相关系数和 P值
textMatrix = paste(signif(moduleTraitCor, 2), "\n(",
                   signif(moduleTraitPvalue, 1), ")", sep = "");
dim(textMatrix) = dim(moduleTraitCor)
par(mar = c(6, 8.5, 3, 3))
# 用热图的形式展示相关系数
labeledHeatmap(Matrix = moduleTraitCor,
               xLabels = names(datClin),
               yLabels = names(MEs),
               ySymbols = names(MEs),
               colorLabels = FALSE,
               #colors = greenWhiteRed(50),
               colors = blueWhiteRed(50), 
               textMatrix = textMatrix,
               setStdMargins = FALSE,
               cex.text = 0.5,
               zlim = c(-1,1),
               main = paste("Module-trait relationships"))

### 7. 模块成员（基因）和模块的相关性
## 模块eigengene的值代表模块
weight = as.data.frame(datClin$weight.exposures)
names(weight) = "weight"
modNames = substring(names(MEs), 3)
geneModuleMembership = as.data.frame(cor(datExpr, MEs, use = "p"))

#rownames(MEs)[1:10]
#rownames(datExpr)[1:10]
# 计算p值
MMPvalue = as.data.frame(corPvalueStudent(as.matrix(geneModuleMembership), nSamples));
names(geneModuleMembership) = paste("MM", modNames, sep="")
names(MMPvalue) = paste("p.MM", modNames, sep="")

### 8. 模块成员（基因）和表型数据的相关性
## 基因的显著性GS定义为基因与性状的相关性（绝对值）
geneTraitSignificance = as.data.frame(cor(datExpr, weight, use = "p")) #和体重性状的关联
# 计算p值
GSPvalue = as.data.frame(corPvalueStudent(as.matrix(geneTraitSignificance), nSamples))
names(geneTraitSignificance) = paste("GS.", names(weight), sep="")
names(GSPvalue) = paste("p.GS.", names(weight), sep="")

### 9. MM-GS图
## 可视化GS和MM
module = "turquoise"
column = match(module, modNames)
moduleGenes = moduleColors==module
sizeGrWindow(7, 7)
par(mfrow = c(1,1))
verboseScatterplot(abs(geneModuleMembership[moduleGenes, column]),
                    abs(geneTraitSignificance[moduleGenes, 1]),
                    xlab = paste("Module Membership in", module, "module"),
                    ylab = "Gene significance (GS)",
                    main = paste("Module membership vs. gene significance\n"),
                    cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2, col = module)
##图中的每一个点代表一个基因，横坐标值表示基因与模块的相关性，
##纵坐标值表示基因与表型性状的相关性

### 10. 筛选和表型性状显著相关的基因(Hub基因)
geneInfo = data.frame(moduleColor = moduleColors,
                      geneTraitSignificance,
                      GSPvalue)
geneInfo[1:5,]

#head(GSPvalue)
#head(geneTraitSignificance)

## 模块和表型性状（weight）的相关性，并排序
modOrder = order(-abs(cor(MEs, weight, use = "p")))
head(MEs)
# 添加模块成员的信息：
for (mod in 1:ncol(geneModuleMembership))
{
  oldNames = names(geneInfo)
  geneInfo = data.frame(geneInfo, geneModuleMembership[, modOrder[mod]],
                         MMPvalue[, modOrder[mod]])
  names(geneInfo) = c(oldNames, paste("MM.", modNames[modOrder[mod]], sep=""),
                       paste("p.MM.", modNames[modOrder[mod]], sep=""))
}
head(geneInfo)

geneOrder = order(geneInfo$moduleColor, -abs(geneInfo$GS.weight))  # 排序
geneInfo = geneInfo[geneOrder, ]
## 输出为CSV格式
#write.csv(geneInfo, file = "geneInfo.csv")

## 筛选hub基因
sigLevel = 0.001
sigGeneInfo <- geneInfo[which(geneInfo$p.GS.weight < sigLevel),]
dim(sigGeneInfo)
hubGenes <- rownames(sigGeneInfo)

### 11. Hub基因富集分析
# GO、KEGG、MSigDB通路富集分析

### 12. 网络可视化
## TOM (Topological overlap matrix)：把邻接矩阵转换为拓扑重叠矩阵，
## 以降低噪音和假相关，获得的新距离矩阵，
## 这个信息可拿来构建网络或绘制TOM图。
#计算TOM矩阵
plotTOM = TOMsimilarityFromExpr(datExpr, power = 3)
# 网络热图：在热图中，高度共表达的相互连通性由逐渐饱和的黄色和红色表示。
TOMplot(plotTOM, geneTree, moduleColors, main = "Network heatmap plot, all genes")

sizeGrWindow(7,7)
# The following shows the correlations between the hub genes
plotNetworkHeatmap(datExpr, plotGenes = hubGenes,
                   networkType="signed", useTOM=FALSE,
                   power=1, main="signed correlations")


### 13.绘制模块之间相关性图
#pdf(file="Eigengene dendrogram and Eigengene adjacency heatmap.pdf", width=5, height=7.5) 
par(cex = 0.9) 
plotEigengeneNetworks(MEs, "", marDendro = c(0,4,1,5), 
                      marHeatmap = c(3,3,1,1), cex.lab = 0.8, 
                      xLabelsAngle= 90) 

## margin：上右下左
#dev.off()

# # 只画模块聚类图
# sizeGrWindow(6,6);
# par(cex = 1.0)
# plotEigengeneNetworks(MEs, "Eigengene dendrogram", marDendro = c(0,4,2,0),
#                       plotHeatmaps = FALSE)
# MEDissThres = 0.8######剪切高度可修改 
# # Plot the cut line into the dendrogram 
# abline(h=MEDissThres, col = "red") 

# # 只画热图
# plotEigengeneNetworks(MEs, "Eigengene adjacency heatmap", marHeatmap = c(3,4,2,2),
#                       plotDendrograms = FALSE, xLabelsAngle = 90)

### 14.网络输出
TOM = TOMsimilarityFromExpr(datExpr, power = 3)
#dim(TOM)
#dim(datExpr)
rownames(TOM) = colnames(TOM) = colnames(datExpr)
hubTOM = TOM[hubGenes, hubGenes]
dim(hubTOM)
# 导出到网络可视化软件Cytoscape
cyt = exportNetworkToCytoscape(hubTOM,
                               edgeFile = paste("CytoscapeInput-edges-", paste("hubGenes", collapse="-"), ".txt", sep=""),
                               nodeFile = paste("CytoscapeInput-nodes-", paste("hubGenes", collapse="-"), ".txt", sep=""),
                               weighted = TRUE,
                               threshold = 0.01,
                               nodeNames = hubGenes)
参考：
https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-9-559
https://horvath.genetics.ucla.edu/html/CoexpressionNetwork/Rpackages/WGCNA/Tutorials/index.html
https://zhuanlan.zhihu.com/p/425952397
https://zhuanlan.zhihu.com/p/503837859?utm_id=0
https://www.zhihu.com/tardis/zm/art/469889443?source_id=1005
R包WGCNA分析代码

悦读