【原】9步驟完成單細胞數(shù)據(jù)挖掘文章全部圖表復現(xiàn)

健明 2023-08-04 發(fā)布于廣東

展開全文

兩個月前我們更新了優(yōu)秀的馬拉松學員筆記：單細胞+芯片+轉(zhuǎn)錄組測序的數(shù)據(jù)挖掘文章一比一復現(xiàn)，內(nèi)容非常詳實，理論上該流程可以復用到然后一個癌癥或者其它疾病，大綱也很清晰（單細胞數(shù)據(jù)分析本身往往是數(shù)據(jù)挖掘課題的一個環(huán)節(jié)而已），接下來繼續(xù)更新另外一個癌癥的復現(xiàn)，大綱不一樣哦，大家一定要抽空刷完這波：

文章圖表復現(xiàn)： Integration of scRNA-Seq and Bulk RNA-Seq to Analyse the Heterogeneity of Ovarian Cancer Immune Cells and Establish a Molecular Risk Model

文章大概思路如下：

對ssRNA數(shù)據(jù)降維、分群、注釋
用GSVA對鑒定到的T細胞，B細胞，髓系細胞繼續(xù)細分，發(fā)現(xiàn)了兩種關(guān)鍵的細胞類型
對TCGA數(shù)據(jù)進行免疫浸潤分析，發(fā)現(xiàn)其中一種細胞的比例與生存密切相關(guān)
用WGCNA分析找到與該細胞相關(guān)的hub gene
下載imvigor 210隊列數(shù)據(jù)，用SVM模型預測免疫表型
用NMF把TCGA樣本分為兩個亞組，該細胞在這兩個亞組的比例有顯著差異
批量單因素cox回歸找出hub gene中與生存相關(guān)的gene
lasso回歸進一步篩選gene
多因素cox回歸，計算riskScore，riskScore與生存信息的相關(guān)性

具體復現(xiàn)流程

1. 細胞分群注釋

首先下載數(shù)據(jù)，文章中用了兩個scRNA數(shù)據(jù)集，GSE154600和GES158937

普通點擊下載太慢了，點進NCBI的FTP站點，在linux上用axel下載會快很多

網(wǎng)頁顯示

把數(shù)據(jù)整理成下圖標準格式就可以讀取了

數(shù)據(jù)標準格式

library(Seurat)
library(stringr)
library(clustree)
library(dplyr)
library(GSVA)
library(msigdbr)
library(GSEABase)
library(pheatmap)
library(harmony)
rm(list = ls())
gc()

# 1. 循環(huán)讀取每個樣本的數(shù)據(jù)
filename <- paste('processed_data/',list.files('processed_data/'),sep = '')
sceList <- lapply(filename, function(x){
  obj <- CreateSeuratObject(counts = Read10X(x),
                            project = str_split(x,'/')[[1]][2])
})
names(sceList) <- list.files('processed_data/')
sce <- merge(sceList[[1]],sceList[-1],add.cell.ids = names(sceList),project='OC')
dim(sce) # 38224個gene， 67323個cell

簡單過濾后，看下PCA，有明顯批次效應

# 2. 查看線粒體(MT開頭)、核糖體(RPS/RPL開頭)基因占比
grep('^MT',x=rownames(sce@assays$RNA@data),value = T)
grep('^RP[SL]',x=rownames(sce@assays$RNA@data),value = T)

sce <- PercentageFeatureSet(sce,pattern = '^MT',col.name = 'percent.MT')
sce <- PercentageFeatureSet(sce,pattern = '^RP[SL]',col.name = 'percent.RP')
VlnPlot(sce,features = 'percent.MT',pt.size = 0)
VlnPlot(sce,features = 'percent.RP',pt.size = 0)
VlnPlot(sce,features = 'nCount_RNA',pt.size = 0)
VlnPlot(sce,features = 'nFeature_RNA',pt.size = 0)
# 3. 過濾細胞
sce <- subset(sce,subset = nCount_RNA>3000 & nFeature_RNA>300 & percent.MT<40)
dim(sce) # 38224個gene， 29389個cell
# 4. 過濾gene
sce <- sce[rowSums(sce@assays$RNA@counts>0)>3,]
dim(sce) # 28074個gene， 29389個cell

# 5. 看下批次效應
sce <- NormalizeData(sce) # 默認用logNormalize
sce <- FindVariableFeatures(sce) # 默認選Top2000作為高變gene
sce <- ScaleData(sce) # 默認用variableFeature進行scale
sce <- RunPCA(sce) # 默認用variableFeature進行PCA降維，降維前一定要做scale
DimPlot(sce,reduction = 'pca',group.by = 'orig.ident')

原文中用SCTransform進行數(shù)據(jù)標準化，那再用SCTransform做一遍

運行完了后，在assay下面，除了RNA外還多了一個SCT，然后默認的assay也變成了'SCT’

再看看PCA結(jié)果，批次效應貌似也好了些

# 6. SCTransform進行標準化
sce <- SCTransform(sce) # 這一步包含了NormalizeData 、ScaleData、FindVariableFeatures三步
DefaultAssay(sce)
sce <- RunPCA(sce)
DimPlot(sce,reduction = 'pca',group.by = 'orig.ident')
ElbowPlot(sce,ndims = 40)

這里選擇resolution為0.4，20個cluster，跟文章中差不多

從樣本進行區(qū)分的umap圖上來看，貌似還是有批次效應，其實可以再SCTransform之前再增加一步去批次效應的步驟，這里就略過了

# 7. 在PCA基礎上進行umap降維,然后分群
sce <- RunUMAP(sce,reduction = 'pca',dims = 1:40)
sce <- FindNeighbors(sce,dims = 1:40)

sce_res <- sce
for (i in c(0.01, 0.05, 0.1, 0.15, 0.2, 0.3,0.4, 0.5,0.8,1)){
  sce_res <- FindClusters(sce_res,resolution = i)
}
clustree(sce_res,prefix = 'SCT_snn_res.')

sce <- FindClusters(sce,resolution = 0.4)
DimPlot(sce,reduction = 'umap',group.by = 'seurat_clusters',label = T)

細胞注釋完了后，把每個細胞的注釋結(jié)果添加到metadata里

# 8. 手動進行細胞注釋

# 免疫細胞  5,6,8,9,15,20 
DotPlot(sce,features = c('PTPRC','CD45'))
# T細胞   0,[8],9,13
DotPlot(sce,features = c('CD3D','CD3E','CD8A')) 
# B細胞   14,16,18,[20]
DotPlot(sce,features = c('CD79A', 'CD37', 'CD19', 'CD79B', 'MS4A1','CD20'))
# 漿細胞 [14,16,18]
DotPlot(sce,features = c('IGHG1','MZB1','SDC1','CD79A'))
# NK細胞  5,[9,15]
DotPlot(sce,features = c('FGFBP2','FCG3RA','CX3CR1'))
# NK細胞  8,15
DotPlot(sce,features = c('CD160','NKG7','GNLY','CD247','CCL3','GZMB','CXCR1','TYOB','PRF1'))
# 內(nèi)皮細胞 [11]
DotPlot(sce,features = c('PECAM1','VWF'))
# 成纖維細胞 [0,7],10,11
DotPlot(sce,features = c('FGF7','MME','DCN','LUM','GSN','PF4','PPBP'))
# 上皮細胞 1,2,[3],4,[12,13,17,19]
DotPlot(sce,features = c('EPCAM','KRT19','PROM1','ALDH1A1','CD24'))
# 單核細胞和巨噬細胞 5,6,9,15
DotPlot(sce,features = c('CD68','CD163','CD14','LYZ'))
# Ovarian somatic cell  [13]
DotPlot(sce,features = c('LGR5'))

marker <- data.frame(cluster = 0:20,cell = 0:20)
marker[marker$cluster %in% c(5,6,8,9,15,20),2] <- 'Immune cells'
marker[marker$cluster %in% c(0,8,9,13),2] <- 'T cells'
marker[marker$cluster %in% c(14,16,18,20),2] <- 'B cells'
marker[marker$cluster %in% c(14,16,18),2] <- 'plasma cells'
marker[marker$cluster %in% c(5,9,15),2] <- 'NK cells'
marker[marker$cluster %in% c(11),2] <- 'Endothelial cells'
marker[marker$cluster %in% c(5,6),2] <- 'myeloid cells'
marker[marker$cluster %in% c(1,2,3,4,12,13,17,19),2] <- 'epithelial cell'
marker[marker$cluster %in% c(0,7,10),2] <- 'fibroblast'
marker[marker$cluster %in% c(13),2] <- 'Ovarian somatic cell'

sce@meta.data$cell_type <- sapply(sce@meta.data$seurat_clusters,function(x){marker[x,2]})
DimPlot(sce,reduction = 'umap',group.by = 'cell_type',label = T)

# 9. marker gene 熱圖展示
heatmap_marker_gene <- c('CD3D','CD3E','CD8A',
                         'CD79A', 'CD19', 'CD79B', 'MS4A1','CD20',
                         'MZB1','SDC1','CD79A',
                         'CD68','CD163','CD14','LYZ')
heatmap_cells <- rownames(sce@meta.data[sce@meta.data$cell_type %in% c("myeloid cells","B cells","T cells"),])
sub_sce <- subset(sce,cells = heatmap_cells)
DoHeatmap(sub_sce,features = heatmap_marker_gene,group.by = 'cell_type',size = 3)

2.用GSVA對鑒定到的T細胞，B細胞，髓系細胞繼續(xù)細分，發(fā)現(xiàn)了兩種關(guān)鍵的細胞類型

# 10. GSVA分析

Hallmark_gene_set <- msigdbr(species = 'Homo sapiens',category = 'H')
Hallmark_list <- split(Hallmark_gene_set$gene_symbol,Hallmark_gene_set$gs_name)

gsva_res <- gsva(expr = sce@assays$SCT@scale.data,
                 gset.idx.list = Hallmark_list,
                 method = 'ssgsea',
                 kcdf='Poisson',
                 parallel.sz = parallel::detectCores())
pheatmap(gsva_res,fontsize_row = 4,
         height = 10,
         width=18,
         annotation_col = data.frame(row.names = colnames(sce),group=sce@meta.data$orig.ident),
         show_colnames = F,
         show_rownames = T)

文中作者鑒定出T細胞，B細胞和髓系細胞后，用GSVA再進行細分，采用的基因集就是msigdb的Hallmark gene set。

這里讀入的hallmark_gene_set是一個8209*15的data frame，找出它的gene symbol和gene set name這兩列，用split函數(shù)把它轉(zhuǎn)換成一個列表，這樣hallmark_list是一個長度為length(unique(Hallmark_gene_set$gs_name))的list，其中hallmark_list的每一項都是某一gs_name對應的所有g(shù)ene_symbol，只需要把表達矩陣和gene list傳遞給gsva函數(shù)即可進行g(shù)sva分析

但是簡單看了下hallmark gene set的名字，根據(jù)這些gene set似乎沒法對T細胞和B細胞再進行細分，不太清楚作者是怎么做的

于是發(fā)現(xiàn)了另一個基因集C8

一共700個基因集，看下和T細胞，B細胞相關(guān)的基因集

并不是很多，和OV相關(guān)的也沒有，感覺也不太合適，于是決定上cellmarker自己下載基因集

cell_type_gene_set <- msigdbr(species = 'Homo sapiens',category = 'C8')
cell_type_list <- split(cell_type_gene_set$gene_symbol,cell_type_gene_set$gs_name)
names(cell_type_list)

搜出來直接點擊就可以下載了

t_cell_marker <- read.table('CellMarker_T_cell.csv',sep = ',',header = 1)
b_cell_marker <- read.table('CellMarker_B_cell.csv',sep = ',',header = 1)
macrophage_marker <- read.table('CellMarker_Macrophage.csv',sep = ',',header = 1)

讀進來是這樣的一個表格，species中除了人還有小鼠的，就根據(jù)cell type和cell marker這兩列去構(gòu)建gene list，只要保證最后構(gòu)建出來的gene list的每一項是一個包含gene symbol的vector，然后每一項的名字是對應的細胞類型就可以了，我寫的比較復雜，大家可以自己發(fā)揮

需要注意的有幾點：

cellmarker這一列里，有多個gene的話，多個gene是一個字符串，需要自己拆開，添加到一個vector里
CD4+ T cell可能包含Activated CD4+ T cell和Effector CD4+ T cell，可以把它們對應的gene都合并成一個vector

t_cell_marker <- t_cell_marker[t_cell_marker$Species=='Human',]
b_cell_marker <- b_cell_marker[b_cell_marker$Species=='Human',]
macrophage_marker <- macrophage_marker[macrophage_marker$Species=='Human',]

# T cell 挑了4種進行熱圖展示
cytotoxic <- unique(t_cell_marker$Cell.Type)[str_detect(str_to_lower(unique(t_cell_marker$Cell.Type)),'cytotoxic')]
naive <- unique(t_cell_marker$Cell.Type)[str_detect(str_to_lower(unique(t_cell_marker$Cell.Type)),'naive')]
regulatory <- unique(t_cell_marker$Cell.Type)[str_detect(str_to_lower(unique(t_cell_marker$Cell.Type)),'regulatory')]
exhausted <- unique(t_cell_marker$Cell.Type)[str_detect(str_to_lower(unique(t_cell_marker$Cell.Type)),'exhausted')]
t_cell_df <- t_cell_marker[t_cell_marker$Cell.Type %in% c(cytotoxic,naive,regulatory,exhausted),]
t_cell_df[t_cell_df$Cell.Type %in% naive,"Cell.Type"] <- 'naive'
t_cell_df[t_cell_df$Cell.Type %in% cytotoxic,"Cell.Type"] <- 'cytotoxic'
t_cell_df[t_cell_df$Cell.Type %in% regulatory,"Cell.Type"] <- 'regulatory'
t_cell_df[t_cell_df$Cell.Type %in% exhausted,"Cell.Type"] <- 'exhausted'
t_cell_list <- split(t_cell_df$Cell.Marker,t_cell_df$Cell.Type)

# B cell 挑了5種進行熱圖展示
unique(b_cell_marker$Cell.Type)
b_plasma <- unique(b_cell_marker$Cell.Type)[str_detect(str_to_lower(unique(b_cell_marker$Cell.Type)),'plasma')]
b_memory <- unique(b_cell_marker$Cell.Type)[str_detect(str_to_lower(unique(b_cell_marker$Cell.Type)),'memory')]
b_naive <- unique(b_cell_marker$Cell.Type)[str_detect(str_to_lower(unique(b_cell_marker$Cell.Type)),'naive')]
b_activated <- unique(b_cell_marker$Cell.Type)[str_detect(str_to_lower(unique(b_cell_marker$Cell.Type)),'activated')]
b_transitional <- unique(b_cell_marker$Cell.Type)[str_detect(str_to_lower(unique(b_cell_marker$Cell.Type)),'transitional')]

b_cell_df <- b_cell_marker[b_cell_marker$Cell.Type %in% c(b_plasma,b_memory,b_naive,b_activated,b_transitional),]
b_cell_df[b_cell_df$Cell.Type %in% b_plasma,"Cell.Type"] <- 'plasma'
b_cell_df[b_cell_df$Cell.Type %in% b_memory,"Cell.Type"] <- 'memory'
b_cell_df[b_cell_df$Cell.Type %in% b_naive,"Cell.Type"] <- 'naive'
b_cell_df[b_cell_df$Cell.Type %in% b_activated,"Cell.Type"] <- 'activated'
b_cell_df[b_cell_df$Cell.Type %in% b_transitional,"Cell.Type"] <- 'transitional'
b_cell_list <- split(b_cell_df$Cell.Marker,b_cell_df$Cell.Type)

# Macrophage cell
unique(macrophage_marker$Cell.Type)
M1 <- unique(macrophage_marker$Cell.Type)[str_detect(str_to_lower(unique(macrophage_marker$Cell.Type)),'m1')]
M2 <- unique(macrophage_marker$Cell.Type)[str_detect(str_to_lower(unique(macrophage_marker$Cell.Type)),'m2')]
macrophage_df <- macrophage_marker[macrophage_marker$Cell.Type %in% c(M1,M2),]
macrophage_list <- split(macrophage_df$Cell.Marker,macrophage_df$Cell.Type)

# 到這一步，gene list里的每一項，是一個字符串（gene symbol組成的）組成的vector，我寫個函數(shù)，把每個字符串里的gene symbol拆出來，
# 最后組合成一個由gene symbol組成的vector，然后去個重
myfun <- function(x){
  res <- c()
  for (i in x){
    temp <- unlist(str_split(gsub(' ','',i),','))
    res <- c(res,temp)
  }
  unique(res)
}

t_cell_list <- lapply(t_cell_list, function(x){myfun(x)})
b_cell_list <- lapply(b_cell_list, function(x){myfun(x)})
macrophage_list <- lapply(macrophage_list, function(x){myfun(x)})

# 找到三種細胞分別對應的表達矩陣
t_cell <- colnames(sce[,sce@meta.data$cell_type=='T cells'])
b_cell <- colnames(sce[,sce@meta.data$cell_type=='B cells'])
macrophage_cell <- colnames(sce[,sce@meta.data$cell_type=='myeloid cells'])

t_cell_exp <- sce@assays$SCT@scale.data[,t_cell]
b_cell_exp <- sce@assays$SCT@scale.data[,b_cell]
macrophage_cell_exp <- sce@assays$SCT@scale.data[,macrophage_cell]

t_cell_gsva_res <- gsva(expr = t_cell_exp,
                           gset.idx.list = t_cell_list,
                           method = 'ssgsea',
                           kcdf='Poisson',
                           parallel.sz = parallel::detectCores())
pheatmap(t_cell_gsva_res,fontsize_row = 8,
         height = 10,
         width=18,
         annotation_col = data.frame(row.names = colnames(sce),group=sce@meta.data$orig.ident),
         show_colnames = F,show_rownames = T,cluster_cols = F)

b_cell_gsva_res <- gsva(expr = b_cell_exp,
                        gset.idx.list = b_cell_list,
                        method = 'ssgsea',
                        kcdf='Poisson',
                        parallel.sz = parallel::detectCores())

pheatmap(b_cell_gsva_res,fontsize_row = 8,
         height = 10,
         width=18,
         annotation_col = data.frame(row.names = colnames(sce),group=sce@meta.data$orig.ident),
         show_colnames = F,show_rownames = T,cluster_cols = F)

macrophage_cell_gsva_res <- gsva(expr = macrophage_cell_exp,
                        gset.idx.list = macrophage_list,
                        method = 'ssgsea',
                        kcdf='Poisson',
                        parallel.sz = parallel::detectCores())

pheatmap(macrophage_cell_gsva_res,fontsize_row = 8,
         height = 10,
         width=18,
         annotation_col = data.frame(row.names = colnames(sce),group=sce@meta.data$orig.ident),
         show_colnames = F,show_rownames = T,cluster_cols = F)

3. 對TCGA數(shù)據(jù)進行免疫浸潤分析，發(fā)現(xiàn)其中一種細胞的比例與生存密切相關(guān)

首先下載TCGA數(shù)據(jù)

在Xena的datasets找到OV相關(guān)的數(shù)據(jù)集，點進去后下載了FPKM數(shù)據(jù)

把行名從emsembl改成symbol之后，就可以分析了

#加載需要的R包
library(TCGAbiolinks)
library(WGCNA)
library(stringr)
library(SummarizedExperiment)
library(clusterProfiler)
library(org.Hs.eg.db)
library(tibble)
library(CIBERSORT)
library(tidyr)
library(ggplot2)
library(RColorBrewer)
library(readxl)
library(survival)
library(survminer)
library(WGCNA)
library(data.table)
library(dplyr)
library(IMvigor210CoreBiologies)
library(e1071)
library(pROC)
library(NMF)
library(pheatmap)
library(clusterProfiler)
library(ggsignif)
library(limma)
library(org.Hs.eg.db)
library(glmnet)
library(tibble)
library(ggrisk)

rm(list = ls())
gc()exp <- fread('TCGA-OV.htseq_fpkm.tsv',header = T)
exp <- column_to_rownames(exp,var = 'Ensembl_ID')

# 行名轉(zhuǎn)換
exp <- as.data.frame(exp)
exp$ensembl_id <- str_split(rownames(exp),'\\.',simplify = T)[,1]
id_df <- bitr(exp$ensembl_id,fromType = 'ENSEMBL',toType = 'SYMBOL',OrgDb = org.Hs.eg.db)
id_df <- id_df[!duplicated(id_df$ENSEMBL),]
index <- match(exp$ensembl_id,id_df$ENSEMBL)
# 對symbol id這一列去重去空，設為行名
exp$symbol_id <- id_df[index,'SYMBOL']
exp <- exp[!is.na(exp$symbol_id),]
exp <- exp[!duplicated(exp$symbol_id),]
rownames(exp) <- exp$symbol_id
exp <- exp[,1:(ncol(exp)-2)]

save(exp,file = 'TCGA_OV.Rdata')

免疫浸潤

# 免疫浸潤
load('TCGA_OV.Rdata')
lm22f <- system.file('extdata','LM22.txt',package = 'CIBERSORT')
TME.result <- cibersort(lm22f,as.matrix(exp),perm = 100,QN=T)
TME.result <- as.data.frame(TME.result)
save(result,file = 'TME_result.Rdata')

group <- ifelse(as.numeric(substr(colnames(exp),14,15))<10,'tumor','normal')
patient_exp <- exp[,which(group=='tumor')]
result <- TME.result[,-c(23:25)] %>% as.data.frame() %>% 
  rownames_to_column('Sample') %>% gather(key = cell_type,value = proportion,-Sample)

my_pallet <- colorRampPalette(brewer.pal(8,'Set1'))
ggplot(result,aes(Sample,proportion,fill=cell_type))+geom_bar(stat = 'identity')+
  scale_fill_manual(values = my_pallet(22))+
  labs(x='',y = "Estiamted Proportion",fill='cell_type')+theme_bw()+
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        legend.position = "bottom")

cibersort計算了每種免疫細胞在每個樣本中的豐度值，接下來要看看M1和M2這兩種細胞對生存的影響

首先下載臨床信息數(shù)據(jù)，從里面找到OV相關(guān)的臨床信息即可

文中作者應該是分析了M1和M2兩種細胞，然后發(fā)現(xiàn)只有M1與生存信息顯著相關(guān)

# KM-plot
load('TCGA_OV.Rdata')
load('TME_result.Rdata')
# 準備生存數(shù)據(jù)，包含OS,OS.time,DFI,DFI.time這些信息
# https://gdc./about-data/publications/pancanatlas
sur_data <- read_xlsx('TCGA-CDR-SupplementalTableS1.xlsx',sheet = 1)
sur_data <- column_to_rownames(sur_data,var = '...1')
sur_data <- sur_data[sur_data$type=='OV',c('bcr_patient_barcode','OS','OS.time','DFI','DFI.time')]
rownames(sur_data) <- sur_data$bcr_patient_barcode
save(sur_data,file = 'survival_data.Rdata')
# 建立sample和patient對應關(guān)系，把cibersort結(jié)果中Macrophages M1在每個樣本中的信息轉(zhuǎn)移過來
sample_patient_df <- data.frame(sample=colnames(exp),patient=substr(colnames(exp),1,12))
sample_patient_df$TAM_M1 <- TME.result$`Macrophages M1`
sample_patient_df$TAM_M1_type <- ifelse(sample_patient_df$TAM_M1>median(sample_patient_df$TAM_M1),'high','low')
sample_patient_df <- sample_patient_df[!duplicated(sample_patient_df$patient),]
# 下載的生存數(shù)據(jù)的臨床信息中patient id 與 exp中patient id 取交集
sur_data <- sur_data[intersect(sample_patient_df$patient,sur_data$bcr_patient_barcode),]
# 把sample_patient_df中Macrophages M1的信息對應到sur_data中每個patient id上
index <- match(sur_data$bcr_patient_barcode,sample_patient_df$patient)
sur_data$TAM_M1 <- sample_patient_df[index,'TAM_M1_type']

M1_OS <- survfit(Surv(OS.time,OS)~TAM_M1,data = sur_data)
ggsurvplot(M1_OS,pval = T,risk.table = T,surv.median.line = 'hv',
           title='Overall survival',xlab='Days')
M1_DFI <- survfit(Surv(DFI.time,DFI)~TAM_M1,data = sur_data)
ggsurvplot(M1_DFI,pval = T,risk.table = T,surv.median.line = 'hv',
           title='Disease Free Interval',xlab='Days')
save(sur_data,file = 'sur_data.Rdata')

生存信息中的行名是patient這一列，TCGA表達矩陣exp中列名是sample這一列，這兩列不是一一對應的，一個patient可能對應多個sample?？梢钥紤]直接去重或者手動刪除一個病人的多個樣本，這樣一個病人只對應一個樣本，同樣也只有一個TAM_M1的豐度信息，就可以進行后續(xù)分析了

這里我根據(jù)TAM_M1的中位數(shù)，把樣本分為了M1高表達和低表達

最后的sur_data長這樣

4. 用WGCNA分析找到與該細胞相關(guān)的hub gene

# WGCNA
load('TCGA_OV.Rdata')
load('TME_result.Rdata')
load('sur_data.Rdata')
exp <- as.data.frame(exp)

# 過濾異常值
exp <- exp[rowMads(as.matrix(exp))>0.01,]
dim(exp)

# 查找并刪除異常值
temp <- exp
colnames(temp) <- 1:ncol(exp)
plot(hclust(dist(t(temp)))) # 無異常樣本

# 構(gòu)建基因共表達網(wǎng)絡
exp <- as.data.frame(t(exp))

# 選擇合適的power
sft <- pickSoftThreshold(exp,networkType = 'signed')

a <- ggplot(sft$fitIndices,aes(x=Power,y=-sign(slope)*SFT.R.sq))+
  geom_text(label=sft$fitIndices$Power,color='red')+
  geom_hline(yintercept = 0.9,color='red')+
  xlab("Soft Threshold (power)")+ylab("Scale Free Topology Model Fit,signed R^2")+
  ggtitle("Scale independence")+theme(plot.title = element_text(hjust = 0.5))

b <- ggplot(sft$fitIndices,aes(x=Power,y=mean.k.))+
  geom_text(label=sft$fitIndices$Power,color='red')+
  xlab("Soft Threshold (power)")+ylab("Mean Connectivity")+
  ggtitle('Mean connectivity')+theme(plot.title = element_text(hjust = 0.5))

a+b
power <- sft$powerEstimate
power <- 5

# 檢驗是否符合無尺度網(wǎng)絡
k <- softConnectivity(exp,power = power)
scaleFreePlot(k)

# 構(gòu)建鄰接矩陣
adj <- adjacency(exp,power = power,type = 'signed')

# 計算TOM矩陣
TOM <- TOMsimilarity(adj,TOMType = 'signed')
save(TOM,file = 'TOM.Rdata')
load('TOM.Rdata')
# 計算gene間相異性
dissTOM <- 1-TOM

# 對gene進行聚類和可視化
geneTree <- hclust(as.dist(dissTOM),method = 'average')
sizeGrWindow(12,9)
plot(geneTree,labels = F,hang=0.04,xlab='',sub='')

# 將聚類樹進行修剪，把gene歸到不同模塊里
dynamic_module <- cutreeDynamic(dendro = geneTree,distM = dissTOM,minClusterSize = 50,pamRespectsDendro = F)
table(dynamic_module)
module_color <- labels2colors(dynamic_module)
plotDendroAndColors(dendro = geneTree,colors = module_color,dendroLabels = F,hang=0.03,addGuide = T,
                    guideHang = 0.05,groupLabels='Dynamic color tree',
                    main='gene dendrogram and module colors')

隨著power的增加，gene之間的連通性降低，取log后成線性關(guān)系，符合無尺度網(wǎng)絡分布

這里分出來20多個模塊，太多了，后面進行合并

# 隨機選擇400個基因畫拓撲重疊熱圖
set.seed(10)
select = sample(5000, size = 400)
selectTOM = dissTOM[select, select]
selectTree = hclust(as.dist(selectTOM), method = "average") 
selectColors = module_color[select]

sizeGrWindow(9,9) 
plotDiss = selectTOM^power
diag(plotDiss) = NA
TOMplot(plotDiss, 
        selectTree, 
        selectColors, 
        main = "Network heatmap plot, selected genes")

默認畫出來的是第一張圖，如果把plotDiss改成1-plotDiss，可以得到第二張圖，顏色越深表示gene相關(guān)性越強。可以看到相關(guān)性較高的gene都是在同一個模塊內(nèi)的

默認結(jié)果圖

改掉參數(shù)后

# 計算每個模塊特征gene向量
MEList <- moduleEigengenes(exp,dynamic_module)
MEs <- MEList$eigengenes

# 計算特征基因的相異度，基于相異度對原有模塊進行層次聚類
dissME <- 1-cor(MEs)
MEtree <- hclust(as.dist(dissME),method = 'average')

# 對模塊進行合并
cut_height <- 0.6
sizeGrWindow(12,8)
plot(MEtree)
abline(h=cut_height,col='red')

merge_module <- mergeCloseModules(exprData = exp,colors=module_color,cutHeight = cut_height)
merged_color <- merge_module$colors
# merged_MEs每一行是一個病人ID，每一列是一個模塊的特征gene向量
merged_MEs <- merge_module$newMEs

sizeGrWindow(12,8)
plotDendroAndColors(dendro = geneTree,colors = cbind(module_color,merged_color),dendroLabels = F,hang=0.03,addGuide = T,
                    guideHang = 0.05,groupLabels='Dynamic color tree',
                    main='gene dendrogram and module colors')
table(merged_color)

將height在0.6以下的模塊進行合并

合并模塊后，模塊數(shù)量減少到13個

# 計算模塊與TAM_M1，TAM_M2間的相關(guān)性
trait <- data.frame(row.names = rownames(exp),ID = substr(rownames(exp),1,12),
                    M1 = TME.result$`Macrophages M1`,M2 = TME.result$`Macrophages M2`)
index <- match(trait$ID,sur_data$bcr_patient_barcode)
trait$os <- sur_data[index,'OS']


module_trait_cor <- cor(merged_MEs,trait[,2:3],use = 'p')
module_trait_pvalue <- corPvalueStudent(module_trait_cor,ncol(exp))
trait_colors <- numbers2colors(trait[,2:3],signed = T,centered = T)
a <- paste(signif(module_trait_cor, 2), "\n(", signif(module_trait_pvalue, 1), ")", sep = "")
sizeGrWindow(8,6)
labeledHeatmap(module_trait_cor,xLabels = colnames(trait[,2:3]),yLabels = colnames(merged_MEs),colorLabels = FALSE, 
               colors = blueWhiteRed(50),textMatrix = a,
               setStdMargins = FALSE,cex.text = 0.65,zlim = c(-1,1), 
               main = paste("Module-trait relationships"))

藍色模塊和M1的相關(guān)性最高

# 找到M1相關(guān)的關(guān)鍵模塊
module <- 'MEblue'
module_membership <- signedKME(exp,merged_MEs)
# 計算表達矩陣exp和OS之間相關(guān)性
gene_os_sig <- apply(exp,2,function(x){cor(x,trait$os)})

verboseScatterplot(abs(module_membership$kMEblue),
                   abs(gene_os_sig),
                   xlab = paste("Module Membership in", module, "module"),
                   ylab = "Gene significance for OS",
                   main = paste("Module membership vs. gene significance\n"),
                   cex.main = 1.2, cex.lab = 1.2, cex.axis = 1.2)

# 找出MEblue模塊內(nèi)所有基因
module_membership <- module_membership[!is.na(module_membership$kMEblue),]
# 篩選出MM>0.5 和 exp-OS相關(guān)性>0.1 的作為hub gene
MM_gene <- rownames(module_membership[abs(module_membership$kMEblue)>0.5,])
OS_sig_gene <- names(gene_os_sig[abs(gene_os_sig)>0.1])
hub_gene <- intersect(MM_gene,OS_sig_gene)

save(hub_gene,file = 'WGCNA_hub_gene.Rdata')

5. 下載imvigor 210隊列數(shù)據(jù)，用SVM模型預測免疫表型

# SVM預測免疫類型
# DEseq安裝 http:///packages/3.8/bioc/html/DESeq.html
# IMvigor210CoreBiologies安裝：http://research-pub./IMvigor210CoreBiologies/packageVersions/
data(cds)
expreSet <- as.data.frame(counts(cds))
annoData <- fData(cds)
phenoData <- pData(cds)

這里需要安裝兩個包，e1071和IMvigor210CoreBiologies，后者比較難安，需要手動下載安裝包到本地，然后再進行安裝，然后就可以得到數(shù)據(jù)了

# 清洗數(shù)據(jù)
expreSet <- expreSet[rowSums(expreSet>0)>3,]

phenoData$`Immune phenotype` <- as.character(phenoData$`Immune phenotype`)
phenoData <- phenoData[!is.na(phenoData$`Immune phenotype`),]
phenoData$`Immune phenotype` <- as.factor(phenoData$`Immune phenotype`)

expreSet <- expreSet[,rownames(phenoData)]
# count轉(zhuǎn)TPM
index <- match(rownames(expreSet),annoData$entrez_id)
expreSet$gene_length <- annoData[index,'length']
kb <- expreSet$gene_length/1000
rpk <- expreSet[,-ncol(expreSet)] / kb
tpm <- t(t(rpk) / colSums(rpk) * 1E6)
exp <- tpm

# 取log
exp <- log2(exp+1)
exp <- as.data.frame(exp)

用SVM建模的大體思路是這樣的:

首先把數(shù)據(jù)分成訓練集和測試集
訓練集包含表達矩陣和每個樣本對應的標簽（免疫類型），以此構(gòu)建svm模型
把測試集的表達矩陣帶入到svm模型中，得到預測的結(jié)果
用測試集的真實標簽和預測的結(jié)果即可畫roc曲線

注意這里我把測試集的標簽由factor改為了numeric，這樣可以得到每個標簽預測的概率，畫出來的roc曲線點很多，如果不改，畫出來的roc曲線點會很少，可能只有3個點

# 劃分訓練集和測試集
test_size <- 0.2
test_sample <- sample(colnames(exp),size = as.integer(ncol(exp)*test_size))
train_sample <- setdiff(colnames(exp),test_sample)
# 訓練集和測試集的表達矩陣
test_exp <- as.data.frame(t(exp[,test_sample]))
train_exp <- as.data.frame(t(exp[,train_sample]))
# 訓練集和測試集的標簽  
test_label <- phenoData[test_sample,"Immune phenotype"]
train_label <- phenoData[train_sample,"Immune phenotype"]
# 訓練集標簽轉(zhuǎn)換為二分類
train_label_desert <- as.factor(ifelse(train_label=='desert','desert','not desert'))
train_label_inflamed <- as.factor(ifelse(train_label=='inflamed','inflamed','not inflamed'))
train_label_excluded <- as.factor(ifelse(train_label=='excluded','excluded','not excluded'))
# 測試集標簽轉(zhuǎn)換為二分類
test_label_desert <- as.factor(ifelse(test_label=='desert','desert','not desert'))
test_label_inflamed <- as.factor(ifelse(test_label=='inflamed','inflamed','not inflamed'))
test_label_excluded <- as.factor(ifelse(test_label=='excluded','excluded','not excluded'))
# 測試集標簽轉(zhuǎn)換為數(shù)字
test_label_desert <- ifelse(test_label=='desert',1,0)
test_label_inflamed <- ifelse(test_label=='inflamed',1,0)
test_label_excluded <- ifelse(test_label=='excluded',1,0)

# 訓練模型
desert_model <- svm(x = train_exp, 
                    y = train_label_desert,
                    type = 'C',kernel = 'radial',probability = T)
inflamed_model <- svm(x = train_exp, 
                    y = train_label_inflamed,
                    type = 'C',kernel = 'radial',probability = T)
excluded_model <- svm(x = train_exp, 
                    y = train_label_excluded,
                    type = 'C',kernel = 'radial',probability = T)
# 進行預測
test_predict_desert <- predict(desert_model,test_exp,prob = T)
# 得出預測結(jié)果的概率
desert_prob <- attr(test_predict_desert,'probabilities')[,2]
test_predict_inflamed <- predict(inflamed_model,test_exp)
inflamed_prob <- attr(test_predict_desert,'probabilities')[,2]
test_predict_excluded <- predict(excluded_model,test_exp)
excluded_prob <- attr(test_predict_desert,'probabilities')[,2]

desert_roc_obj <- roc(response = as.numeric(test_label_desert),
                      predictor = as.numeric(desert_prob),na.rm=T)
desert_auc <- round(auc(test_label_desert,desert_prob),2)
inflamed_roc_obj <- roc(response = as.numeric(test_label_inflamed),
                        predictor = as.numeric(inflamed_prob),na.rm=T)
inflamed_auc <- round(auc(test_label_inflamed,inflamed_prob),2)
excluded_roc_obj <- roc(response = as.numeric(test_label_excluded),
                        predictor = as.numeric(excluded_prob),na.rm=T)
excluded_auc <- round(auc(test_label_excluded,excluded_prob),2)

desert <- ggroc(desert_roc_obj)
inflamed <- ggroc(inflamed_roc_obj)
excluded <- ggroc(excluded_roc_obj)

roc_data <- list(desert_roc_obj,inflamed_roc_obj,excluded_roc_obj)
names(roc_data) <- c('desert','inflamed','excluded')

## sensitivity(敏感性): 也稱recall或TPR(真陽性率，實際為正例并且模型預測為正例 占 所有所有正例 的比例)，越接近1越好
## specificity(特異性)：也稱TNR(真陰性率，實際為負例并且模型預測為負例 占 所有所有負例 的比例)，越接近1越好
ggroc(roc_data)+geom_segment(aes(x = 1, xend = 0, y = 0, yend = 1), color="red",linetype=6)+
  annotate('text',x=c(0.12,0.12,0.12),y=c(0.1,0.06,0.02),label=c(paste0('desert_auc: ',desert_auc),
                                                              paste0('inflamed_auc: ',inflamed_auc),
                                                              paste0('excluded_auc: ',excluded_auc))

6. 用NMF把TCGA樣本分為兩個亞組，該細胞在這兩個亞組的比例有顯著差異

load('TCGA_OV.Rdata')
load('WGCNA_hub_gene.Rdata')

hubgene_exp <- exp[hub_gene,]

estimate <- nmf(hubgene_exp,rank = 2:10,method = 'brunet')
save(estimate,file = 'estimate.Rdata')
plot(estimate)

rank <- 6
nmf.rank <- nmf(hubgene_exp,rank = rank,method = 'brunet')
group <- predict(nmf.rank)
index <- extractFeatures(nmf.rank,'max')
index <- unlist(index)

nmf.exp <- hubgene_exp[index,]
nmf.exp <- na.omit(nmf.exp)

jco <- c("#2874C5","#EABF00","#C6524A","#868686")
consensusmap(nmf.rank,labRow=NA,labCol=NA,
             annCol = data.frame("cluster"=group[colnames(nmf.exp)]))

cophenetic值在6-7之間變化最大，因此rank選6，就會把379個樣本分為6類

anno_col <- data.frame(sample = colnames(nmf.exp),group = group)
anno_col <- anno_col[order(anno_col$group),]
pheatmap(nmf.exp[,rownames(anno_col)],show_colnames = F,annotation_col = select(anno_col,group),
         cluster_cols = F)

這里把annotation_col調(diào)整了順序，這樣每一個類的樣本就在一起，如果按默認的就會非?；靵y

這里annotation_col是一個一列的dataframe，行名是樣本名，唯一的一列是樣本分組信息

# 挑出兩個組分析
group12 <- group[group==2 | group==1]
group12 <- sort(group12)
exp_temp <- nmf.exp[,names(group12)]
pheatmap(exp_temp,show_colnames = F,annotation_col = data.frame(row.names = colnames(exp_temp),group = group12),
         cluster_cols = F)

# KM-plot
load('survival_data.Rdata')
load('TME_result.Rdata')
temp <- data.frame(row.names = colnames(exp_temp),bcr_patient_barcode = substr(colnames(exp_temp),1,12),
                   group=group12)
temp <- left_join(temp,sur_data,by='bcr_patient_barcode')
group_OS <- survfit(Surv(OS.time,OS)~group,data = temp)
ggsurvplot(group_OS,pval = T,risk.table = T,surv.median.line = 'hv',
           title='Overall survival',xlab='Days')

# M1細胞的小提琴圖
vln_df <- data.frame(row.names = names(group12),M1=TME.result[names(group12),'Macrophages M1'],group = group12)
ggplot(vln_df,aes(x=group,y=M1))+geom_violin(aes(fill=group))+geom_boxplot(width=0.2)+
    geom_signif(comparisons = list(c('1','2')),map_signif_level = F)

# KEGG富集
kegg_exp <- exp[,names(group12)]
design <- model.matrix(~factor(as.character(group12),levels = c('1','2')))
fit <- lmFit(kegg_exp,design=design)
fit <- eBayes(fit)
deg <- topTable(fit,coef = 2,number = Inf)
deg$change <- ifelse((deg$logFC>log2(2))&(deg$adj.P.Val<0.05),'up',
                     ifelse((deg$logFC<log2(0.5))&(deg$adj.P.Val<0.05),'down','nochange'))
save(deg,file = 'TCGA-deg.Rdata')
up_1_id <- bitr(rownames(deg[deg$change=='up',]),fromType = 'SYMBOL',toType = 'ENTREZID',OrgDb = org.Hs.eg.db)
kegg <- enrichKEGG(up_1_id$ENTREZID,organism = 'hsa',keyType = 'ncbi-geneid')
dotplot(kegg)

up_2_id <- bitr(rownames(deg[deg$change=='down',]),fromType = 'SYMBOL',toType = 'ENTREZID',OrgDb = org.Hs.eg.db)
kegg <- enrichKEGG(up_2_id$ENTREZID,organism = 'hsa',keyType = 'ncbi-geneid')
dotplot(kegg)

7. 批量單因素cox回歸找出hub gene中與生存相關(guān)的gene

load('TCGA_OV.Rdata')
load('survival_data.Rdata')
load('TCGA-deg.Rdata')
# 數(shù)據(jù)處理
deg_gene <- rownames(deg[deg$change!='nochange',])
exp <- as.data.frame(t(exp))
exp <- exp[,deg_gene]
exp$ID <- substr(rownames(exp),1,12)
exp <- exp[!duplicated(exp$ID),]
index <- match(rownames(sur_data),exp$ID)
index <- index[!is.na(index)]

sur_data <- merge(sur_data,exp,by.x='bcr_patient_barcode',by.y='ID')

# 批量單因素COX回歸
gene <- c()
p_value <- c()
HR <- c()
lower95 <- c()
upper95 <- c()
for (i in 6:ncol(sur_data)){
  res <- coxph(Surv(OS.time,OS)~sur_data[,i],data=sur_data)
  sum_res <- summary(res)
  p <- sum_res$coefficients[,'Pr(>|z|)']
  if (p<0.05){
    gene <- c(gene,colnames(sur_data)[i])
    p_value <- c(p_value,p)
    HR <- c(HR,sum_res$conf.int[,'exp(coef)'])
    lower95 <- c(lower95,sum_res$conf.int[,'lower .95'])
    upper95 <- c(upper95,sum_res$conf.int[,'upper .95'])
  }
}
cox_df <- data.frame(row.names = gene,pvalue=p_value,HR=HR,lower.95=lower95,upper.95=upper95)

用批量單因素cox回歸，找到pvalue小于0.05的gene

8. lasso回歸進一步篩選gene

# LASSO進一步篩選gene
sur_data <- column_to_rownames(sur_data,'bcr_patient_barcode')
sur_data <- select(sur_data,c(OS,OS.time,rownames(cox_df)))
sur_data <- sur_data[!is.na(sur_data$OS.time),]
x <- as.matrix(sur_data[,5:ncol(sur_data)])
y <- Surv(sur_data$OS.time,sur_data$OS)
glm.fit <- glmnet(x,y,family = 'cox',alpha = 1)
plot(glm.fit)
cv.fit <- cv.glmnet(x,y,alpha = 1,nfolds = 10,family="cox")
plot(cv.fit)
lambda <- cv.fit$lambda.min

glm.fit <- glmnet(x,y,family = 'cox',alpha = 1,lambda = lambda)
lasso_filter_gene <- names(glm.fit$beta[glm.fit$beta[,1]!=0,1])
coef <- glm.fit$beta[glm.fit$beta[,1]!=0,1]

9. 多因素cox回歸，計算riskScore，riskScore與生存信息的相關(guān)性

# 篩選后的gene用多因素cox回歸建模
sur_data_temp <- select(sur_data,c(OS,OS.time,lasso_filter_gene))
# gene名IGKV4-1，在ggrisk時會報錯，改成.
colnames(sur_data_temp) <- c("OS","OS.time","LAMP3","STAT1","BATF2","CXCL10","CXCL11","IGKV4.1")
cox.res <- coxph(Surv(OS.time,OS)~.,data = sur_data_temp)
riskScore <- predict(cox.res,type = 'risk',newdata=sur_data_temp)
riskScore <- as.data.frame(riskScore)
riskScore$ID <- rownames(riskScore)
riskScore$risk <- ifelse(riskScore$riskScore>median(riskScore$riskScore),'high','low')
riskScore$OS <- sur_data$OS
riskScore$OS.time <- sur_data$OS.time

ggrisk(cox.res)

# riskscore做生存分析
surv_fit <- survfit(Surv(OS.time,OS)~risk,data = riskScore)
ggsurvplot(surv_fit,data = riskScore,pval = T,risk.table = T,surv.median.line = 'hv',
           legend.title = 'RiskScore',title = 'Overall survival',
           ylab='Cummulative survival',xlab='Time(Days)')