Limma分析还能这么用?2步教你用limma分析找出最佳的靶向基因!
公众号后台回复“111”
领取本篇代码、基因集或示例数据等文件
文件编号:240115
需要租赁服务器的小伙伴可以扫码添加小果,此外小果还提供生信分析,思路设计,文献复现等,有需要的小伙伴欢迎来撩~
##### 导入R包 #####
library(limma)
library(splines)
library(splines2)
ns<-naturalSpline
library(scales)
library(BinfTools)
##### 构建功能函数 #####
##### 根据给定的细胞系,构建一个线性模型矩阵,用于后续的差异分析 #####
makemodel<-function(x, colname, cell.lines){
cell.lines<-sapply(strsplit(cell.lines, split="_", fixed=T),'[[', 1)
if(ncol(x)==1){
olCells<-rownames(x)[which(rownames(x) %in% cell.lines)]
x<-as.data.frame(x[olCells,])
rownames(x)<-olCells
colnames(x)<-colname
} else {
x<-x[which(rownames(x) %in% cell.lines),]
}
sm<-NULL
if(ncol(x)==1){
sm<-ns(x[,1])
} else {
sm<-ns(x[,which(colnames(x) %in% colname)])
}
dm<-model.matrix(~sm)
return(dm)
}
##### 根据给定的细胞系和表达矩阵,返回符合条件的细胞系名称 #####
olCells<-function(cell.lines, x){
tmp<-sapply(strsplit(cell.lines, split="_", fixed=T),'[[',1)
#return(tmp)
return(cell.lines[which(tmp %in% rownames(x))])
}
##### 根据给定的两个GSEA结果,进行差异分析和可视化,比较不同物种或条件下的基因集富集情况 #####
compGSEA<-function(x, y, title="Comparing GSEA results", xlab="Human", ylab="Mouse", p=1){
x<-subset(x, padj <p)< span>
</p)<>
y<-subset(y, padj <p)< span>
</p)<>
x<-x[which(x$pathway %in% y$pathway),]
y<-y[which(y$pathway %in% x$pathway),]
x<-x[order(x$pathway),]
y<-y[order(y$pathway),]
if(!all.equal(x$pathway, y$pathway)){
stop("Check again...")
}
dat<-data.frame(row.names=x$pathway,
x=x$NES,
y=y$NES)
dat$Direction<-ifelse(dat$x*dat$y>0, "Positive", "Negative")
dat$Trend<-ifelse(dat$x*dat$y>0,abs(dat$x-dat$y), abs(dat$x-dat$y)*-1)
#print(head(dat))
g<-ggpubr::ggscatter(dat, x="x", y="y", add="reg.line", conf.int=T, color="Trend",
#palette=c(Negative="blue",Positive="red"),
cor.coef=T, add.params=list(color="purple",fill="purple"),
xlab=paste(xlab,"NES"), ylab=paste(ylab,"NES")) +
ggpubr::gradient_color(c("blue","white","red"))
print(g)
return(dat)
}
#### 提取基因的符号名称 ####
putSym<-function(res){
res$hgnc_symbol<-sapply(strsplit(rownames(res), split=".", fixed=T),'[[',1)
return(res)
}
#### 规范化 ####
fixRows<-function(x, obtype=c("res","counts"), col=NULL){
if(obtype[1]=="res"){
x<-x[order(x$baseMean, decreasing=T),]
}
if(obtype[1]=="counts"){
x<-x[order(rowMeans(x), decreasing=T),]
}
genes<-c()
if(is.null(col)){
genes<-sapply(strsplit(rownames(x), ".",T),'[[',1)
} else {
genes<-x[,which(colnames(x) %in% col)]
}
dups<-unique(genes[which(duplicated(genes))])
to_remove <- c()
for (i in 1:length(dups)) {
ind <- which(genes %in% dups[i], arr.ind = T)
to_remove <- c(to_remove, ind[-1])
}
x <- x[-to_remove, ]
genes<-genes[-to_remove]
rownames(x)<-genes
return(x)
}
##### 导入数据 #####
#读取CCLE(Cancer Cell Line Encyclopedia)的基因表达数据和细胞系信息数据
CCLE<-read.csv("/data/CCLE_expression.csv")
#区分每个细胞系
info<-read.csv("/data/sample_info.csv")
#通过 DepMap ID 和 CCLE 名称匹配细胞系标识符
CCLE<-merge(CCLE, info[,c(1,4)], by.x="X", by.y="DepMap_ID")
rownames(CCLE)<-CCLE$CCLE_Name
CCLE<-CCLE[,-(which(colnames(CCLE) %in% c("X","DepMap_ID","CCLE_Name")))]
CCLE<-t(CCLE)
#读入细胞系注释数据(识别 SCLC 细胞系——小细胞肺癌)
x<-read.delim("/data/Cell_lines_annotations_20181226.txt")
SCLC<-CCLE[,which(colnames(CCLE) %in% x[which(x$type_refined %in% "lung_small_cell"),]$CCLE_ID)]
#识别SCLC的子类
ATOH1<-c("CORL24_LUNG","HCC33_LUNG")
POU2F3<-c("NCIH1048_LUNG", "CORL311_LUNG","NCIH211_LUNG","NCIH526_LUNG")
YAP1<-c("SW1271_LUNG","NCIH1341_LUNG","DMS114_LUNG", "NCIH196_LUNG", "NCIH1339_LUNG", "SBC5_LUNG","NCIH2286_LUNG","NCIH841_LUNG")
YAP1_2<-c("SW1271_LUNG","NCIH1341_LUNG","DMS114_LUNG", "NCIH1339_LUNG", "SBC5_LUNG","NCIH2286_LUNG","NCIH841_LUNG") #YAP1_2 removes outlier sample NCIH196_LUNG
NEUROD1<-c("NCIH2066_LUNG","CORL279_LUNG","NCIH1694_LUNG","NCIH524_LUNG","DMS273_LUNG","NCIH446_LUNG","NCIH2171_LUNG","NCIH82_LUNG","SCLC21H_LUNG")
ASCL1<-colnames(SCLC)
ASCL1<-ASCL1[which(!ASCL1 %in% ATOH1)]
ASCL1<-ASCL1[which(!ASCL1 %in% POU2F3)]
ASCL1<-ASCL1[which(!ASCL1 %in% YAP1)]
ASCL1<-ASCL1[which(!ASCL1 %in% NEUROD1)]
typing<-data.frame(Cell.line=c(ATOH1,POU2F3,YAP1,NEUROD1,ASCL1),
type=c(rep("ATOH1", length(ATOH1)), rep("POU2F3", length(POU2F3)),
rep("YAP1", length(YAP1)), rep("NEUROD1", length(NEUROD1)),
rep("ASCL1", length(ASCL1))))
#加载 TAK243 药物反应数据
x<-read.csv("/data/TAK_final_results.csv", row.names=1)
#有些细胞系我们有 TAK 数据,但在数据集中没有表达数据,所以仅保留同时具有这两种数据的细胞系
x<-as.data.frame(x[which(rownames(x) %in% sapply(strsplit(colnames(CCLE), split="_", fixed=T),'[[',1)),])
CCLE<-CCLE[,which(sapply(strsplit(colnames(CCLE),"_",T),'[[',1) %in% rownames(x))]
#将药物反应数据中细胞系的顺序与表达数据匹配以进行表达建模
x<-x[match(sapply(strsplit(colnames(CCLE), split="_", T),'[[',1), rownames(x)),]
x<-x[complete.cases(x),]
#检查所有细胞系的顺序是否相同
all.equal(sapply(strsplit(colnames(CCLE), "_", T),'[[',1), rownames(x))
##### Limma 分析 #####
#为 EC50 值创建spline模型
nsEC50<-ns(x$EC50)
#为 Limma 制作设计模型矩阵
dm_EC50<-model.matrix(~nsEC50) #Model for all cell lines
#使用自定义函数为子类型组生成设计模型矩阵
dm_EC50_YAP<-makemodel(x, "EC50", YAP1) #All YAP1-high/Triple Negative
dm_EC50_YAP2<-makemodel(x, "EC50", YAP1_2) #All YAP1-high except NCIH196_LUNG (outlier)
dm_EC50_ND1<-makemodel(x, "EC50", NEUROD1) #All NEUROD1-high
dm_EC50_ASCL1<-makemodel(x, "EC50", ASCL1) #All ASCL1-high
#表达矩阵子集仅包含每次分析所需的细胞系
CCLE<-CCLE[,which(colnames(CCLE) %in% olCells(colnames(CCLE), x))] #All cell lines
CCLE2<-CCLE[,which(!colnames(CCLE) %in% "NCIH196_LUNG")] #All cell lines except NCIH196_LUNG (outlier)
dm_EC50_2<-makemodel(x[which(!rownames(x) %in% "NCIH196"),], "EC50", colnames(CCLE2)) #Create design model matrix for all SCLC cell lines except NCIH196_LUNG (outlier)
CCLE_YAP<-CCLE[,which(colnames(CCLE) %in% olCells(YAP1, x))]
CCLE_YAP2<-CCLE[,which(colnames(CCLE) %in% olCells(YAP1_2, x))]
CCLE_ND1<-CCLE[,which(colnames(CCLE) %in% olCells(NEUROD1,x))]
CCLE_ASCL1<-CCLE[,which(colnames(CCLE) %in% olCells(ASCL1,x))]
#使用确定的 SCLC 亚型,使子集数据框仅包含已检测的 SCLC 细胞系
subtyping<-subset(typing, Cell.line %in% colnames(CCLE))
subtyping<-subtyping[match(colnames(CCLE), subtyping$Cell.line),] #Ensure same order of cell lines
all.equal(subtyping$Cell.line, colnames(CCLE)) #Check - should return TRUE
#创建条件向量
subtype<-subtyping$type
subtype2<-subtype[-which(colnames(CCLE) %in% "NCIH196_LUNG")]
#使用 Spearman 相关性检查样本如何关联和聚类
#corHeat(CCLE, hmcol=colPal("Greys"),annodf=data.frame(row.names=subtyping$Cell.line, Subtype=subtyping$type), showTree=T)
#运行 limma 分析 - 拟合线性回归模型
EC50_fit<-lmFit(CCLE, design=dm_EC50)
EC50_fit2<-lmFit(CCLE2, design=dm_EC50_2)
EC50_fit_YAP<-lmFit(CCLE_YAP, design=dm_EC50_YAP)
EC50_fit_YAP2<-lmFit(CCLE_YAP2, design=dm_EC50_YAP2)
EC50_fit_ND1<-lmFit(CCLE_ND1, design=dm_EC50_ND1)
EC50_fit_ASCL1<-lmFit(CCLE_ASCL1, design=dm_EC50_ASCL1)
#差异表达的经验贝叶斯检验
EC50_fit<-eBayes(EC50_fit)
EC50_fit2<-eBayes(EC50_fit2)
EC50_fit_YAP<-eBayes(EC50_fit_YAP)
EC50_fit_YAP2<-eBayes(EC50_fit_YAP2)
EC50_fit_ND1<-eBayes(EC50_fit_ND1)
EC50_fit_ASCL1<-eBayes(EC50_fit_ASCL1)
#返回所有结果(所有基因的回归系数和 p 值)
EC50_res<-topTable(EC50_fit, number=Inf)
EC50_res2<-topTable(EC50_fit2, number=Inf)
EC50_res_YAP<-topTable(EC50_fit_YAP, number=Inf)
EC50_res_YAP2<-topTable(EC50_fit_YAP2, number=Inf)
EC50_res_ND1<-topTable(EC50_fit_ND1, number=Inf)
EC50_res_ASCL1<-topTable(EC50_fit_ASCL1, number=Inf)
#现在将包含基因符号的列 (hgnc_symbol) 添加到结果中
EC50_res<-putSym(EC50_res)
EC50_res2<-putSym(EC50_res2)
EC50_res_ASCL1<-putSym(EC50_res_ASCL1)
EC50_res_YAP<-putSym(EC50_res_YAP)
EC50_res_YAP2<-putSym(EC50_res_YAP2)
EC50_res_ND1<-putSym(EC50_res_ND1)
#输出DE结果
limResList<-list(SCLC_w196=EC50_res,
SCLC_wo196=EC50_res2,
YAP1_w196=EC50_res_YAP,
YAP1_wo196=EC50_res_YAP2,
ASCL1=EC50_res_ASCL1,
NEUROD1=EC50_res_ND1)
dir.create("/results/Limma_Results")
mapply(write.table, limResList, file=paste0("/results/Limma_Results/",names(limResList),".res.txt"),
quote=F, sep="t")
#将 limma 结果表转换为与 BinfTools 包函数兼容的格式
#'fixRows()' will set the rownames to just the HGNC symbol instead of the HGNC.ENTREZID format it currently is
bt_EC50<-fixRows(fromLimma(EC50_res), "res")
bt_EC502<-fixRows(fromLimma(EC50_res2), "res")
bt_EC50_YAP<-fixRows(fromLimma(EC50_res_YAP), "res")
bt_EC50_YAP2<-fixRows(fromLimma(EC50_res_YAP2), "res")
bt_EC50_ASCL1<-fixRows(fromLimma(EC50_res_ASCL1),"res")
bt_EC50_ND1<-fixRows(fromLimma(EC50_res_ND1),"res")
#用火山图显示显着的敏感/抵抗基因 (FDR < 0.05)
#Save sensitizer(down/negatively correlated)/resistor(up/positively correlated) genes
pdf("/results/VolcanoPlots.pdf")
EC_SCLC<-volcanoPlot(bt_EC50, "ALL SCLC", p=0.05, FC=0, showNum=F, returnDEG=T)
EC_SCLC2<-volcanoPlot(bt_EC502, "ALL SCLC - no H196", p=0.05, FC=0, showNum=F, returnDEG=T)
YAP_EC50<-volcanoPlot(bt_EC50_YAP, "YAP1 High", p=0.05, FC=0, showNum=F, returnDEG=T)
YAP_EC502<-volcanoPlot(bt_EC50_YAP2, "YAP1 High - no H196", pval=0.01, FC=0, showNum=F, returnDEG=T)
ASCL_EC50<-volcanoPlot(bt_EC50_ASCL1, "ASCL1 High", p=0.05, FC=0, showNum=F, returnDEG=T)
ND1_EC50<-volcanoPlot(bt_EC50_ND1, "NEUROD1 High", pval=0.01, FC=0, showNum=F, returnDEG=T)
dev.off()
#输出维恩图并返回重叠基因
pdf("/results/VennDiagrams.pdf")
Sensitizers<-plotVenn(list(SCLC=EC_SCLC$Down, YAP1=YAP_EC50$Down,
ASCL1=ASCL_EC50$Down, NEUROD1=ND1_EC50$Down),
title="Sensitizers", retVals=T)
Sensitizers2<-plotVenn(list(SCLC=EC_SCLC2$Down, YAP1=YAP_EC502$Down,
ASCL1=ASCL_EC50$Down, NEUROD1=ND1_EC50$Down),
title="Sensitizers - no H196", retVals=T)
Resistors<-plotVenn(list(SCLC=EC_SCLC$Up, YAP1=YAP_EC50$Up,
ASCL1=ASCL_EC50$Up, NEUROD1=ND1_EC50$Up),
title="Resistors", retVals=T)
Resistors2<-plotVenn(list(SCLC=EC_SCLC2$Up, YAP1=YAP_EC502$Up,
ASCL1=ASCL_EC50$Up, NEUROD1=ND1_EC50$Up),
title="Resistors - no H196", retVals=T)
dev.off()
#重命名重叠部分,使其更清晰
OLnames<-c("SCLC","YAP1","ASCL1","NEUROD1","SCLC_YAP1", "SCLC_ASCL1","SCLC_NEUROD1",
"YAP1_ASCL1","YAP1_NEUROD1","ASCL1_NEUROD1","SCLC_YAP1_ASCL1","SCLC_YAP1_NEUROD1",
"SCLC_ASCL1_NEUROD1","YAP1_ASCL1_NEUROD1","All")
names(Sensitizers)<-OLnames
names(Resistors)<-OLnames
names(Sensitizers2)<-OLnames
names(Resistors2)<-OLnames
#将重叠写出为基因集文件
dir.create("/results/gmts/")
write.gmt(Sensitizers, "/results/gmts/Sensitizer_Genes1.gmt")
write.gmt(Sensitizers2, "/results/gmts/Sensitizer_Genes2.gmt")
write.gmt(Resistors, "/results/gmts/Resistor_Genes1.gmt")
write.gmt(Resistors2, "/results/gmts/Resistor_Genes2.gmt")
小果还提供思路设计、定制生信分析、文献思路复现;有需要的小伙伴欢迎直接扫码咨询小果,竭诚为您的科研助力!
定制生信分析
服务器租赁
扫码咨询小果
往期回顾
01 |
02 |
03 |
04 |