除venn图绘制,完全参考自: 1.生信媛(伪)从零开始学转录组(7、8) 2.http://www.jianshu.com/p/324aae3d5ea4
一、使用DESeq2进行差异基因分析
1.导入数据,构建 DESeq2 #所需的 DESeqDataSet 对象
library(DESeq2)
countData <- raw_count_filt[,2:5]
condition <- factor(c("control","KD","KD","control"))
dds <- DESeqDataSetFromMatrix(countData, DataFrame(condition), design= ~ condition )
2.这一步到下一步之间可以过滤掉一些low count数据,节省内存,提高运行速度##
nrow(dds)
#[1] 60492
dds <- dds[ rowSums(counts(dds)) > 1, ]
nrow(dds)
[1] 29493
#使用DESeq进行差异表达分析: DESeq包含三步
dds <- DESeq(dds)
'''
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
'''
3.用results获取结果: results的参数非常的多,这里不好具体展开
res <- results(dds)
#可用mcols查看每一项结果的具体含义
mcols(res, use.names = TRUE)
'''
DataFrame with 6 rows and 2 columns
type description
<character> <character>
baseMean intermediate mean of normalized counts for all samples
log2FoldChange results log2 fold change (MLE): condition KD vs control
lfcSE results standard error: condition KD vs control
statresults Wald statistic: condition KD vs control
pvalue results Wald test p-value: condition KD vs control
padjresultsBH adjusted p-values
'''
4.用summary看描述性的结果
summary(res)
'''
out of 29493 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 3110, 11%
LFC < 0 (down) : 2108, 7.1%
outliers [1] : 0, 0%
low counts [2] : 14867, 50%
(mean count < 19)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
'''
5.画个MA图,还能标注p值最小的基因
6.没有经过 statistical moderation平缓log2 fold changes的情况
plotMA(res, ylim = c(-5,5))
topGene <- rownames(res)[which.min(res$padj)]
with(res[topGene, ], {
points(baseMean, log2FoldChange, col="dodgerblue", cex=2, lwd=2)
text(baseMean, log2FoldChange, topGene, pos=2, col="dodgerblue")
}) 
7.经过lfcShrink 收缩log2 fold change, 结果会好看很多
res.shrink <- lfcShrink(dds, contrast = c("condition","KD","control"), res=res)
plotMA(res.shrink, ylim = c(-5,5))
topGene <- rownames(res)[which.min(res$padj)]
with(res[topGene, ], {
points(baseMean, log2FoldChange, col="dodgerblue", cex=2, lwd=2)
text(baseMean, log2FoldChange, topGene, pos=2, col="dodgerblue")
})
8.先把差异表达的基因找出来
res.deseq2 <- subset(res, padj < 0.05)
summary(res.deseq2)
'''
out of 4276 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 2570, 60%
LFC < 0 (down) : 1706, 40%
outliers [1] : 0, 0%
low counts [2] : 0, 0%
(mean count < 19)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
'''
二、使用edgeR进行差异基因分析
第一步: 构建DGEList对象
library(edgeR)
group <- factor(c("control","KD","KD","control"))
genelist <- DGEList(counts=raw_count_filt[,2:5], group = group)
第二步: 过滤 low counts数据。
# 简单粗暴的方法
keep <- rowSums(genelist$count) > 50
# 利用CPM标准化
keep <- rowSums(cpm(genelist) > 0.5 ) >=2
table(keep)
'''
keep
FALSE TRUE
43944 16548
'''
genelist.filted <- genelist[keep, ,keep.lib.sizes=FALSE]
summary(genelist.filted)
'''
Length Class Mode
counts 66192 -none- numeric
samples 3 data.frame list
'''
第三步: 根据组成偏好(composition bias)标准化。
edgeR的calcNormFactors函数使用TMM算法对DGEList标准化
genelist.norm <- calcNormFactors(genelist.filted)
第四步: 实验设计矩阵(Design matrix), 类似于DESeq2中的design参数
design <- model.matrix(~0+group)
colnames(design) <- levels(group)
design
'''
control KD
1 1 0
2 0 1
3 0 1
4 1 0
attr(,"assign")
[1] 1 1
attr(,"contrasts")
attr(,"contrasts")$group
[1] "contr.treatment"
'''
第五步: 估计离散值(Dispersion)
install.packages(statmod)
genelist.Disp <- estimateDisp(genelist.norm, design, robust = TRUE)
#进一步通过quasi-likelihood (QL)拟合NB模型,用于解释生物学和技术性导致的基因特异性变异
fit <- glmQLFit(genelist.Disp, design, robust=TRUE)
head(fit$coefficients)
'''
control KD
ENSG00000000003 -10.194651 -9.981594
ENSG00000000419 -9.537979 -9.436034
ENSG00000000457 -11.244295 -11.521055
ENSG00000000460 -10.456470 -10.345220
ENSG00000001036 -9.600292 -9.267762
ENSG00000001084 -9.706178 -9.399947
'''
第六步: 差异表达检验(1)——这一步主要构建比较矩阵
cntr.vs.KD <- makeContrasts(control-KD, levels=design)
res <- glmQLFTest(fit, contrast=cntr.vs.KD)
ig.edger <- res$table[p.adjust(res$table$PValue, method = "BH") < 0.01, ]
#后续就是提取显著性差异的基因用作下游分析,做一些图看看
topTags(res,n=10)
'''
Coefficient: 1*control -1*KD
logFC logCPM F PValue FDR
ENSG00000092067 7.318378 5.224942 1918.1434 3.812446e-13 6.308835e-09
ENSG00000175928 5.529875 4.462782 1493.2747 1.392974e-12 1.152547e-08
ENSG00000280322 4.351239 4.750425 1283.8540 3.041784e-12 1.677848e-08
ENSG00000104332 3.458766 4.031653 563.5671 2.102783e-10 8.699212e-07
ENSG00000150893 2.103118 5.219683 501.3174 3.825353e-10 1.266039e-06
ENSG00000036448 1.835316 4.737906 387.6063 1.418224e-09 3.911461e-06
ENSG00000229807 2.726486 9.684564 345.7889 2.530529e-09 5.982171e-06
ENSG00000169136 -1.658243 5.724014 330.3849 3.186983e-09 5.991361e-06
ENSG00000142530 -1.921870 6.145287 328.9378 3.258536e-09 5.991361e-06
ENSG00000062038 2.460219 3.545624 314.3368 4.099064e-09 6.783131e-06
'''
is.de <- decideTestsDGE(res)
summary(is.de)
'''
1*control -1*KD
-13111
010169
1 3268
'''
plotMD(res, status=is.de, values=c(1,-1), col=c("red","blue"),
legend="topright")
第六步:差异表达检验(2)
找表达量变化比较大的基因,对应的函数是glmTreat
#tr <- glmTreat(fit, contrast=B.LvsP, lfc=log2(1.5) #object 'B.LvsP' not found
tr <- glmTreat(fit, coef=2, lfc=log2(1.5))
plotMD(tr,status=is.de, values=c(1,-1), col=c("red","blue"),legend="topright")
三、使用limma进行差异分析
1.数据预处理: Limma使用edgeR的DGEList对象,并且过滤方法都是一致的,对应edgeR的第一步,第二步, 第三步
library(edgeR)
library(limma)
group <- factor(c("control","KD","KD","control"))
genelist <- DGEList(counts=raw_count_filt[,2:5], group = group)
### filter base use CPM
keep <- rowSums(cpm(genelist) > 0.5 ) >=2
table(keep)
'''
keep
FALSE TRUE
43944 16548
'''
genelist.filted <- genelist[keep, ,keep.lib.sizes=FALSE]
### normalizaition
x <- calcNormFactors(genelist.filted, method = "TMM")
2.差异表达分析: 使用”limma-trend“
design <- model.matrix(~0+group)
colnames(design) <- levels(group)
logCPM <- cpm(genelist.norm, log=TRUE, prior.count=3)
fit <- lmFit(logCPM, design)
fit <- eBayes(fit, trend=TRUE)
topTable(fit, coef=ncol(design))
'''
logFC AveExprt P.Valueadj.P.ValB
ENSG00000112378 7.908618 7.423498 107.1885 3.275833e-12 5.883721e-11 18.31544
ENSG00000070669 7.720042 7.311585 106.4612 3.430632e-12 5.883721e-11 18.28501
ENSG00000143222 7.144495 6.617131 106.3505 3.454928e-12 5.883721e-11 18.28034
ENSG00000105063 8.155105 7.754410 106.0044 3.532139e-12 5.883721e-11 18.26570
ENSG00000167513 8.504830 8.022979 105.9841 3.536740e-12 5.883721e-11 18.26484
ENSG00000204673 7.195007 6.784525 105.7979 3.579168e-12 5.883721e-11 18.25692
ENSG00000161671 7.710280 7.164736 105.7947 3.579908e-12 5.883721e-11 18.25678
ENSG00000088038 7.420823 7.047706 105.6121 3.622078e-12 5.883721e-11 18.24899
ENSG00000126461 7.864620 7.508545 105.4921 3.650112e-12 5.883721e-11 18.24386
ENSG00000057757 7.282897 6.855236 105.3629 3.680589e-12 5.883721e-11 18.23832
'''
3.差异表达分析: 使用limma-voom
### DGE with voom
v <- voom(genelist.norm, design, plot=TRUE)
#v <- voom(counts, design, plot=TRUE)
fit <- lmFit(v, design)
fit <- eBayes(fit)
all <- topTable(fit, coef=ncol(design), number=10000)
sig.limma <- all[all$adj.P.Val < 0.01, ]
fit <- treat(fit, lfc=log2(1.2))
topTreat(fit, coef=ncol(design))
'''
logFC AveExprt P.Value adj.P.Val
ENSG00000167658 12.43870 12.37409 151.8180 1.306387e-09 1.40086e-07
ENSG00000080824 12.09251 11.76151 144.6511 1.622197e-09 1.40086e-07
ENSG00000074800 12.40951 11.98962 138.0662 2.007816e-09 1.40086e-07
ENSG00000067225 11.63560 11.39947 136.4730 2.104020e-09 1.40086e-07
ENSG00000184009 11.13266 11.09332 135.8970 2.135837e-09 1.40086e-07
ENSG00000075624 11.36112 11.27684 133.9536 2.284186e-09 1.40086e-07
ENSG00000065978 11.36201 11.03076 133.6619 2.306871e-09 1.40086e-07
ENSG00000109971 11.46223 11.19645 132.2101 2.425811e-09 1.40086e-07
ENSG00000149925 10.80475 10.33477 129.4707 2.652039e-09 1.40086e-07
ENSG00000142676 11.07432 10.81495 128.3201 2.768057e-09 1.40086e-07
'''
4.提取了各种的显著性基因,用UpSetR.
#install.packages("UpSetR")
library(UpSetR)
input <- fromList(list(edgeR=rownames(ig.edger), DESeq2=rownames(res.deseq2), limma=rownames(sig.limma)))
summary(input)
'''
edgeRDESeq2 limma
Min. :0.0000 Min. :0.0000 Min. :0.0000
1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.0000
Median :0.0000 Median :0.0000 Median :1.0000
Mean :0.3173 Mean :0.3899 Mean :0.9118
3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
Max. :1.0000 Max. :1.0000 Max. :1.0000
'''
upset(input)
5.venn图
丑丑的venn图,刚学,不要见怪。
vennDiagram(input, include = "both", names = c("edger", "edger", "deseq2"),cex = 1,counts.col = "red")
#limma包不能绘制彩色图。。。。。。
#install.packages("venneuler")
#install.packages("rJava")
library("rJava")#需要java......
library(gplots)
install.packages("gplots")
venn(input)#也不能是彩色的。。。。。。
library(VennDiagram)
names(input)<-c("edger", "edger", "deseq2")
venn.diagram(input,"VennDiagram.venn.png")
'''
Error in `[[<-.data.frame`(`*tmp*`, i, value = c(1L, 0L)) :
replacement has 2 rows, data has 10967
'''
四、富集分析
Y叔的clusterProfiler
1.数据筛选,根据padj < 0.05 且Log2FoldChange的绝对值大于1的标准
library(DESeq2)
#summary(res)
#mcols(res, use.names = TRUE)
deseq2.sig <- subset(res, padj < 0.05 & abs(log2FoldChange) > 1)
summary(deseq2.sig)
#安装包下载注释数据(rog.HS.eg.db)
source("https://bioconductor.org/biocLite.R")
options(BioC_mirror="http://mirrors.ustc.edu.cn/bioc/")
biocLite("clusterProfiler")
biocLite("topGO")
library("topGO")
biocLite("AnnotationHub")
biocLite("Rgraphviz")
library(Rgraphviz)
biocLite("graph")
library(graph)
library(AnnotationHub)
library(clusterProfiler)
ah <- AnnotationHub()
org.hs <- ah[['AH53766']]
2.GO富集和GESA
'''
enrichGO(gene=deseq2.sig , OrgDb=org.hs, keytype = "ENTREZID", ont = "MF",
pvalueCutoff = 0.05, pAdjustMethod = "BH", universe, qvalueCutoff = 0.2,
minGSSize = 10, maxGSSize = 500, readable = FALSE, pool = FALSE)
'''
ego <- enrichGO(
gene = row.names(deseq2.sig),
OrgDb = org.hs,
keytype = "ENSEMBL",
ont = "MF"
)
3.可视化分为冒泡图和网络图等,画起来也就几行代码的事情
dotplot(ego,font.size=5)
enrichMap(ego, vertex.label.cex=0.6, layout=igraph::layout.kamada.kawai)
plotGOgraph(ego)