转录组学习笔记7、8

zd00572

Posted by zd200572 on August 9, 2017

除venn图绘制,完全参考自: 1.生信媛(伪)从零开始学转录组(7、8) 2.http://www.jianshu.com/p/324aae3d5ea4

一、使用DESeq2进行差异基因分析

1.导入数据,构建 DESeq2 #所需的 DESeqDataSet 对象

library(DESeq2)
countData <- raw_count_filt[,2:5]
condition <- factor(c("control","KD","KD","control"))
dds <- DESeqDataSetFromMatrix(countData, DataFrame(condition), design= ~ condition )

2.这一步到下一步之间可以过滤掉一些low count数据,节省内存,提高运行速度##

nrow(dds)
#[1] 60492
dds <- dds[ rowSums(counts(dds)) > 1, ]
nrow(dds)
[1] 29493
#使用DESeq进行差异表达分析: DESeq包含三步
dds <- DESeq(dds)
'''
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
'''

3.用results获取结果: results的参数非常的多,这里不好具体展开

res <- results(dds)
#可用mcols查看每一项结果的具体含义
mcols(res, use.names = TRUE)
'''
DataFrame with 6 rows and 2 columns
   type description
<character> <character>
baseMean   intermediate   mean of normalized counts for all samples
log2FoldChange  results log2 fold change (MLE): condition KD vs control
lfcSE   results standard error: condition KD vs control
statresults Wald statistic: condition KD vs control
pvalue  results  Wald test p-value: condition KD vs control
padjresultsBH adjusted p-values
'''

4.用summary看描述性的结果

summary(res)
'''
out of 29493 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 3110, 11% 
LFC < 0 (down)   : 2108, 7.1% 
outliers [1] : 0, 0% 
low counts [2]   : 14867, 50% 
(mean count < 19)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
'''

5.画个MA图,还能标注p值最小的基因

6.没有经过 statistical moderation平缓log2 fold changes的情况

plotMA(res, ylim = c(-5,5))
topGene <- rownames(res)[which.min(res$padj)]
with(res[topGene, ], {
  points(baseMean, log2FoldChange, col="dodgerblue", cex=2, lwd=2)
  text(baseMean, log2FoldChange, topGene, pos=2, col="dodgerblue")
}) ![](http://osnq2ssd7.bkt.clouddn.com/%E6%B2%A1%E6%9C%89%E7%BB%8F%E8%BF%87statistical%20moderation%20%E5%B9%B3%E7%BC%93log2.png)

7.经过lfcShrink 收缩log2 fold change, 结果会好看很多

res.shrink <- lfcShrink(dds, contrast = c("condition","KD","control"), res=res)
plotMA(res.shrink, ylim = c(-5,5))
topGene <- rownames(res)[which.min(res$padj)]
with(res[topGene, ], {
  points(baseMean, log2FoldChange, col="dodgerblue", cex=2, lwd=2)
  text(baseMean, log2FoldChange, topGene, pos=2, col="dodgerblue")
})

8.先把差异表达的基因找出来

res.deseq2 <- subset(res, padj < 0.05)
summary(res.deseq2)
'''
out of 4276 with nonzero total read count
adjusted p-value < 0.1
LFC > 0 (up) : 2570, 60% 
LFC < 0 (down)   : 1706, 40% 
outliers [1] : 0, 0% 
low counts [2]   : 0, 0% 
(mean count < 19)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
'''

二、使用edgeR进行差异基因分析

第一步: 构建DGEList对象

library(edgeR)
group <- factor(c("control","KD","KD","control"))
genelist <- DGEList(counts=raw_count_filt[,2:5], group = group)

第二步: 过滤 low counts数据。

# 简单粗暴的方法
keep <- rowSums(genelist$count) > 50
# 利用CPM标准化
keep <- rowSums(cpm(genelist) > 0.5 ) >=2
table(keep)
'''
keep
FALSE  TRUE 
43944 16548 
'''
genelist.filted <- genelist[keep, ,keep.lib.sizes=FALSE]
summary(genelist.filted)
'''
Length Class  Mode   
counts  66192  -none- numeric
samples 3  data.frame list 
'''

第三步: 根据组成偏好(composition bias)标准化。

edgeR的calcNormFactors函数使用TMM算法对DGEList标准化

genelist.norm <- calcNormFactors(genelist.filted)

第四步: 实验设计矩阵(Design matrix), 类似于DESeq2中的design参数

design <- model.matrix(~0+group)
colnames(design) <- levels(group)
design
'''
  control KD
1   1  0
2   0  1
3   0  1
4   1  0
attr(,"assign")
[1] 1 1
attr(,"contrasts")
attr(,"contrasts")$group
[1] "contr.treatment"
'''

第五步: 估计离散值(Dispersion)

install.packages(statmod)
genelist.Disp <- estimateDisp(genelist.norm, design, robust = TRUE)
#进一步通过quasi-likelihood (QL)拟合NB模型,用于解释生物学和技术性导致的基因特异性变异
fit <- glmQLFit(genelist.Disp, design, robust=TRUE)
head(fit$coefficients)
'''
  control KD
ENSG00000000003 -10.194651  -9.981594
ENSG00000000419  -9.537979  -9.436034
ENSG00000000457 -11.244295 -11.521055
ENSG00000000460 -10.456470 -10.345220
ENSG00000001036  -9.600292  -9.267762
ENSG00000001084  -9.706178  -9.399947
'''

第六步: 差异表达检验(1)——这一步主要构建比较矩阵

cntr.vs.KD <- makeContrasts(control-KD, levels=design)
res <- glmQLFTest(fit, contrast=cntr.vs.KD)
ig.edger <- res$table[p.adjust(res$table$PValue, method = "BH") < 0.01, ]
#后续就是提取显著性差异的基因用作下游分析,做一些图看看

topTags(res,n=10)
'''
Coefficient:  1*control -1*KD 
logFC   logCPM F   PValue  FDR
ENSG00000092067  7.318378 5.224942 1918.1434 3.812446e-13 6.308835e-09
ENSG00000175928  5.529875 4.462782 1493.2747 1.392974e-12 1.152547e-08
ENSG00000280322  4.351239 4.750425 1283.8540 3.041784e-12 1.677848e-08
ENSG00000104332  3.458766 4.031653  563.5671 2.102783e-10 8.699212e-07
ENSG00000150893  2.103118 5.219683  501.3174 3.825353e-10 1.266039e-06
ENSG00000036448  1.835316 4.737906  387.6063 1.418224e-09 3.911461e-06
ENSG00000229807  2.726486 9.684564  345.7889 2.530529e-09 5.982171e-06
ENSG00000169136 -1.658243 5.724014  330.3849 3.186983e-09 5.991361e-06
ENSG00000142530 -1.921870 6.145287  328.9378 3.258536e-09 5.991361e-06
ENSG00000062038  2.460219 3.545624  314.3368 4.099064e-09 6.783131e-06
'''
is.de <- decideTestsDGE(res)
summary(is.de)
'''
   1*control -1*KD
-13111
010169
1 3268
'''
plotMD(res, status=is.de, values=c(1,-1), col=c("red","blue"),
   legend="topright")

第六步:差异表达检验(2)

找表达量变化比较大的基因,对应的函数是glmTreat

#tr <- glmTreat(fit, contrast=B.LvsP, lfc=log2(1.5) #object 'B.LvsP' not found
tr <- glmTreat(fit, coef=2, lfc=log2(1.5))
plotMD(tr,status=is.de, values=c(1,-1), col=c("red","blue"),legend="topright")

原参数有问题,修改了个参数的

三、使用limma进行差异分析

1.数据预处理: Limma使用edgeR的DGEList对象,并且过滤方法都是一致的,对应edgeR的第一步,第二步, 第三步

library(edgeR)
library(limma)
group <- factor(c("control","KD","KD","control"))
genelist <- DGEList(counts=raw_count_filt[,2:5], group = group)
### filter base  use CPM
keep <- rowSums(cpm(genelist) > 0.5 ) >=2
table(keep)
'''
keep
FALSE  TRUE 
43944 16548 
'''
genelist.filted <- genelist[keep, ,keep.lib.sizes=FALSE]
### normalizaition
x <- calcNormFactors(genelist.filted, method = "TMM")

2.差异表达分析: 使用”limma-trend“

design <- model.matrix(~0+group)
colnames(design) <- levels(group)
logCPM <- cpm(genelist.norm, log=TRUE, prior.count=3)
fit <- lmFit(logCPM, design)
fit <- eBayes(fit, trend=TRUE)
topTable(fit, coef=ncol(design))
'''
   logFC  AveExprt  P.Valueadj.P.ValB
ENSG00000112378 7.908618 7.423498 107.1885 3.275833e-12 5.883721e-11 18.31544
ENSG00000070669 7.720042 7.311585 106.4612 3.430632e-12 5.883721e-11 18.28501
ENSG00000143222 7.144495 6.617131 106.3505 3.454928e-12 5.883721e-11 18.28034
ENSG00000105063 8.155105 7.754410 106.0044 3.532139e-12 5.883721e-11 18.26570
ENSG00000167513 8.504830 8.022979 105.9841 3.536740e-12 5.883721e-11 18.26484
ENSG00000204673 7.195007 6.784525 105.7979 3.579168e-12 5.883721e-11 18.25692
ENSG00000161671 7.710280 7.164736 105.7947 3.579908e-12 5.883721e-11 18.25678
ENSG00000088038 7.420823 7.047706 105.6121 3.622078e-12 5.883721e-11 18.24899
ENSG00000126461 7.864620 7.508545 105.4921 3.650112e-12 5.883721e-11 18.24386
ENSG00000057757 7.282897 6.855236 105.3629 3.680589e-12 5.883721e-11 18.23832
'''

3.差异表达分析: 使用limma-voom

### DGE with voom
v <- voom(genelist.norm, design, plot=TRUE)
#v <- voom(counts, design, plot=TRUE)
fit <- lmFit(v, design)
fit <- eBayes(fit)
all <- topTable(fit, coef=ncol(design), number=10000)
sig.limma <- all[all$adj.P.Val < 0.01, ]
fit <- treat(fit, lfc=log2(1.2))
topTreat(fit, coef=ncol(design))
'''
   logFC  AveExprt  P.Value   adj.P.Val
ENSG00000167658 12.43870 12.37409 151.8180 1.306387e-09 1.40086e-07
ENSG00000080824 12.09251 11.76151 144.6511 1.622197e-09 1.40086e-07
ENSG00000074800 12.40951 11.98962 138.0662 2.007816e-09 1.40086e-07
ENSG00000067225 11.63560 11.39947 136.4730 2.104020e-09 1.40086e-07
ENSG00000184009 11.13266 11.09332 135.8970 2.135837e-09 1.40086e-07
ENSG00000075624 11.36112 11.27684 133.9536 2.284186e-09 1.40086e-07
ENSG00000065978 11.36201 11.03076 133.6619 2.306871e-09 1.40086e-07
ENSG00000109971 11.46223 11.19645 132.2101 2.425811e-09 1.40086e-07
ENSG00000149925 10.80475 10.33477 129.4707 2.652039e-09 1.40086e-07
ENSG00000142676 11.07432 10.81495 128.3201 2.768057e-09 1.40086e-07
'''

4.提取了各种的显著性基因,用UpSetR.

#install.packages("UpSetR")
library(UpSetR)
input <- fromList(list(edgeR=rownames(ig.edger), DESeq2=rownames(res.deseq2), limma=rownames(sig.limma)))
summary(input)
'''
edgeRDESeq2   limma   
Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:1.0000  
Median :0.0000   Median :0.0000   Median :1.0000  
Mean   :0.3173   Mean   :0.3899   Mean   :0.9118  
3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
Max.   :1.0000   Max.   :1.0000   Max.   :1.0000 
'''
upset(input)

5.venn图

丑丑的venn图,刚学,不要见怪。

vennDiagram(input, include = "both", names = c("edger", "edger", "deseq2"),cex = 1,counts.col = "red")
#limma包不能绘制彩色图。。。。。。
#install.packages("venneuler")
#install.packages("rJava")
library("rJava")#需要java......
library(gplots)
install.packages("gplots")
venn(input)#也不能是彩色的。。。。。。

library(VennDiagram)
names(input)<-c("edger", "edger", "deseq2")
venn.diagram(input,"VennDiagram.venn.png")
'''
Error in `[[<-.data.frame`(`*tmp*`, i, value = c(1L, 0L)) : 
  replacement has 2 rows, data has 10967
'''

四、富集分析

Y叔的clusterProfiler

1.数据筛选,根据padj < 0.05 且Log2FoldChange的绝对值大于1的标准

library(DESeq2)
#summary(res)
#mcols(res, use.names = TRUE)
deseq2.sig <- subset(res, padj < 0.05 & abs(log2FoldChange) > 1)
summary(deseq2.sig)
#安装包下载注释数据(rog.HS.eg.db)
source("https://bioconductor.org/biocLite.R")
options(BioC_mirror="http://mirrors.ustc.edu.cn/bioc/")
biocLite("clusterProfiler")
biocLite("topGO")
library("topGO")
biocLite("AnnotationHub")
biocLite("Rgraphviz")
library(Rgraphviz)
biocLite("graph")
library(graph)
library(AnnotationHub)
library(clusterProfiler)
ah <- AnnotationHub()
org.hs <- ah[['AH53766']]

2.GO富集和GESA

'''
enrichGO(gene=deseq2.sig , OrgDb=org.hs, keytype = "ENTREZID", ont = "MF",
 pvalueCutoff = 0.05, pAdjustMethod = "BH", universe, qvalueCutoff = 0.2,
 minGSSize = 10, maxGSSize = 500, readable = FALSE, pool = FALSE)
'''
ego <- enrichGO(
  gene = row.names(deseq2.sig),
  OrgDb = org.hs,
  keytype = "ENSEMBL",
  ont = "MF"
)

3.可视化分为冒泡图和网络图等,画起来也就几行代码的事情

dotplot(ego,font.size=5)
enrichMap(ego, vertex.label.cex=0.6, layout=igraph::layout.kamada.kawai)
plotGOgraph(ego)