You will form five groups of 4-5 people each. Take 5-7 minutes to sketch written plans of attack for the following questions, then submit your plans for discussion and demonstration.
load("montpick_eset.RData")
montpick.eset
## Error: object 'montpick.eset' not found
Remove genes whose counts are all zero (pure zeroes).
pz = apply(exprs(montpick.eset), 1,
    function(x) all(x == 0))    # TRUE for genes with all-zero counts
mnz = montpick.eset[-which(pz),]
There is some additional scrubbing needed…
dim(mnz)
## Features Samples
## 12160 129
A simple hierarchical clustering approach:
hc = hclust(dist(t(exprs(mnz))))
table(cutree(hc, k=2))
##
## 1 2
## 128 1
table(cutree(hc, k=3))
##
## 1 2 3
## 84 44 1
table(cutree(hc, k=4))
##
## 1 2 3 4
## 84 33 11 1
Try again on the log scale, shifting the counts by one to avoid log(0).
hc.l = hclust(dist(t(log(exprs(mnz)+1))))
table(cutree(hc.l, k=2))
##
## 1 2
## 122 7
table(cutree(hc.l, k=3))
##
## 1 2 3
## 30 92 7
table(cutree(hc.l, k=4))
##
## 1 2 3 4
## 30 92 6 1
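Do these log-scale clusters track the known population labels? One quick check, assuming pData(mnz) carries the population column used in the DESeq2 model below:
table(cluster = cutree(hc.l, k=2), population = mnz$population)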
Try PAM (partitioning around medoids):
library(cluster)
pam.2 = pam(dist(t(log(exprs(mnz)+1))), k=2)
pam.3 = pam(dist(t(log(exprs(mnz)+1))), k=3)
pam.4 = pam(dist(t(log(exprs(mnz)+1))), k=4)
table(pam.4$clust)
##
## 1 2 3 4
## 45 20 49 15
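One way to compare these solutions is the average silhouette width that pam() already computes for each partition (larger is better); a quick sketch:
# average silhouette widths for k = 2, 3, 4
c(k2 = pam.2$silinfo$avg.width,
  k3 = pam.3$silinfo$avg.width,
  k4 = pam.4$silinfo$avg.width)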
Try k-means:
km.2 = kmeans(t(log(exprs(mnz)+1)), 2)
km.3 = kmeans(t(log(exprs(mnz)+1)), 3)
table(km.2$clust)
##
## 1 2
## 77 52
table(km.3$clust)
##
## 1 2 3
## 33 60 36
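Note that kmeans() with a single random start can settle into a local optimum, so cluster sizes may vary from run to run; using several starts via the nstart argument gives a more stable partition. A sketch:
# rerun with 20 random starts; kmeans keeps the best (lowest within-cluster SS) solution
km.2r = kmeans(t(log(exprs(mnz)+1)), centers=2, nstart=20)
table(km.2r$cluster)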
Try k-means on the voom transformation of the counts.
library(limma)
vmnz = voom(exprs(mnz))
txd = vmnz$E
lkm2v = kmeans(t(txd),2)
lkm4v = kmeans(t(txd),4)
table(lkm2v$clust)
##
## 1 2
## 63 66
table(lkm4v$clust)
##
## 1 2 3 4
## 15 43 60 11
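How different is the voom-based partition from the log-scale one? Here is a sketch comparing the two 2-group solutions with the adjusted Rand index from the mclust package (1 means identical partitions, 0 means chance-level agreement):
library(mclust)
adjustedRandIndex(km.2$cluster, lkm2v$cluster)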
Consider a cluster stability measure: how reproducible are these partitions when the samples are resampled? See the sketch below.
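A minimal hand-rolled sketch (the Bioconductor clusterStab package offers related functionality): bootstrap the samples, re-run k-means, and measure agreement with the original partition using adjustedRandIndex() from mclust, loaded above.
x = t(log(exprs(mnz)+1))
set.seed(123)                        # arbitrary seed, only for reproducibility
ari = replicate(50, {
    idx = sample(nrow(x), replace=TRUE)          # bootstrap the samples
    kb = kmeans(x[idx,], centers=2, nstart=5)    # recluster the resampled data
    adjustedRandIndex(kb$cluster, km.2$cluster[idx])
})
summary(ari)                         # values near 1 indicate a stable 2-group structure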
Try a DESeq2 analysis.
library(DESeq2)    # also makes SummarizedExperiment() and DataFrame() available
se = SummarizedExperiment(assays=list(counts=exprs(mnz)))
colData(se) = DataFrame(pData(mnz))
de = DESeqDataSet(se, ~population)
derun = DESeq(de)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 271 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds))
## estimating dispersions
## fitting model and testing
res = results(derun)
summary(res)
##
## out of 12160 with nonzero total read count
##
## at adjusted p-value < 0.1:
##
## LFC > 0 (up) : 3143, 26%
## LFC < 0 (down) : 2975, 24%
##
## filtered * : 1216, 10%
## flagged ** : 0, 0%
##
## * for low mean count, see independentFiltering argument of results()
## ** for high Cook's distance, see cooksCutoff argument of results()
res[which(res$padj < .005),]
## log2 fold change (MAP): population YRI vs CEU
## Wald test p-value: population YRI vs CEU
## DataFrame with 3804 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000000419 41.785 0.5559 0.09824 5.658 1.528e-08
## ENSG00000000457 48.692 0.4202 0.09047 4.645 3.400e-06
## ENSG00000002330 65.562 -0.6426 0.08949 -7.180 6.966e-13
## ENSG00000002745 1.779 1.7037 0.40252 4.233 2.310e-05
## ENSG00000002834 155.210 -0.6368 0.10974 -5.803 6.516e-09
## ... ... ... ... ... ...
## ENSG00000253873 8.008 -0.6856 0.1749 -3.920 8.849e-05
## ENSG00000253954 9.913 -0.6413 0.1505 -4.262 2.023e-05
## ENSG00000254004 4.520 1.0417 0.1723 6.045 1.492e-09
## ENSG00000254128 9.615 1.0443 0.1772 5.894 3.774e-09
## ENSG00000254206 1.843 2.2559 0.3519 6.410 1.452e-10
## padj
## <numeric>
## ENSG00000000419 1.151e-07
## ENSG00000000457 1.732e-05
## ENSG00000002330 1.015e-11
## ENSG00000002745 1.007e-04
## ENSG00000002834 5.190e-08
## ... ...
## ENSG00000253873 3.470e-04
## ENSG00000253954 8.945e-05
## ENSG00000254004 1.331e-08
## ENSG00000254128 3.120e-08
## ENSG00000254206 1.511e-09
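A quick look at the strongest hits and the overall pattern of effect sizes (DESeq2's plotMA method is called explicitly to avoid masking by limma, which is also attached):
# genes with the smallest adjusted p-values
head(res[order(res$padj),])
# MA plot: log2 fold change against mean of normalized counts
DESeq2::plotMA(res, ylim=c(-3,3))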
Is this sensitive to technical artifacts? Use SVA on log-transformed data and check.
library(sva)
num = num.sv(log(exprs(mnz)+1),
    mod=model.matrix(~population, data=pData(mnz)), method="leek")
sv1 = sva(log(exprs(mnz)+1),
    mod=model.matrix(~population, data=pData(mnz)), n.sv=num)
## Number of significant surrogate variables is: 1
## Iteration (out of 5 ):1 2 3 4 5
print(num)
## [1] 1
summary(sv1$sv)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.1600 -0.0633 -0.0220 0.0000 0.0503 0.3120
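Before adjusting, it is worth checking whether the surrogate variable is itself associated with population; if it were strongly confounded with the comparison of interest, including it could remove real signal. A quick sketch:
# distribution of the estimated surrogate variable within each population
boxplot(sv1$sv[,1] ~ mnz$population, xlab="population", ylab="surrogate variable 1")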
secorr = se
secorr$sv = sv1$sv
decorr = DESeqDataSet(secorr, ~sv+population)
derun2 = DESeq(decorr)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
corres = results(derun2)
summary(corres)
##
## out of 12160 with nonzero total read count
##
## at adjusted p-value < 0.1:
##
## LFC > 0 (up) : 3161, 26%
## LFC < 0 (down) : 3006, 25%
##
## filtered * : 1216, 10%
## flagged ** : 0, 0%
##
## * for low mean count, see independentFiltering argument of results()
## ** for high Cook's distance, see cooksCutoff argument of results()
corres[which(corres$padj < .005),]
## log2 fold change (MAP): population YRI vs CEU
## Wald test p-value: population YRI vs CEU
## DataFrame with 3818 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000000419 41.785 0.5780 0.09948 5.810 6.247e-09
## ENSG00000000457 48.692 0.3842 0.09019 4.260 2.040e-05
## ENSG00000002330 65.562 -0.6635 0.09199 -7.213 5.487e-13
## ENSG00000002745 1.779 1.7222 0.40362 4.267 1.982e-05
## ENSG00000002834 155.210 -0.7126 0.11082 -6.430 1.277e-10
## ... ... ... ... ... ...
## ENSG00000253873 8.008 -0.6432 0.1780 -3.612 3.034e-04
## ENSG00000253954 9.913 -0.6010 0.1526 -3.938 8.220e-05
## ENSG00000254004 4.520 1.0304 0.1752 5.880 4.095e-09
## ENSG00000254128 9.615 1.0214 0.1800 5.673 1.400e-08
## ENSG00000254206 1.843 2.3030 0.3464 6.647 2.983e-11
## padj
## <numeric>
## ENSG00000000419 4.947e-08
## ENSG00000000457 8.866e-05
## ENSG00000002330 7.980e-12
## ENSG00000002745 8.639e-05
## ENSG00000002834 1.302e-09
## ... ...
## ENSG00000253873 1.045e-03
## ENSG00000253954 3.181e-04
## ENSG00000254004 3.337e-08
## ENSG00000254128 1.048e-07
## ENSG00000254206 3.281e-10
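The adjustment changes the results only modestly; cross-tabulating which genes pass the same threshold with and without the surrogate variable makes this concrete:
# agreement at adjusted p < .005 before and after the SVA adjustment
table(unadjusted = res$padj < .005, sva_adjusted = corres$padj < .005)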
Approaches to classification
library(randomGLM)
## Loading required package: MASS
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:genefilter':
##
## area
##
## The following object is masked from 'package:AnnotationDbi':
##
## select
##
## Loading required package: foreach
## foreach: simple, scalable parallel programming from Revolution Analytics
## Use Revolution R for scalability, fault tolerance and more.
## http://www.revolutionanalytics.com
## Loading required package: doParallel
## Loading required package: iterators
mysamp = sample(1:129, size=75)    # random split: 75 training samples, 54 test samples
train = mnz[,sort(mysamp)]
test = mnz[,-sort(mysamp)]
y = 1*(train$population == "YRI")
rglm1 = randomGLM( t(exprs(train)), y, xtest=t(exprs(test)) )
table(rglm1$predictedTest, test$population)
##
## CEU YRI
## 0 22 3
## 1 0 29
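The confusion table corresponds to a small test-set error; here is a sketch computing it directly, assuming (as the table suggests) that predictedTest holds the 0/1 class predictions:
ytest = 1*(test$population == "YRI")
pred = as.numeric(as.character(rglm1$predictedTest))   # coerce in case predictions come back as a factor
mean(pred != ytest)                                    # test-set error rate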
An alternative approach to ensemble learning
library(bigrf)
## Loading required package: bigmemory
## Loading required package: bigmemory.sri
## Loading required package: BH
##
## bigmemory >= 4.0 is a major revision since 3.1.2; please see packages
## biganalytics and bigtabulate and http://www.bigmemory.org for more information.
isyri = as.integer(1*(train$population=="YRI")+1)    # class labels coded 1 (CEU), 2 (YRI)
bb = bigrfc(data.frame(t(exprs(train))), isyri)
## OOB errors:
## Tree Overall error Error by class
## 1 2
## 10 12.00 5.26 18.92
## 20 10.67 7.89 13.51
## 30 6.67 5.26 8.11
## 40 6.67 2.63 10.81
## 50 5.33 2.63 8.11
bigpr = predict(bb, x=data.frame(t(exprs(test))))
## Processing tree number:
## 10
## 20
## 30
## 40
## 50
table(bigpr, test$population)
##
## bigpr CEU YRI
## 1 22 3
## 2 0 29
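And the corresponding test error rate for the random forest, computed from the confusion table above (class 1 = CEU, 2 = YRI):
cm = table(bigpr, test$population)
1 - sum(diag(cm))/sum(cm)    # overall misclassification rate on the held-out samples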