hopkinsV.R
########################################################################### # Demo script # Bioconductor Short Course # JHMI Microarray Core Facility # 28-29/10/2002 # Sandrine Dudoit, Robert Gentleman, and Rafael Irizarry # # annotate, genefilter, cluster ########################################################################### # Load packages
library(mva) library(cluster) library(ctest) library(ellipse) library(lattice) library(MASS) library(XML) library(Biobase) library(annotate) library(genefilter) library(geneplotter) library(golubEsets) library(hgu95a)
########################################################################### # annotate ########################################################################### # Data packages: environments
search() ls(2) ? hgu95a hgu95a()
########################################################################### # get, multiget
g<-"41046_s_at" get(g, env = hgu95aACCNUM) get(g, env = hgu95aLOCUSID) get(g, env = hgu95aSYMBOL) get(g, env = hgu95aGENENAME) get(g, env = hgu95aSUMFUNC) get(g, env = hgu95aUNIGENE) get(g, env = hgu95aCHR) get(g, env = hgu95aCHRLOC) get(g, env = hgu95aCHRORI) get(g, env = hgu95aMAP) get(g, env = hgu95aPMID) get(g, env = hgu95aGO)
gnames<-ls(hgu95aACCNUM) multiget(gnames[1:4], env = hgu95aPMID)
########################################################################### # PubMed
# With pubmed pmids<-get(g, env = hgu95aPMID) pubmed(pmids, disp="browser")
# With pm.getabst absts2<-pm.getabst(g,"hgu95a") pm.titles(absts2) sapply(absts2, function(x) pm.abstGrep("[Pp]rotein", x))
########################################################################### # GenBank
g10<-gnames[1:10] gbacc<-multiget(g10,hgu95aACCNUM) genbank(gbacc,disp="browser") gb<-genbank(gbacc[1],disp="data")
########################################################################### # LocusLink
llid<-multiget(gnames[1:10],hgu95aLOCUSID) map<-multiget(gnames[1:10],hgu95aMAP) symb<-multiget(gnames[1:10],hgu95aSYMBOL)
res<-data.frame(gnames[1:10],cbind(unlist(symb),unlist(map))) names(res)<-c("Affy ID","Gene symbol", "Chromosomal location") ll.htmlpage(llid, filename="LocusLink.html", title="HTML report", othernames=res, table.head=c("LocusID",names(res)), table.center = TRUE)
########################################################################### # Chromosomal maps
# hgByChroms: list matching affy names to location for each chorm (24) # hgCLengths: vector with length of 24 chromosomes data(hgByChroms) data(hgCLengths)
newChrom <- buildChromClass("Human", "Cheng Li's HG data", hgByChroms, hgCLengths) newChrom
######################### # cPlot
cPlot(newChrom) cPlot(newChrom,c("1","2"),fg="blue",scale="relative")
par(mfrow=c(2,1)) for (sc in c("max","relative")) cPlot(newChrom,fg="blue",scale=sc)
######################### # alongChrom
data(eset) data(hgu95AProbLocs) cols <- c("red", "green", "blue") cols <- cols[eset$cov3]
par(mfrow=c(3,2)) alongChrom(eset, "1", newChrom, xloc="equispaced", plotFormat="cumulative", col=cols,lwd=2) alongChrom(eset, "1", newChrom, xloc="physical", col=cols,lwd=2) alongChrom(eset, "1", newChrom, xloc="equispaced", plotFormat="local", col=cols,lwd=2) alongChrom(eset, "1", newChrom, xloc="equispaced", plotFormat="local", col=cols, type="p", pch=16) alongChrom(eset, "1", newChrom, xlim = c(87511280,127717880), xloc="equispaced", plotFormat="local", col=cols, type="p", pch=16) alongChrom(eset, "1", newChrom, xloc="equispaced", plotFormat="image")
########################################################################### # Spike data ###########################################################################
load("expression.rda") e X<-exprs(e) Y<-c(1,1,1,2,2,2) gnames<-dimnames(X)[[1]]
spike<-read.table("phenodata.txt") spike<-spike[,order(as.numeric(spike[1,]))] gspike<-names(spike) gspike<-sub("\\.","_",gspike) gspike<-sub("X","",gspike) spike<-paste(spike[2,]/spike[1,],paste(spike[1,],spike[2,],sep=":"))
########################################################################### # Genefilter
med<-function(M=250, na.rm=TRUE) { function(x){ median(x, na.rm=na.rm) >= M } }
f1<-med(M=8) ffun <- filterfun(f1) which <- genefilter(X, ffun) sum(which)
# N.B. SLOW f2<-ttest(Y) ffun <- filterfun(f1,f2) which <- genefilter(X, ffun) sum(which)
########################################################################### # t-statistics # N.B. SLOW
resSpike<-t(apply(X, 1, function(z) { res<-t.test(z ~ Y) c(res$estimate,-res$statistic,res$p.value) })) resSpike<-data.frame(resSpike) ranks<-rank(resSpike[,4]) resSpike<-cbind(resSpike,ranks) dimnames(resSpike)[[2]]<-c("Avg. G","Avg. H","t-statistic","Nominal p-value","Rank")
########################################################################### # Plots
par(mfrow=c(1,1)) A<-apply(resSpike[,1:2],1,mean) M<-resSpike[,"Avg. H"]-resSpike[,"Avg. G"] tstat<-resSpike[,"t-statistic"]
names(tstat)<-names(M)<-names(A) cols<-rep(2,length(gspike)) cols[1]<-3 cols[14]<-4 pchs<-as.character(1:14)
plot(A,M,pch=".") points(A[gspike],M[gspike],col=cols,pch=16,cex=seq(0.5,1.5,length=14)) #legend(12,-2,labs,col=cols,pch=16,cex=seq(0.5,1.5,length=14)) legend(12,-2,labs,col=cols,pch=16) abline(h=1,lwd=2,col="gray")
plot(M,abs(tstat),pch=".") points(M[gspike],abs(tstat)[gspike],col=cols,pch=16,cex=seq(0.5,1.5,length=14))
plot(M,abs(tstat),pch=".",xlim=c(-2,2),ylim=c(-0,20)) points(M[gspike],abs(tstat)[gspike],col=cols,pch=16,cex=seq(0.5,1.5,length=14))
########################################################################### # HTML report for only the 14 spiked in genes
resSpikeSub<-cbind(round(resSpike[gspike,],2),spike)[order(resSpike[gspike,"Rank"]),]
llid<-multiget(gspike,hgu95aLOCUSID) map<-multiget(gspike,hgu95aMAP) symb<-multiget(gspike,hgu95aSYMBOL)
resSpikeSub<-cbind(resSpikeSub,gspike,unlist(symb),unlist(map)) names(resSpikeSub)[7:9]<-c("Affy ID","Gene symbol", "Chromosomal location") ll.htmlpage(llid, filename="Spike.html", title="HTML report for spiked in genes", othernames=resSpikeSub, table.head=c("LocusID",names(resSpikeSub)), table.center = TRUE)
########################################################################### # Golub et al. data ########################################################################### # Pre-processing: filter genes based on intensity
data(golubTrain)
X<-exprs(golubTrain) X[X<100]<-100 X[X>16000]<-16000
mmfilt <- function(r=5, d=500, na.rm=TRUE) { function(x) { minval <- min(x, na.rm=na.rm) maxval <- max(x, na.rm=na.rm) (maxval/minval > r) && (maxval-minval > d) } } mmfun <- mmfilt() ffun <- filterfun(mmfun) sub <- genefilter(X, ffun ) sum(sub) ## Should get 3051 X <- X[sub,] X <- log2(X) golubTrainSub<-golubTrain[sub,] slot(golubTrainSub,"exprs")<-X
Y <- golubTrainSub$ALL.AML
Y<-paste(golubTrain$ALL.AML,golubTrain$T.B.cell) Y<-sub("NA","",Y) table(Y)
gnames<-dimnames(X)[[1]]
########################################################################### # Distances ########################################################################### par(mfrow=c(1,1))
r<-cor(X) dimnames(r)<-list(as.vector(Y),as.vector(Y)) d<-1-r
plotcorr(r,main="Leukemia data: Correlation matrix for 38 mRNA samples\n All 3,051 genes") plotcorr(r,numbers=TRUE,main="Leukemia data: Correlation matrix for 38 mRNA samples\n All 3,051 genes") levelplot(r,col.region=heat.colors(50),main="Leukemia data: Correlation matrix for 38 mRNA samples\n All 3,051 genes")
########################################################################### # MDS ###########################################################################
mds<- cmdscale(d, k=2, eig=TRUE) plot(mds$points, type="n", xlab="", ylab="", main="MDS for ALL AML data, correlation matrix, G=3,051 genes, k=2") text(mds$points[,1],mds$points[,2],Y, col=codes(factor(Y))+1, cex=0.8)
mds<- cmdscale(d, k=3, eig=TRUE) pairs(mds$points, main="MDS for ALL AML data, correlation matrix, G=3,051 genes, k=3", pch=c("B","T","M")[codes(factor(Y))], col = codes(factor(Y))+1)
########################################################################### # Cluster analysis ########################################################################### # hclust
######################### # Actual data
hc <- hclust(as.dist(d), method="average") plot(hc, labels=golubTrainSub$B.T.cell, main="Hierarchical clustering dendrogram for ALL AML data",sub="Average linkage, correlation matrix, G=3,051 genes")
cor(cophenetic(hc),as.dist(d))
######################### # Randomized data
set.seed(-1) d0<-1-cor(t(apply(X,1,sample))) dimnames(d0)<-list(as.vector(Y),as.vector(Y)) hc0<- hclust(as.dist(d0), method="average") plot(hc0, labels=golubTrainSub$B.T.cell, main="Hierarchical clustering dendrogram for randomized ALL AML data",sub="Average linkage, correlation matrix, G=3,051 genes")
cor(cophenetic(hc0),as.dist(d0))
########################################################################### # PAM
# k=2 set.seed(12345) pm <- pam(as.dist(d), k=2, diss=TRUE)
clusplot(d, pm$clustering, diss=TRUE, labels=3, col.p=1, col.txt=codes(factor(Y))+1, main="Bivariate cluster plot for ALL AML data\n Correlation matrix, K=2, G=3,051 genes")
plot(pm,which.plots=1) plot(pm,which.plots=2)
table(pm$clustering, Y)
######################### # k=3
pm <- pam(as.dist(d), k=3, diss=TRUE)
clusplot(d, pm$clustering, diss=TRUE, labels=3, col.p=1, col.txt=codes(factor(Y))+1, main="Bivariate cluster plot for ALL AML data\n Correlation matrix, K=3, G=3,051 genes")
plot(pm,which.plots=1) plot(pm,which.plots=2)
table(pm$clustering, Y)
########################################################################### # New functions ###########################################################################
pm.getabst <- function(geneids, basename) { pmenvN <- paste(basename, "PMID", sep="") do.call("require", list(package=basename)) || stop(paste("Library", ##FIXME: use this after 1.7.0 is released ## require(basename, character.only=TRUE) || stop(paste("Library", basename,"is unavailable")) if( !exists(pmenvN, mode = "environment") ) stop("could not access PubMed ids for this data") pmenv <- get(pmenvN) pmids <- multiget(geneids, env=pmenv) numids <- length(geneids) rval <- vector("list", length=numids) names(rval) <- geneids for(i in 1:numids) { pm <- pmids[[i]] if( is.na(pm) ) rval[[i]] <- NA else { absts <- pubmed(pm) a <- xmlRoot(absts) numAbst <- length(xmlChildren(a)) absts <- vector("list", length=numAbst) for (j in 1:numAbst) absts[[j]] <- buildPubMedAbst(a[[j]]) rval[[i]] <- absts } } rval }
###########################################################################