lab5.R
##load up required libraries
library(cluster) library(Biobase) library(annotate) library(genefilter) library(golubEsets)
############################ ## data processing ###########################
data(golubTrain) golubTrain data(golubTest) golubTest
LS<-exprs(golubTrain) cl<-golubTrain$ALL.AML TS<-exprs(golubTest) clts<-golubTest$ALL.AML
##mmfilt is in the library
LS[LS<100]<-100 TS[TS<100]<-100 LS[LS>16000]<-16000 TS[TS>16000]<-16000
mmfilt <- function(r=5, d=500, na.rm=TRUE) { function(x) { minval <- min(x, na.rm=na.rm) maxval <- max(x, na.rm=na.rm) (maxval/minval > 5) && (maxval-minval > 500) } }
mmfun <- mmfilt()
ffun <- filterfun(mmfun)
good <- genefilter(cbind(LS, TS), ffun )
sum(good) ##I get 3571
LSsub <- log10(LS[good,]) TSsub <- log10(TS[good,]) LTsub <- cbind(LSsub, TSsub)
############################## ## Annotate : optional ##############################
##probably want to do a ttest filter here ##but we'll leave that to you
ll<- read.annotation("hgu68ll") #locus link gname <- row.names(LSsub)[50] llname<-get(gname, env=ll) locuslink(llname) ##open your browser to the right thing
chroms <- read.annotation("hgu68Chrom")
whichC <- multiget(row.names(LSsub[1:100,]), env=chroms, iffail=NA) cc <-as.numeric(unlist(whichC)) hist(cc)
################################### ## Clustering #################################
gf <- gapFilter(550, 800, .1) ff <- filterfun(gf) xx <- 10^LSsub ##to get back to original units good <- genefilter(xx, gf) sum(good) # should be 1093
LS2 <- LSsub[good,]
library(mva) clust.res<-hclust(as.dist(1-cor(LS2)), method = "complete") plot.hclust(clust.res, labels=cl,hang = 0.1)
clust.res<-hclust(dist(t(LS2)), method = "average") plot.hclust(clust.res, labels=cl,hang = 0.1)
pamLS <- pam(as.dist(1-cor(LS2)), 3, diss=TRUE) plot(pamLS)
####################################### ## classification ##################################
library(class)
ALL <- c(as.character(golubTrain$ALL), as.character(golubTest$ALL)) BT <- c(as.character(golubTrain$T.B), as.character(golubTest$T.B)) BTs <- ifelse(BT=="B-cell", "B", "T") BTs <- ifelse(BT=="T-cell", "T", BTs) BTs <- ifelse(BT=="NA", "", BTs) newALL <- factor(paste(ALL, BTs, sep="")) nALL.train <- newALL[1:38] nALL.test <-newALL[39:72]
##now do an Anova filter af <- Anova(nALL.train, p=0.0000001) ffA <- filterfun(af) wh.ALL <- genefilter(LSsub, af) #use only training data sum(wh.ALL) #how many
##how does it work on the test set pred.test <- knn(t(LSsub[wh.ALL,]), t(TSsub[wh.ALL,]), nALL.train, k=5) table(pred.test, nALL.test)
##how does it work on the training set pred.tr <- knn(t(LSsub[wh.ALL,]), t(LSsub[wh.ALL,]), nALL.train, k=5) table(pred.tr, nALL.train)
## ##use knn and cross-validation to choose the best k!
knn.cvk<-function(LS, cl, k=1:5, l=0, prob=FALSE, use.all=TRUE) { classes<-unique(cl) cv.err<-rep(0,length(k)) cl.pred<-matrix(NA, nrow(LS), length(k)) for(j in (1:length(k))) { cl.pred[,j]<-knn.cv(LS, cl, k[j], l, prob, use.all) cv.err[j]<-sum(cl!=cl.pred[,j]) } k0<-k[which.min(cv.err)] return(k=k0,pred=classes[cl.pred[,which.min(cv.err)]]) }
whichone <- knn.cvk(t(LSsub[wh.ALL,]), nALL.train)
##Tree based classifiers
library(rpart)
dorpart <- function(LS, TS, cl) { LS <- as.data.frame(LS) TS <- as.data.frame(TS) dimnames(TS)[[2]] <- dimnames(LS)[[2]] <- paste("V", 1:ncol(LS), sep = "") fit <- rpart(factor(cl) ~ ., data = LS) predict.rpart(fit, TS, type = "class") }
all.rp <- dorpart(t(LSsub[wh.ALL,]), t(TSsub[wh.ALL,]), nALL.train) table(all.rp, nALL.test)
##Discriminant Analysis
library(MASS)
dolda <- function(LS, TS, cl, prior, pmethod="plug-in") { ndata <- data.frame(LS) ndata$cl <- cl if( missing(prior) ) mn <- lda(cl~., data=ndata) else mn <- lda(cl~., data=ndata, prior=prior) pdata<-data.frame(TS) preds <- predict(mn, pdata, method=pmethod) rval <- list(lda = mn, preds=preds) class(rval) <- "mylda" rval }
lda.test <- dolda(t(LSsub[wh.ALL,]), t(TSsub[wh.ALL,]), nALL.train) table(lda.test$preds$class, nALL.test)
##how does it work on the training set ##not very meaningful lda.pred.tr <- dolda(t(LSsub[wh.ALL,]), t(LSsub[wh.ALL,]), nALL.train) table(lda.pred.tr$preds$class, nALL.train)