lab5.R
##load up required libraries
library(cluster)
library(Biobase)
library(annotate)
library(genefilter)
library(golubEsets)
############################
## data processing
###########################
data(golubTrain)
golubTrain
data(golubTest)
golubTest
LS<-exprs(golubTrain)
cl<-golubTrain$ALL.AML
TS<-exprs(golubTest)
clts<-golubTest$ALL.AML
##mmfilt is in the library
LS[LS<100]<-100
TS[TS<100]<-100
LS[LS>16000]<-16000
TS[TS>16000]<-16000
mmfilt <- function(r=5, d=500, na.rm=TRUE) {
function(x) {
minval <- min(x, na.rm=na.rm)
maxval <- max(x, na.rm=na.rm)
(maxval/minval > 5) && (maxval-minval > 500)
}
}
mmfun <- mmfilt()
ffun <- filterfun(mmfun)
good <- genefilter(cbind(LS, TS), ffun )
sum(good) ##I get 3571
LSsub <- log10(LS[good,])
TSsub <- log10(TS[good,])
LTsub <- cbind(LSsub, TSsub)
##############################
## Annotate : optional
##############################
##probably want to do a ttest filter here
##but we'll leave that to you
ll<- read.annotation("hgu68ll") #locus link
gname <- row.names(LSsub)[50]
llname<-get(gname, env=ll)
locuslink(llname) ##open your browser to the right thing
chroms <- read.annotation("hgu68Chrom")
whichC <- multiget(row.names(LSsub[1:100,]), env=chroms, iffail=NA)
cc <-as.numeric(unlist(whichC))
hist(cc)
###################################
## Clustering
#################################
gf <- gapFilter(550, 800, .1)
ff <- filterfun(gf)
xx <- 10^LSsub ##to get back to original units
good <- genefilter(xx, gf)
sum(good) # should be 1093
LS2 <- LSsub[good,]
library(mva)
clust.res<-hclust(as.dist(1-cor(LS2)), method = "complete")
plot.hclust(clust.res, labels=cl,hang = 0.1)
clust.res<-hclust(dist(t(LS2)), method = "average")
plot.hclust(clust.res, labels=cl,hang = 0.1)
pamLS <- pam(as.dist(1-cor(LS2)), 3, diss=TRUE)
plot(pamLS)
#######################################
## classification
##################################
library(class)
ALL <- c(as.character(golubTrain$ALL),
as.character(golubTest$ALL))
BT <- c(as.character(golubTrain$T.B),
as.character(golubTest$T.B))
BTs <- ifelse(BT=="B-cell", "B", "T")
BTs <- ifelse(BT=="T-cell", "T", BTs)
BTs <- ifelse(BT=="NA", "", BTs)
newALL <- factor(paste(ALL, BTs, sep=""))
nALL.train <- newALL[1:38]
nALL.test <-newALL[39:72]
##now do an Anova filter
af <- Anova(nALL.train, p=0.0000001)
ffA <- filterfun(af)
wh.ALL <- genefilter(LSsub, af) #use only training data
sum(wh.ALL) #how many
##how does it work on the test set
pred.test <- knn(t(LSsub[wh.ALL,]), t(TSsub[wh.ALL,]), nALL.train, k=5)
table(pred.test, nALL.test)
##how does it work on the training set
pred.tr <- knn(t(LSsub[wh.ALL,]), t(LSsub[wh.ALL,]), nALL.train, k=5)
table(pred.tr, nALL.train)
##
##use knn and cross-validation to choose the best k!
knn.cvk<-function(LS, cl, k=1:5, l=0, prob=FALSE, use.all=TRUE)
{
classes<-unique(cl)
cv.err<-rep(0,length(k))
cl.pred<-matrix(NA, nrow(LS), length(k))
for(j in (1:length(k)))
{
cl.pred[,j]<-knn.cv(LS, cl, k[j], l, prob, use.all)
cv.err[j]<-sum(cl!=cl.pred[,j])
}
k0<-k[which.min(cv.err)]
return(k=k0,pred=classes[cl.pred[,which.min(cv.err)]])
}
whichone <- knn.cvk(t(LSsub[wh.ALL,]), nALL.train)
##Tree based classifiers
library(rpart)
dorpart <- function(LS, TS, cl) {
LS <- as.data.frame(LS)
TS <- as.data.frame(TS)
dimnames(TS)[[2]] <- dimnames(LS)[[2]] <- paste("V",
1:ncol(LS), sep = "")
fit <- rpart(factor(cl) ~ ., data = LS)
predict.rpart(fit, TS, type = "class")
}
all.rp <- dorpart(t(LSsub[wh.ALL,]), t(TSsub[wh.ALL,]), nALL.train)
table(all.rp, nALL.test)
##Discriminant Analysis
library(MASS)
dolda <- function(LS, TS, cl, prior, pmethod="plug-in") {
ndata <- data.frame(LS)
ndata$cl <- cl
if( missing(prior) )
mn <- lda(cl~., data=ndata)
else
mn <- lda(cl~., data=ndata, prior=prior)
pdata<-data.frame(TS)
preds <- predict(mn, pdata, method=pmethod)
rval <- list(lda = mn, preds=preds)
class(rval) <- "mylda"
rval
}
lda.test <- dolda(t(LSsub[wh.ALL,]), t(TSsub[wh.ALL,]), nALL.train)
table(lda.test$preds$class, nALL.test)
##how does it work on the training set
##not very meaningful
lda.pred.tr <- dolda(t(LSsub[wh.ALL,]), t(LSsub[wh.ALL,]), nALL.train)
table(lda.pred.tr$preds$class, nALL.train)
library(cluster)
library(Biobase)
library(annotate)
library(genefilter)
library(golubEsets)
############################
## data processing
###########################
data(golubTrain)
golubTrain
data(golubTest)
golubTest
LS<-exprs(golubTrain)
cl<-golubTrain$ALL.AML
TS<-exprs(golubTest)
clts<-golubTest$ALL.AML
##mmfilt is in the library
LS[LS<100]<-100
TS[TS<100]<-100
LS[LS>16000]<-16000
TS[TS>16000]<-16000
mmfilt <- function(r=5, d=500, na.rm=TRUE) {
function(x) {
minval <- min(x, na.rm=na.rm)
maxval <- max(x, na.rm=na.rm)
(maxval/minval > 5) && (maxval-minval > 500)
}
}
mmfun <- mmfilt()
ffun <- filterfun(mmfun)
good <- genefilter(cbind(LS, TS), ffun )
sum(good) ##I get 3571
LSsub <- log10(LS[good,])
TSsub <- log10(TS[good,])
LTsub <- cbind(LSsub, TSsub)
##############################
## Annotate : optional
##############################
##probably want to do a ttest filter here
##but we'll leave that to you
ll<- read.annotation("hgu68ll") #locus link
gname <- row.names(LSsub)[50]
llname<-get(gname, env=ll)
locuslink(llname) ##open your browser to the right thing
chroms <- read.annotation("hgu68Chrom")
whichC <- multiget(row.names(LSsub[1:100,]), env=chroms, iffail=NA)
cc <-as.numeric(unlist(whichC))
hist(cc)
###################################
## Clustering
#################################
gf <- gapFilter(550, 800, .1)
ff <- filterfun(gf)
xx <- 10^LSsub ##to get back to original units
good <- genefilter(xx, gf)
sum(good) # should be 1093
LS2 <- LSsub[good,]
library(mva)
clust.res<-hclust(as.dist(1-cor(LS2)), method = "complete")
plot.hclust(clust.res, labels=cl,hang = 0.1)
clust.res<-hclust(dist(t(LS2)), method = "average")
plot.hclust(clust.res, labels=cl,hang = 0.1)
pamLS <- pam(as.dist(1-cor(LS2)), 3, diss=TRUE)
plot(pamLS)
#######################################
## classification
##################################
library(class)
ALL <- c(as.character(golubTrain$ALL),
as.character(golubTest$ALL))
BT <- c(as.character(golubTrain$T.B),
as.character(golubTest$T.B))
BTs <- ifelse(BT=="B-cell", "B", "T")
BTs <- ifelse(BT=="T-cell", "T", BTs)
BTs <- ifelse(BT=="NA", "", BTs)
newALL <- factor(paste(ALL, BTs, sep=""))
nALL.train <- newALL[1:38]
nALL.test <-newALL[39:72]
##now do an Anova filter
af <- Anova(nALL.train, p=0.0000001)
ffA <- filterfun(af)
wh.ALL <- genefilter(LSsub, af) #use only training data
sum(wh.ALL) #how many
##how does it work on the test set
pred.test <- knn(t(LSsub[wh.ALL,]), t(TSsub[wh.ALL,]), nALL.train, k=5)
table(pred.test, nALL.test)
##how does it work on the training set
pred.tr <- knn(t(LSsub[wh.ALL,]), t(LSsub[wh.ALL,]), nALL.train, k=5)
table(pred.tr, nALL.train)
##
##use knn and cross-validation to choose the best k!
knn.cvk<-function(LS, cl, k=1:5, l=0, prob=FALSE, use.all=TRUE)
{
classes<-unique(cl)
cv.err<-rep(0,length(k))
cl.pred<-matrix(NA, nrow(LS), length(k))
for(j in (1:length(k)))
{
cl.pred[,j]<-knn.cv(LS, cl, k[j], l, prob, use.all)
cv.err[j]<-sum(cl!=cl.pred[,j])
}
k0<-k[which.min(cv.err)]
return(k=k0,pred=classes[cl.pred[,which.min(cv.err)]])
}
whichone <- knn.cvk(t(LSsub[wh.ALL,]), nALL.train)
##Tree based classifiers
library(rpart)
dorpart <- function(LS, TS, cl) {
LS <- as.data.frame(LS)
TS <- as.data.frame(TS)
dimnames(TS)[[2]] <- dimnames(LS)[[2]] <- paste("V",
1:ncol(LS), sep = "")
fit <- rpart(factor(cl) ~ ., data = LS)
predict.rpart(fit, TS, type = "class")
}
all.rp <- dorpart(t(LSsub[wh.ALL,]), t(TSsub[wh.ALL,]), nALL.train)
table(all.rp, nALL.test)
##Discriminant Analysis
library(MASS)
dolda <- function(LS, TS, cl, prior, pmethod="plug-in") {
ndata <- data.frame(LS)
ndata$cl <- cl
if( missing(prior) )
mn <- lda(cl~., data=ndata)
else
mn <- lda(cl~., data=ndata, prior=prior)
pdata<-data.frame(TS)
preds <- predict(mn, pdata, method=pmethod)
rval <- list(lda = mn, preds=preds)
class(rval) <- "mylda"
rval
}
lda.test <- dolda(t(LSsub[wh.ALL,]), t(TSsub[wh.ALL,]), nALL.train)
table(lda.test$preds$class, nALL.test)
##how does it work on the training set
##not very meaningful
lda.pred.tr <- dolda(t(LSsub[wh.ALL,]), t(LSsub[wh.ALL,]), nALL.train)
table(lda.pred.tr$preds$class, nALL.train)