% exercises.Rnw
\documentclass[11pt,a4paper]{article}
\usepackage{times}
\usepackage{a4wide}
\newcommand{\func}[1]{{\tt#1}}
\newcommand{\file}[1]{{\tt#1}}
\newcommand{\nrsamptot}{8 }
\newcommand{\nrsampgrp}{4 }
\newcommand{\nrmeth}{3 }
\begin{document}
%%----------------------environment: exercise-----------------------
\setlength{\parindent}{0cm}
\newcounter{dumbo}
\newenvironment{exercise}{%
  \begin{list}{\textbf{\alph{dumbo}. }}{%
    \usecounter{dumbo}
    \setlength{\topsep}{0cm}
    \setlength{\parsep}{0cm}     %% vertical separation between paragraphs
    \setlength{\itemsep}{0cm}    %% additional vertical separation between list items
    \setlength{\labelsep}{0cm}
    \setlength{\leftmargin}{5mm} %% left indent
    \setlength{\rightmargin}{0mm}}}%
  {\end{list}}
%%----------------------environment: exercises----------------------
\setlength{\parindent}{0cm}
\newcounter{bambi}
\newenvironment{exercises}{%
  \begin{list}{\textbf{\arabic{bambi}.) }}{%
    \usecounter{bambi}
    \setlength{\topsep}{0cm}
    \setlength{\itemsep}{1ex}    %% additional vertical separation between list items
    \setlength{\labelsep}{0cm}
    \setlength{\leftmargin}{0mm} %% left indent
    \setlength{\rightmargin}{0mm}}}%
  {\end{list}}
%%------------------------------------------------------------------------
\begin{center}
{\bf Course in Practical Analysis of Microarray Data}\\
Introduction to R\\
Computational Exercises\\
September 2002\\
Wolfgang Huber\\
\end{center}
\begin{exercises}
%%--------------------Ex.0------------------------------
\item {\bf Installing R.} Check whether R is installed on your
  computer. If not, download it from cran.r-project.org and install it.
%%--------------------read.table one file------------
\item {\bf Reading data files.} In the folder \file{data/alizadeh}, you
  find a file \file{lc7b048rex.DAT}.
\begin{exercise}
\item Open it in a text editor.
\item Read it into a data frame (use the function \func{read.delim}).
\item Look at the contents of the table (use the functions \func{dim},
  \func{colnames}, and subsetting).
\end{exercise}
<<>>=
datapath = "../data/alizadeh"   ## adjust this path if necessary
x = read.delim(file.path(datapath, "lc7b048rex.DAT"))
dim(x)
colnames(x)
x[1:5, ]
@
%%-------------------histogram and scatterplot -----------------
\item {\bf Simple plots.}
\begin{exercise}
\item Make a histogram of the values in the column CH1I.
\item Produce scatterplots of CH1I versus CH2I, once with linear axis
  scaling, once with double-logarithmic scaling.
\item Find out how to decorate the plot with your own axis labels and
  plot title, and how to change the plot symbols.
\item Save the plots as PDF and as Windows metafiles. Copy and paste
  them into MS-Office applications (see the sketch after the code below).
\end{exercise}
<<>>=
hist(x$CH1I)
plot(x$CH1I, x$CH2I)
plot(x$CH1I, x$CH2I, log="xy")
plot(x$CH1I, x$CH2I, log="xy", main="lc7b048", xlab="green", ylab="red", pch=".")
plot(x$CH1I, x$CH1B, log="xy", xlab="foreground", ylab="background", pch=".")
@
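For part d, a minimal sketch of how a plot can be written to files;
\func{pdf} works on all platforms, while \func{win.metafile} is only
available under Windows. The file names are arbitrary examples.
<<eval=FALSE>>=
pdf("scatter.pdf", width=6, height=6)    ## open a PDF graphics device
plot(x$CH1I, x$CH2I, log="xy", pch=".")
dev.off()                                ## close the device and write the file
win.metafile("scatter.wmf")              ## Windows only
plot(x$CH1I, x$CH2I, log="xy", pch=".")
dev.off()
@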
%%-------------------spatial distribution -----------------
\item {\bf Spatial distribution.} {\it This exercise is rather
  difficult and not necessary for the subsequent analyses; it may
  (should) be skipped by novices.}
  The spots on these arrays are
  arranged in a square grid of 96 rows and 96 columns. However,
  the order of the 9216 rows in the file does not simply reflect the
  spatial arrangement row-by-row or column-by-column. In order to
  display the spatial distribution of measured foreground and
  background intensities, we first need to rearrange the data.  The
  relationship between the $x$- and $y$-coordinates, as numbers from
  $1,\ldots,96$, and the data in the files is given by: 
<<>>=
library(pixmap)
## ASSUMPTION: the .DAT file provides GRID, ROW, and COL columns, and the
## 16 grids are laid out as a 4 x 4 pattern of 24 x 24 spots each; check
## this against the actual array layout before relying on the picture.
spotx = (x$GRID - 1) %%  4 * 24 + x$COL
spoty = (x$GRID - 1) %/% 4 * 24 + x$ROW
imgG = imgR = matrix(NA, nrow=96, ncol=96)
imgG[cbind(spoty, spotx)] = x$CH1I^0.25
imgR[cbind(spoty, spotx)] = x$CH2I^0.25
## red and green channels of an RGB image, blue left empty
z = array(c(imgR/max(imgR), imgG/max(imgG), 0*imgG), dim=c(96, 96, 3))
plot(pixmapRGB(z))
@
\begin{exercise}
\item Read the help file for \func{pixmap} and understand 
what the above code is doing.
\item Try transformations other than the fourth root, such as the
  identity, the square root, and the logarithm, and observe their
  impact on the visual appearance of the image (a small sketch follows
  below).
\end{exercise}
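For part b, a small sketch that reuses the coordinate vectors
\func{spotx} and \func{spoty} from the chunk above (which were
themselves based on an assumed grid layout) and shows a few
alternative transformations side by side:
<<eval=FALSE>>=
par(mfrow=c(2, 2))
transforms = list(identity, sqrt, function(v) log(1 + v), function(v) v^0.25)
for (f in transforms) {
  z   = f(x$CH1I)
  img = matrix(NA, nrow=96, ncol=96)
  img[cbind(spoty, spotx)] = (z - min(z)) / (max(z) - min(z))  ## scale to [0,1]
  plot(pixmapGrey(img))
}
@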
%%-------------------normalization vsn -----------------
\item {\bf Calibration and variance stabilization.} Download the
  package VSN from\linebreak http://www.dkfz.de/abt0840/whuber and
  install it. Subtract the background intensities CH1B, CH2B from the
  foreground intensities CH1I, CH2I. Use the function \func{vsn} to
  calibrate and transform the data, and plot the result.
<<>>=
library(vsn)
nv = vsn(cbind(x$CH1I - x$CH1B, x$CH2I - x$CH2B))
save(nv, file="nv.RData")
@
<<>>=
load("nv.RData")
plot(nv)
@
%%--------------------read.table all files------------
\item {\bf Reading multiple data files.} In the folder
  \file{data/alizadeh}, you find a file \file{samples.txt}.
\begin{exercise}
\item Read it into a data frame (use the function \func{read.delim}
  with the \func{as.is=T} option).
\item Create 4 matrices of dimensions $9216 \times \nrsamptot$ that
  contain, respectively, the CH1I, CH1B, CH2I, and CH2B intensities of
  the 9216 spots on the \nrsamptot slides whose filenames are given in
  \file{samples.txt}.
\item Save the matrices into an XDR file.
\item Note: the Bioconductor packages marrayInput and affy offer more
  comfortable methods for reading and managing data from a series of
  microarrays (see the sketch below).
\end{exercise}
<<>>=
samples = read.delim(file.path(datapath, "samples.txt"), as.is=T)
Gf = Gb = Rf = Rb = matrix(NA, nrow=9216, ncol=nrow(samples))
for (i in 1:nrow(samples)) {
  ## assumption: the column of samples.txt that holds the file name stem
  ## is called 'filename'; adjust this to the actual column name
  filename = paste(samples$filename[i], "rex.DAT", sep="")
  dat = read.delim(file.path(datapath, filename))
  Gf[,i] = dat$CH1I
  Gb[,i] = dat$CH1B
  Rf[,i] = dat$CH2I
  Rb[,i] = dat$CH2B
}
save(Gf, Gb, Rf, Rb, file="intensities.RData")
@
<<>>=
load("intensities.RData")
nrspots   = nrow(Gf)
nrsamples = ncol(Gf)
stopifnot(nrspots==9216 && nrsamples==nrow(samples))
@
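As an aside to part d, a sketch of how the same data might be read in
one step with \func{read.marrayRaw} from the marrayInput package. The
column-name arguments and the construction of the file name vector are
assumptions and should be checked against the package documentation:
<<eval=FALSE>>=
library(marrayInput)
fnames = paste(samples$filename, "rex.DAT", sep="")   ## assumed column name
mraw   = read.marrayRaw(fnames, path=datapath,
                        name.Gf="CH1I", name.Gb="CH1B",
                        name.Rf="CH2I", name.Rb="CH2B")
@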
%%--------------------install Biobase, marrayNorm
\item {\bf Different normalization methods.} In the following, we are
  going to identify genes that appear to be differentially transcribed
  between the 4 CLL samples and the 4 DLCL samples. For this, we will
  apply a number of different normalization strategies to the data and
  compare their results.
\begin{exercise}
\item Download the packages Biobase, marrayClasses, marrayNorm, and
  multtest from\linebreak http://www.bioconductor.org and install them.
\item Create a 3D array of dimensions $9216 \times \nrsamptot \times
  \nrmeth$ that contains, for all spots, the value of $M$ (that is, the
  log-ratio or the generalized log-ratio) for the \nrsamptot slides and
  the following \nrmeth different normalization methods:
\begin{enumerate}
\item vsn (affine normalization and variance stabilization)
\item maNorm with global median location normalization
\item maNorm with intensity- or $A$-dependent location normalization
  using the `loess' smoother
\end{enumerate}
\item Save the array into an XDR file.
\end{exercise}
<<>>=
library(marrayClasses)
library(marrayNorm)
M = A = array(NA, dim=c(nrspots, nrsamples, 3))
## vsn
## green in columns 1:8, red in 9:16
nw     = vsn(cbind(Gf-Gb, Rf-Rb))  
M[,,1] = nw$hy[,9:16] - nw$hy[,1:8]
A[,,1] = nw$hy[,9:16] + nw$hy[,1:8]
## global median and loess
mar    = new("marrayRaw", maGf=Gf, maGb=Gb, maRf=Rf, maRb=Rb)
nm     = maNorm(mar, norm="median", echo=T)
nl     = maNorm(mar, norm="loess",  echo=T)
M[,,2] = nm@maM
A[,,2] = nm@maA
M[,,3] = nl@maM
A[,,3] = nl@maA
save(M, A, file="MA.RData")
@
<<>>=
load("MA.RData")
nrmethods = dim(M)[3]
@
%%--------Compare different norm methods by scatterplot --------------------------
\item {\bf Qualitatively compare the results.} Look at scatterplots of
  the values of $M$ from the same slide, calculated with different
  normalization methods. Do the values generally agree? How do they
  differ?
<<>>=
plot(M[,4,1], M[,4,3], pch=".", xlab="vsn", ylab="loess", main=samples$name[4])
@
%%-------multtest----------------------
\item {\bf Testing for differential transcription.} Now we are ready
  to calculate test statistics and to select genes.  {\em Note:} The
  number of replicates (4 versus 4) that we are considering here is
  very small, and no solid conclusions about individual genes or
  individual samples should be drawn from it. The full data set
  contains many more chips. Here we restrict ourselves to a few of
  them in order to keep calculations simple and not too slow for the
  purpose of this course.
\begin{exercise}
\item Look at the function \func{t.test} from the package ctest (which is 
  part of the base libraries), and at \func{mt.teststat} from the package
  multtest. 
\item For each gene, and for each of the normalization methods,
  calculate the $t$-statistic for the CLL-to-DLCL class distinction.
  Store the result in a $9216 \times \nrmeth$ matrix. Which of the two
  functions \func{t.test} and \func{mt.teststat} calculates faster (a
  timing sketch follows the code below)? Look at the histograms of
  $t$-values that they produce; you may find extreme values like 3e38
  in there. Where do they come from?
\item How do the $t$-statistics agree between the different normalization 
  methods? 
\end{exercise}
<<>>=
library(multtest)
## class labels for the two groups; the assumption here is that the first
## four samples are CLL and the last four DLCL -- check against samples.txt
cl = rep(0:1, each=4)
t  = matrix(NA, nrow=nrspots, ncol=nrmethods)
for (meth in 1:nrmethods)
  t[, meth] = mt.teststat(M[,,meth], classlabel=cl, test="t")
par(mfrow=c(2,2))
for (j in 2:nrmethods) {        
  for (i in 1:(j-1)) {
    plot(t[,i], t[,j], pch=".", xlab=paste(i), ylab=paste(j)) 
    lines(c(-30,30), c(-30,30), col="red")
  }
}
## alternatively: use splom from library(lattice)
@
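For the timing comparison in part b, a rough sketch; it assumes the
class label vector \func{cl} from the chunk above, and the gene-by-gene
\func{t.test} loop is only meant for illustration:
<<eval=FALSE>>=
## one t.test call per clone (slow)
system.time(t1 <- apply(M[,,1], 1, function(v)
  t.test(v[cl==0], v[cl==1])$statistic))
## all clones at once with mt.teststat (fast)
system.time(t2 <- mt.teststat(M[,,1], classlabel=cl, test="t"))
hist(t2, breaks=50)
@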
%%-------Biology------------------------------
\item {\bf Biology.} Look at the 5 top genes, and using the
  information in the file\linebreak
  \file{.../data/alizadeh/scheme.htm}, find out the curated gene
  names.  Do they correspond to genes that are mentioned in the
  Alizadeh et al.\ paper?
<<>>=
csw = read.delim(file.path(datapath, "chip_spot_well.tab.txt"), as.is=T, header=F)
colnames(csw) = c("batch", "spot", "wellid")
csw[1:3,]
wcn = read.delim(file.path(datapath, "well_cloneid_name.tab.txt"), as.is=T, header=F)
colnames(wcn) = c("wellid", "cloneid", "name")
wcn[c(1,51,158),]
topspots = order(abs(t[,1]), decreasing=TRUE)[1:5]
wellid   = csw$wellid[ csw$batch=="lc7b" & csw$spot %in% topspots ]
wcn[wcn$wellid %in% wellid,]
@
%%--------write data out for Excel/SAM :(-----------
<<>>=
sel = (csw$batch=="lc7b")
ord = order(csw$spot[sel])
stopifnot(all(csw$spot[sel][ord]==1:nrspots))
wellid = csw$wellid[sel][ord]
getname = function(wellid) wcn$name[wcn$wellid==wellid]
pname   = sapply(wellid, getname)
pname[is.na(pname)] = "NoName"
cloneid = 1:nrspots
## rowsel = apply(M, 1, function(x) !any(is.na(x)))
rowsel = rep(T, nrspots)
for(meth in 1:nrmethods) {
  dat = M[,,meth]
  stopifnot(ncol(dat)==8)
  chksum = sum(dat)
  outdat = rbind( c("chksum", "", samples$name),
                  c( chksum , "", samples$sampleid),
                  c( ""     , "", paste(4:11 %/% 4)),
                  cbind(pname, cloneid, dat)[rowsel,] )
  write.table(outdat, quote=F, sep="\t",
              file=sprintf("for_sam_%d.txt", meth), row.names=F, col.names=F)
}
@
%%--------Thresholds------------------------------
\item {\bf $t$-thresholds.} Designate as {\it differentially expressed}
  those clones for which the absolute value of $t$ is larger than a
  certain threshold. What are the values of this threshold for our data
  if we want to obtain clone lists of length $10, 20, 50, 100, \ldots$?
<<>>=
## clone list lengths
cll       = c(10, 20, 50, 100, 200, 500, 1000)  
threshold = matrix(NA, nrow=length(cll), ncol=nrmethods)
rownames(threshold) = paste(cll)
colnames(threshold) = c("vsn", "global median","loess")
for(meth in 1:nrmethods) {
  st = sort(abs(t[, meth]), decreasing=TRUE)
  for(j in 1:length(cll)) {
    threshold[j, meth]  = st[cll[j]]
  }
}
threshold
@
%%-------Permutations----------------------------
\item {\bf Permutations.}
\begin{exercise}
\item How many ways are there to split a set of \nrsamptot objects into two
  groups of \nrsampgrp and \nrsampgrp (see the counting note below)?  Use the
  function \func{nchoosek} from the file \file{nchoosek.R} to generate a
  numerical representation of these splits.
\item Prepare a matrix with \nrsamptot rows, corresponding to the \nrsamptot samples, and 
as many columns as there are splits. Set the matrix elements to 0 and 
1, such that each column of the matrix represents a split.
\end{exercise}
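As a counting check for part a: there are ${8 \choose 4} = 70$ ways to
choose the members of one group, but since swapping the two groups gives
the same split, there are only $70/2 = 35$ distinct splits. Fixing one
sample (say, sample~8) in one group and choosing its 3 companions from
the remaining 7 samples gives the same number, ${7 \choose 3} = 35$,
which is why the code below uses \func{nchoosek(7, 3)}. In R:
<<>>=
choose(8, 4)      ## 70 ordered group assignments
choose(8, 4) / 2  ## 35 distinct splits
choose(7, 3)      ## also 35
@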
<<>>=
source("nchoosek.R")
nck = nchoosek(7, 3)
nck
classlabel = matrix(0, nrow=nrsamples, ncol=ncol(nck))
for(p in 1:ncol(nck)) {
   classlabel[      8, p] = 1
   classlabel[nck[,p], p] = 1
}
classlabel
@
%%-------FDR------------------------------
\item {\bf False discovery rate (FDR).}  Now we want to apply these
  permutations to the data to estimate the FDR. Do the following for
  each normalization method:
\begin{exercise}
\item For each of the splits, calculate the corresponding $t$-statistics for all genes.
\item For each of the splits, and for each of the above choices for
  clone list lengths and thresholds, calculate the number of clones
  that have an absolute $t$-value greater than or equal to the threshold.
\item Calculate the median of these numbers across the splits.
  Divide this by the clone list length to obtain an estimate of the FDR.
\end{exercise}
<<>>=
## calct computes, for one split (a 0/1 class label vector) and one data
## matrix, the t-statistics of all clones; using mt.teststat here is an
## assumption consistent with the earlier chunks
calct = function(cl, dat) mt.teststat(dat, classlabel=cl, test="t")
fdr = matrix(NA, nrow=length(cll), ncol=nrmethods)
rownames(fdr) = paste(cll)
colnames(fdr) = colnames(threshold)
for(meth in 1:nrmethods) {
  ## permt is a 9216 x 35 matrix of t-values, with
  ## rows corresponding to the clones
  ## and columns to the different splits
  permt = apply(classlabel, 2, calct, M[,,meth])
  for(j in 1:length(cll)) {
    ## pnrsel is a vector of length 35, with the
    ## number of clones that had t greater than or
    ## equal to the threshold
    pnrsel = apply(permt, 2, function(t) length(which(abs(t)>=threshold[j, meth])))
    fdr[j, meth] = median(pnrsel) / cll[j]
  }
}
@
<<>>=
save(fdr, file="fdr.RData")
@
<<>>=
load("fdr.RData")
@
<<>>=
plot(range(cll), range(fdr), type="n", log="x",
     xlab="No. of genes selected", ylab="Estimated FDR")
for(meth in 1:nrmethods) 
  lines(cll, fdr[,meth], type="b", pch=19, lty=meth)
legend(min(cll), max(fdr), colnames(fdr), lty=1:nrmethods)
@
%%---------------------affy-----------------------------
\item In the directory \file{../data/Shipp}, you find a number of
  Affymetrix CEL files and a corresponding CDF file.
\begin{exercise}
\item Using the package \file{affy}, load them into a {\em probe level object} (Plob). 
\item Look at the spatial distribution of intensities on the chips.
\item Normalize the data and calculate probe set summary values.
\end{exercise}
<<>>=
library(affy)
setwd("../data/Shipp")
dat = ReadAffy()
## show images of the probe data
image(dat)
## calculate probe set summaries
e = express(dat)
## scatterplot of first versus second sample
plot(exprs(e)[,1], exprs(e)[,2], pch=".")
@
%%--------------------install e1071 and rpam------------
\item {\bf Prepare for the classification exercises on Thursday.}
Check whether the packages \file{e1071} and \file{rpam} are installed on your 
computer. If not, install them. 
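A minimal sketch of such a check; it assumes the packages are available
from CRAN and that a CRAN mirror is reachable from your computer (the
package names are taken from the exercise text):
<<eval=FALSE>>=
for (p in c("e1071", "rpam")) {
  ## require() returns FALSE if the package cannot be loaded
  if (!require(p, character.only=TRUE))
    install.packages(p)
}
@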
\end{exercises}
\end{document}