\documentclass[11pt,a4paper]{article}
\usepackage{times}
\usepackage{a4wide}
\newcommand{\func}[1]{{\tt#1}}
\newcommand{\file}[1]{{\tt#1}}
\newcommand{\nrsamptot}{8 }
\newcommand{\nrsampgrp}{4 }
\newcommand{\nrmeth}{3 }
\begin{document}
%%----------------------environment: exercise-----------------------
\setlength{\parindent}{0cm}
\newcounter{dumbo}
\newenvironment{exercise}{%
  \begin{list}{\textbf{\alph{dumbo}. }}{%
    \usecounter{dumbo}
    \setlength{\topsep}{0cm}
    \setlength{\parsep}{0cm}     %% vertical separation between paragraphs
    \setlength{\itemsep}{0cm}    %% additional vertical separation between list items
    \setlength{\labelsep}{0cm}
    \setlength{\leftmargin}{5mm} %% left indent
    \setlength{\rightmargin}{0mm}}}%
  {\end{list}}
%%----------------------environment: exercises----------------------
\setlength{\parindent}{0cm}
\newcounter{bambi}
\newenvironment{exercises}{%
  \begin{list}{\textbf{\arabic{bambi}.) }}{%
    \usecounter{bambi}
    \setlength{\topsep}{0cm}
    \setlength{\itemsep}{1ex}    %% additional vertical separation between list items
    \setlength{\labelsep}{0cm}
    \setlength{\leftmargin}{0mm} %% left indent
    \setlength{\rightmargin}{0mm}}}%
  {\end{list}}
%%------------------------------------------------------------------------
\begin{center}
{\bf Course in Practical Analysis of Microarray Data}\\
Introduction to R\\
Computational Exercises\\
September 2002\\
Wolfgang Huber\\
\end{center}
\begin{exercises}
%%--------------------Ex.0------------------------------
\item {\bf Installing R.} Check whether R is installed on your computer.
If not, download it from cran.r-project.org and install it.
%%--------------------read.table one file------------
\item {\bf Reading data files.} In the folder \file{data/alizadeh}, you find
a file \file{lc7b048rex.DAT}.
\begin{exercise}
\item Open it in a text editor.
\item Read it into a data frame (use the function \func{read.delim}).
\item Look at the contents of the table (use the functions \func{dim},
\func{colnames}, and subsetting).
\end{exercise}
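A possible solution is sketched below; it assumes that the file has been
copied to a directory \file{../data/alizadeh} relative to the R working
directory (adjust \func{datapath} as needed).
<<readone>>=
datapath = "../data/alizadeh"   ## assumed location of the course data
x = read.delim(file.path(datapath, "lc7b048rex.DAT"))
dim(x)
colnames(x)
x[1:5, 1:4]                     ## look at a small subset of the table
@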
%%-------------------histogram and scatterplot -----------------
\item {\bf Simple plots.}
\begin{exercise}
\item Make a histogram of the values in the column CH1I.
\item Produce scatterplots of CH1I versus CH2I, once with linear axis
scaling and once with double-logarithmic scaling.
\item Find out how to decorate the plot with your own axis labels and plot
title, and how to change the plot symbols.
\item Save the plots as PDF and as Windows metafiles, and copy and paste
them into MS-Office applications (see the sketch after the following code
chunk).
\end{exercise}
<<plots,fig=TRUE>>=
hist(x$CH1I, breaks=50)          ## histogram of the CH1I values
plot(x$CH1I, x$CH2I, pch=".")    ## scatterplot with linear axes
plot(x$CH1I, x$CH2I, log="xy"
   , main="lc7b048"
   , xlab="green"
   , ylab="red"
   , pch=".")
plot(x$CH1I, x$CH1B, log="xy"
   , xlab="foreground"
   , ylab="background"
   , pch=".")
@
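For part d., the plots can also be written to files directly from R; a
minimal sketch (the output file names are made up, and \func{win.metafile}
is only available under Windows):
<<saveplots,eval=FALSE>>=
pdf(file="scatter.pdf", width=6, height=6)
plot(x$CH1I, x$CH2I, log="xy", pch=".")
dev.off()
win.metafile("scatter.wmf")
plot(x$CH1I, x$CH2I, log="xy", pch=".")
dev.off()
@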
%%-------------------spatial distribution -----------------
\item {\bf Spatial distribution.} {\it This exercise is rather difficult and
not necessary for the subsequent analyses; novices may (and probably should) skip it.}
The spots on these arrays are
arranged in a square pattern of 96 rows and 96 columns. However,
the order of the 9216 rows in the file does not simply reflect the
spatial arrangement row-by-row or column-by-column. In order to
display the spatial distribution of measured foreground and
background intensities, we first need to rearrange the data. The
relationship between the $x$- and $y$-coordinates, as numbers from
$1,\ldots,96$, and the data in the files is given by:
<<spatial,fig=TRUE>>=
library(pixmap)
## xy: for each of the 9216 rows of the data file, its (x, y) position on the
## 96 x 96 grid. The definition below is only a placeholder (plain row-by-row
## order); the actual arrangement on these arrays is different.
xy = cbind((0:9215) %% 96 + 1, (0:9215) %/% 96 + 1)
z = array(0, dim=c(96, 96, 3))
z[cbind(xy, 1)] = x$CH1I^0.25 / max(x$CH1I^0.25)   ## red channel: CH1 foreground
z[cbind(xy, 2)] = x$CH2I^0.25 / max(x$CH2I^0.25)   ## green channel: CH2 foreground
plot(pixmap(z, type="rgb"))
@
\begin{exercise}
\item Read the help file for \func{pixmap} and understand
what the above code is doing.
\item Try out transformations other than the fourth root, such as the
identity, the square root, and the logarithm, and observe their impact on
the visual appearance of the image (see the sketch after this exercise).
\end{exercise}
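To see how different transformations change the visual impression (part b.),
one can compare them side by side; the sketch below assumes that the CH1I
intensities have been arranged on the $96\times 96$ grid using the mapping
\func{xy} from the chunk above.
<<transforms,eval=FALSE>>=
## arrange the CH1I intensities on the 96 x 96 grid (using `xy' from above)
m = matrix(NA, nrow=96, ncol=96)
m[xy] = x$CH1I
## compare several transformations side by side
par(mfrow=c(2,2))
image(m,       main="identity")
image(sqrt(m), main="square root")
image(m^0.25,  main="4th root")
image(log(m),  main="logarithm")
@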
%%-------------------normalization vsn -----------------
\item {\bf Calibration and variance stabilization.} Download the package VSN
from\linebreak http://www.dkfz.de/abt0840/whuber and install it. Subtract
the background intensities CH1B, CH2B from the foreground intensities CH1I,
CH2I. Use the function \func{vsn} to calibrate and transform the data, and
plot the result.
<<vsn>>=
library(vsn)
nv = vsn(cbind(x$CH1I - x$CH1B, x$CH2I - x$CH2B))
save(nv, file="nv.RData")
@
<<plotvsn,fig=TRUE>>=
load("nv.RData")
plot(nv)
@
%%--------------------read.table all files------------
\item {\bf Reading multiple data files.} In the folder \file{data/alizadeh},
you find a file \file{samples.txt}.
\begin{exercise}
\item Read it into a data frame (use the function \func{read.delim} with the
\func{as.is=T} option).
\item Create 4 matrices of dimensions $9216 \times \nrsamptot$ that contain,
respectively, the CH1I, CH1B, CH2I, and CH2B intensities of the 9216 spots
on the \nrsamptot slides whose filenames are given in \file{samples.txt}.
\item Save the matrices into an XDR file.
\item Note: the Bioconductor packages marrayInput and affy offer more
convenient methods for reading and managing data from a series of
microarrays.
\end{exercise}
<<readall>>=
datapath = "../data/alizadeh"   ## adjust to where the course data are stored
samples = read.delim(file.path(datapath, "samples.txt"), as.is=T)
## assumes that samples.txt has a column `filename' with the file name stems
Gf = Gb = Rf = Rb = matrix(NA, nrow=9216, ncol=nrow(samples))
for (i in 1:nrow(samples)) {
  filename = paste(samples$filename[i], "rex.DAT", sep='')
  dat = read.delim(file.path(datapath, filename))
  Gf[,i] = dat$CH1I
  Gb[,i] = dat$CH1B
  Rf[,i] = dat$CH2I
  Rb[,i] = dat$CH2B
}
save(Gf, Gb, Rf, Rb, file="intensities.RData")
@
<<loadintensities>>=
load("intensities.RData")
nrspots = nrow(Gf)
nrsamples = ncol(Gf)
stopifnot(nrspots==9216 && nrsamples==nrow(samples))
@
%%--------------------install Biobase, marrayNorm
\item {\bf Different normalization methods.} In the following, we are going
to identify genes that appear to be differentially transcribed between the 4
CLL samples and the 4 DLCL samples. For this, we will apply a number of
different normalization strategies to the data and compare their results.
\begin{exercise}
\item Download the packages Biobase, marrayClasses, marrayNorm, and multtest
from\linebreak http://www.bioconductor.org and install them.
\item Create a 3D array of dimensions $9216 \times \nrsamptot \times \nrmeth$
that contains, for all spots, the value of $M$ (that is, the log-ratio or
the generalized log-ratio) for the \nrsamptot slides and the following
\nrmeth different normalization methods:
\begin{enumerate}
\item vsn (affine normalization and variance stabilization)
\item maNorm with global median location normalization
\item maNorm with intensity-dependent ($A$-dependent) location normalization
using the loess smoother
\end{enumerate}
\item Save the array into an XDR file.
\end{exercise}
<<normalize>>=
## array to hold the M and A values: spots x slides x methods
M = A = array(NA, dim=c(nrspots, nrsamples, 3))
## method 1: vsn; green in columns 1:8, red in 9:16
nw = vsn(cbind(Gf-Gb, Rf-Rb))
M[,,1] = nw$hy[,9:16] - nw$hy[,1:8]
A[,,1] = nw$hy[,9:16] + nw$hy[,1:8]
## methods 2 and 3: global median and loess
library(marrayClasses)
library(marrayNorm)
mar = new("marrayRaw", maGf=Gf, maGb=Gb, maRf=Rf, maRb=Rb)
nm = maNorm(mar, norm="median", echo=T)
nl = maNorm(mar, norm="loess", echo=T)
M[,,2] = nm@maM
A[,,2] = nm@maA
M[,,3] = nl@maM
A[,,3] = nl@maA
save(M, A, file="MA.RData")
@
<<loadMA>>=
load("MA.RData")
nrmethods = dim(M)[3]
@
%%--------Compare different norm methods by scatterplot --------------------------
\item {\bf Qualitatively compare the results.} Look at scatterplots of the
values of $M$ from the same slide, calculated with different normalization
methods. Do the values generally agree? How do they differ?
<<compare,fig=TRUE>>=
## M-values of slide 4: vsn versus loess
plot(M[,4,1], M[,4,3], pch="."
   , xlab="vsn"
   , ylab="loess"
   , main=samples$name[4])
@
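To compare all \nrmeth methods for one slide at a glance, a scatterplot
matrix can be used; a small sketch (again for slide 4, with the method order
used above):
<<compareall,eval=FALSE>>=
## all pairwise comparisons of the M-values for slide 4
pairs(M[,4,], labels=c("vsn", "global median", "loess"), pch=".")
@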
%%-------multtest----------------------
\item {\bf Testing for differential transcription.} Now we are ready
to calculate test statistics and to select genes. {\em Note:} The
number of replicates (4 versus 4) that we are considering here is
very small, and no solid conclusions about individual genes or
individual samples should be drawn from it. The full data set
contains many more chips; here we restrict ourselves to a few of
them in order to keep the calculations simple and fast for the
purpose of this course.
\begin{exercise}
\item Look at the function \func{t.test} from the package ctest (which is
part of the base libraries), and at \func{mt.teststat} from the package
multtest.
\item For each gene, and for each of the normalization methods,
calculate the $t$-statistic for the CLL versus DLCL class distinction.
Store the result in a $9216 \times 3$ matrix (a sketch follows after this
list). Which of the functions \func{t.test} and \func{mt.teststat}
calculates faster? Look at the histogram of $t$-values that they produce;
you may find extreme values like 3e38 in there. Where do they come from?
\item How do the $t$-statistics agree between the different normalization
methods?
\end{exercise}
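A possible way to compute these $t$-statistics with \func{mt.teststat} is
sketched below. The label vector assumes that the first \nrsampgrp slides
are the CLL samples and the last \nrsampgrp the DLCL samples (check this
against \file{samples.txt}); the helper \func{calct} is used again for the
permutations further below.
<<teststat>>=
library(multtest)
## 0/1 class labels; assumes the first 4 samples are CLL and the last 4 DLCL
label = c(0, 0, 0, 0, 1, 1, 1, 1)
## t-statistics for all genes, given a vector of class labels
calct = function(classlabel, dat) mt.teststat(dat, classlabel, test="t")
t = matrix(NA, nrow=nrspots, ncol=nrmethods)
for (meth in 1:nrmethods)
  t[, meth] = calct(label, M[,,meth])
@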
<<comparet,fig=TRUE>>=
par(mfrow=c(2,2))
for (j in 2:nrmethods) {
  for (i in 1:(j-1)) {
    plot(t[,i], t[,j], pch='.', xlab=paste(i), ylab=paste(j))
    lines(c(-30,30), c(-30,30), col="red")
  }
}
## alternatively: use splom from library(lattice)
@
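The last comment refers to \func{splom} from the lattice package; a possible
call (assuming lattice is installed) is:
<<splomt,eval=FALSE>>=
library(lattice)
## scatterplot matrix of the t-statistics from the three normalization methods
splom(~data.frame(vsn=t[,1], median=t[,2], loess=t[,3]), pch=".")
@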
%%-------Biology------------------------------
\item {\bf Biology.} Look at the top 5 genes, and using the
information in the file\linebreak
\file{.../data/alizadeh/scheme.htm}, find out the curated gene
names. Do they correspond to genes that are mentioned in the
Alizadeh et al.\ paper?
<<biology>>=
csw = read.delim(file.path(datapath, "chip_spot_well.tab.txt"), as.is=T, header=F)
colnames(csw) = c("batch", "spot", "wellid")
csw[1:3,]
wcn = read.delim(file.path(datapath, "well_cloneid_name.tab.txt"), as.is=T, header=F)
colnames(wcn) = c("wellid", "cloneid", "name")
wcn[c(1,51,158),]
## the 5 spots with the largest absolute t-statistic (vsn-normalized data)
topspots = order(abs(t[,1]), decreasing=TRUE)[1:5]
wellid = csw$wellid[ csw$batch=="lc7b" & csw$spot %in% topspots ]
wcn[wcn$wellid %in% wellid,]
@
%%--------write data out for Excel/SAM :(-----------
<<writesam>>=
sel = (csw$batch == "lc7b")
ord = order(csw$spot[sel])
stopifnot(all(csw$spot[sel][ord]==1:nrspots))
wellid = csw$wellid[sel][ord]
getname = function(wellid) wcn$name[wcn$wellid==wellid]
pname = sapply(wellid, getname)
pname[is.na(pname)] = "NoName"
cloneid = 1:nrspots
## rowsel = apply(M, 1, function(x) !any(is.na(x)))
rowsel = rep(T, nrspots)
for(meth in 1:nrmethods) {
  dat = M[,,meth]
  stopifnot(ncol(dat)==8)
  chksum = sum(dat)
  outdat = rbind(c("chksum", "", samples$name),
                 c(chksum,   "", samples$sampleid),
                 c("",       "", paste(4:11 %/% 4)),
                 cbind(pname, cloneid, dat)[rowsel,])
  write.table(outdat, quote=F, sep="\t",
              file=sprintf("for_sam_%d.txt", meth),
              row.names=F, col.names=F)
}
@
%%--------Thresholds------------------------------
\item {\bf $t$-thresholds.} Designate as {\it differentially expressed}
those clones for which the absolute value of $t$ is larger than a certain
threshold. What are the values of this threshold for our data if we want
clone lists of length $10, 20, 50, 100, \ldots$?
<<>>=
## clone list lengths
cll = c(10, 20, 50, 100, 200, 500, 1000)
threshold = matrix(NA, nrow=length(cll), ncol=nrmethods)
rownames(threshold) = paste(cll)
colnames(threshold) = c("vsn", "global median", "loess")

for(meth in 1:nrmethods) {
  st = sort(abs(t[, meth]), decreasing=TRUE)
  for(j in 1:length(cll)) {
    threshold[j, meth] = st[cll[j]]
  }
}
threshold
@
%%-------Permutations----------------------------
\item {\bf Permutations.}
\begin{exercise}
\item How many ways are there to split a set of \nrsamptot objects into two
groups of \nrsampgrp and \nrsampgrp? Use the function \func{nchoosek} from the file
\file{nchoosek.R} to generate a numerical representation of these splits.
\item Prepare a matrix with \nrsamptot rows, corresponding to the \nrsamptot samples, and
as many columns as there are splits. Set the matrix elements to 0 and
1, such that each column of the matrix represents a split.
\end{exercise}
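For part a., the count can be checked directly in R; the second line assumes
that a split and its mirror image (the two group labels swapped) are counted
only once, which is how the splits are enumerated below.
<<nsplits>>=
choose(8, 4)      ## ordered splits into a first and a second group of 4
choose(8, 4) / 2  ## unordered splits; equals choose(7, 3) = 35
@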
<<permutations>>=
source("nchoosek.R")
## fix sample 8 in one group and choose 3 of the remaining 7 to join it
nck = nchoosek(7, 3)
nck
classlabel = matrix(0, nrow=nrsamples, ncol=ncol(nck))
for(p in 1:ncol(nck)) {
  classlabel[ 8, p] = 1
  classlabel[nck[,p], p] = 1
}
classlabel
@
%%-------FDR------------------------------
\item {\bf False discovery rate (FDR).} Now we want to apply these
permutations to the data to estimate the FDR. Do the following for
each normalization method:
\begin{exercise}
\item For each of the splits, calculate the corresponding $t$-statistics for all genes.
\item For each of the splits, and for each of the above choices for
clone list lengths and thresholds, calculate the number of clones
that have an absolute $t$-value greater than or equal to the threshold.
\item Calculate the median of these numbers across the splits.
Divide this by the clone list length to obtain an estimate of the FDR.
\end{exercise}
<<fdr>>=
fdr = matrix(NA, nrow=length(cll), ncol=nrmethods)
rownames(fdr) = paste(cll)
colnames(fdr) = colnames(threshold)
for(meth in 1:nrmethods) {
  ## permt is a 9216 x 35 matrix of t-values, with
  ## rows corresponding to the clones
  ## and columns to the different splits
  permt = apply(classlabel, 2, calct, M[,,meth])
  for(j in 1:length(cll)) {
    ## pnrsel is a vector of length 35, with the
    ## number of clones that had t greater or
    ## equal to the threshold
    pnrsel = apply(permt, 2, function(t) length(which(abs(t)>=threshold[j, meth])))
    fdr[j, meth] = median(pnrsel) / cll[j]
  }
}
@
<<savefdr>>=
save(fdr, file="fdr.RData")
@
<<loadfdr>>=
load("fdr.RData")
@
<<plotfdr,fig=TRUE>>=
## empty plot frame spanning the range of the clone list lengths and FDRs
plot(range(cll), range(fdr), type="n"
   , log="x"
   , xlab="No. of genes selected"
   , ylab="Estimated FDR"
   )
for(meth in 1:nrmethods)
  lines(cll, fdr[,meth], type="b"
      , pch=19, lty=meth)
legend(min(cll), max(fdr), colnames(fdr), lty=1:nrmethods)
@
%%---------------------affy-----------------------------
\item In the directory \file{../data/Shipp}, you find a number of Affymetrix CEL files and
a corresponding CDF file.
\begin{exercise}
\item Using the package \file{affy}, load them into a {\em probe level object} (Plob).
\item Look at the spatial distribution of intensities on the chips.
\item Normalize the data and calculate probe set summary values.
\end{exercise}
<<affy>>=
library(affy)
setwd("../data/Shipp")
dat = ReadAffy()
## show images of the probe-level data
image(dat)
## calculate probe set summaries
e = express(dat)
## scatterplot of the first versus the second sample
plot(exprs(e)[,1:2], pch=".")
@
%%--------------------install e1071 and rpam------------
\item {\bf Prepare for the classification exercises on Thursday.}
Check whether the packages \file{e1071} and \file{rpam} are installed on your
computer. If not, install them.
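A quick way to check from within R is to try to load them; \func{require}
returns \func{FALSE} with a warning if a package is not installed:
<<checkpkgs,eval=FALSE>>=
require(e1071)
require(rpam)
@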
\end{exercises} \end{document}