% \VignetteIndexEntry{RPA} %The above line is needed to remove a warning in R CMD check \documentclass[12pt]{article} \usepackage{amsmath,amssymb,amsfonts} \usepackage{hyperref} %\usepackage[authoryear,round]{natbib} \usepackage{Sweave} \textwidth=6.2in \textheight=8.5in \oddsidemargin=.1in \evensidemargin=.1in \headheight=-.3in \newcommand{\Rfunction}[1]{{\texttt{#1}}} \newcommand{\Robject}[1]{{\texttt{#1}}} \newcommand{\Rpackage}[1]{{\textit{#1}}} \def\muj{\mathbf{\mu}_j} \def\mujs{\{\mathbf{\mu}_j\}} \def\s{\{\mathbf{s}\}} \def\st{\mathbf{s}^t} \def\sj{\mathbf{s}_j} \def\sij{s_{ij}} \def\sjs{\mathbf{s}_{j=1}^J} \def\sjt{\mathbf{s}_j^t} \def\stj{s_{tj}} \def\scj{s_{cj}} \def\stjs{\{s_{tj}\}_t} \def\sjI{\mathbf{s}_j^1} \def\sjT{\mathbf{s}_j^T} \def\m{\mathbf{m}} \def\mj{\mathbf{m}_j} \def\mjs{\{\mathbf{m}_j\}} \def\mtj{m_{tj}} \def\gj{\{\mathbf{g}_j\}} \def\gi{g_i} \def\gt{g_t} \def\gc{g_c} \def\g{\mathbf{g}} \def\d{\mathbf{d}} \def\dt{d_t} \def\djt{\mathbf{d}_j^t} \def\djr{\mathbf{s}_{j}} \def\TauSq{\boldsymbol{\tau}^2} \def\tauj{\mathbf{\tau}_j} \def\taus{\{\tau_j^2\}} \def\taujSq{\mathbf{\tau}_j^2} \def\tauOneSq{\mathbf{\tau}_1^2} \def\tauPSq{\mathbf{\tau}_J^2} \def\epsilonj{\varepsilon_j} \def\epsilonij{\mathbf{\varepsilon}_{ij}} \def\epsiloncj{\varepsilon_{cj}} \def\epsilontj{\varepsilon_{tj}} \def\epsilontjs{\{\varepsilon_{tj}\}_t} \def\epsiloncjs{\{\varepsilon_{cj}\}_j} \def\Epsilonj{\boldsymbol{\varepsilon}_j} \def\Epsiloncj{\boldsymbol{\varepsilon}_{cj}} \def\alphaj{\alpha_j} \def\betaj{\beta_j} \def\alphahatj{\hat{\alpha}_j} \def\betahatj{\hat{\beta}_j} \def\zi{z_i} \def\j{\mathbf{j}} \def\sigmaj{\sigma_j} \def\lambdaj{\lambda_j} \author{Leo Lahti\footnote{http://www.iki.fi/Leo.Lahti}\\University of Helsinki\\leo.lahti@iki.fi} \begin{document} \title{Robust Probabilistic Averaging for probe performance analysis and preprocessing on short oligonucleotide arrays} \maketitle \section{Introduction} \Rpackage{RPA} (Robust Probabilistic Averaging)\footnote{\url{http://bioconductor.org/packages/release/bioc/html/RPA.html}} provides tools for probe reliability analysis and gene expression preprocessing for (Affymetrix) short oligonucleotide arrays, and more generally to summarize normally distributed multivariate observations that target the same object with varying degrees of reliability. RPA provides explicit data-driven estimates of probe performance, i.e. affinity and probe-specific noise level. Affinities are taken into account in summarizing the probes and noisy probes are downweighted, which yields more accurate estimates of gene expression \cite{Lahti11}. The probabilistic formulation allows also incorporation of prior information concerning probe reliability into analysis. \section{RPA preprocessing} RPA provides a wrapper ('rpa') for convenient preprocessing of Affymetrix arrays and support for alternative CDF environments. RPA operates on affybatch objects \cite{Gautier04a}. Load the example data: <>= require(affy) require(affydata) data(Dilution) @ The toy example uses the \Robject{Dilution} dataset provided by \Rpackage{affydata} package. To preprocess the affybatch, use: <>= eset <- rpa(Dilution) @ Input is an AffyBatch object, obtained from CEL files with the ReadAffy function of the affy package. The output is an ExpressionSet object, which allows downstream analysis of the results using standard R/BioC tools for gene expression data. \section{Probe performance analysis} Use {\it RPA.pointestimate} to investigate particular probesets: <>= require(RPA) sets <- geneNames(Dilution)[1:2] rpa.results <- RPA.pointestimate(Dilution, sets) @ The rpa.results object contains probe-specific affinity and variance estimates (affinity, sigma2), the probeset-level summary estimate (\(d\)) and other information. The results can be visualized with <>= plot(rpa.results, set = "1000_at", plots = "all") #plot(rpa.results, set = set, plots = "data") #plot(rpa.results, set = set, plots = "toydata.comparison") @ The output is shown in Figure \ref{fig:illustration}. See help('rpa.plot') for details. \begin{figure}[htbp] \begin{center} <>= dat <- plot(rpa.results, set = "1000_at", plots = "all") @ \caption{Estimated probe affinities, probe-specific noise level (standard deviation) and probeset-level summary estimate and probe-level signals for an example probe set.} \label{fig:illustration} \end{center} \end{figure} \subsection{Estimating probe reliability} The noise level of individual probes can be quantified based on the probe-specific variance parameter (\(\taujSq\)): <>= noise <- get.probe.noise.estimates(rpa.results) @ The precision \(\frac{1}{\taujSq}\), can be used to quantitate probe reliability. Comparison of probe-specific variances across probesets may benefit from probeset-specific normalization; see help(get.probe.noise.estimates) for details. \subsection{Setting probe-specific priors} Prior information of probe reliability can be set by tuning the shape (\(\alpha\)) and scale (\(\beta\)) parameters of the inverse Gamma conjugate prior for the variances. If the 'priors' parameter is not given, non-informative priors will be given for the other probesets: <>= priors <- list(alpha = NULL, beta = NULL, d = NULL) set <- "1000_at" priors$alpha <- 2 priors$beta[[set]] <- rep(1, 16) probe.index <- 5 priors$beta[[set]][[probe.index]] <- 3 rpa.results <- RPA.pointestimate(Dilution, sets, priors = priors) @ \subsection{General usage} Robust Probabilistic Averaging can be used to summarize any multivariate observations concerning the same object and having varying (Gaussian) affinities and noise levels: <>= res <- rpa.fit(S) @ \section{The probabilistic model} \subsection{Relation to other probe-level models} RPA differs from other popular preprocessing algorithms such as dChip's MBEI \cite{Li01pnas}, RMA \cite{Irizarry03rma}, or FARMS \cite{Hochreiter06} in two key respects. It calculates probe-level estimates of differential expression; {\it before} probeset-level summarization, which will avoid the modeling of unidentifiable probe affinities in determining the signal shape. Second, RPA provides tools for investigating the performance of individual probes. This can be used in microarray design and to confirm the end results of a microarray study. \subsection{Summary of RPA model} \subsubsection{Background correction and normalization} The probe-level data is background corrected, normalized, and log2-transformed before the analysis. By default, RPA uses the background correction model of RMA \cite{Irizarry03} and quantile normalization \cite{Bolstad03}. Our implementation utilizes the \Rpackage{affy} package \cite{Gautier04a} to handle probe-level data. For details about short oligonucleotide arrays and the design of the Affymetrix GeneChip arrays, see the Affymetrix MAS manual \cite{affy5}. \subsubsection{Probe reliability estimation and summarization} The RPA algorithm is used to obtain probeset-level summaries for gene expression and to estimate probe-specific noise. RPA assumes a Gaussian model for probe effects. Let us consider a probe set targeted at measuring the expression level of target transcript \(g\). Probe-level observation \(\sij\) of probe \(j\) on array \(i\) is modeled as a sum of the true expression signal (common for all probes in the probeset), and probe-specific Gaussian noise: \(\sij = \gi + \muj + \epsilonij\). The stochastic noise component is probe-specific, distributed as \(\epsilonij \sim N(0,\taujSq)\). The variance parameters \(\taus\) are of interest in probe reliability analysis; the inverse variance \(1/\taujSq\) can be used to measure of probe reliability (see get.probe.noise.estimates function). The mean parameter \(\muj\) of the noise model describes systematic probe affinity effect, which is unidentifiable. These parameters cancel out in RPA when the signal log-ratio between a user-specified 'reference' array and the remaining arrays is calculated at probe level: the differential expression signal between arrays \(t = \{1, \dots, T\} \) and the reference array \(c\) for probe \(j\) is given by \(\mtj = \stj - \scj = \gt - \gc + \epsilontj - \epsiloncj = \dt + \epsilontj - \epsiloncj\). In vector notation the differential expression profile of probe \(j\) across the arrays can be written as \(\mj = \d + \Epsilonj\). In practice, \(\d\) and the probe-specific variances \(\{\tau_j\}_{j=1}^P\) for the \(P\) probes within the probeset are estimated simultaneously based on the probabilistic model. With large sample sizes the solution will converge to estimating the mean of the probe-level observations weighted by probe reliability. The algorithm is robust to choice of the reference array since the reference effect is marginalized out in the probabilistic treatment; our experiments confirm that the probe-level noise estimates are not affected by the choice of the reference array. \subsubsection{Probe affinity estimation} Probe affinity terms and the original signal level are estimated after summarizing the probe-level differential gene expression estimates. First an estimate of the absolute signal level is calculated based on particular modeling assumptions. Then probe-specific affinities are calculated by comparing each probe to the probeset-level signal estimate. Let us write the probe-level observation in terms of differential expression signal, absolute signal level, and stochastic noise as \(\sj = \d + \mu + \varepsilon\), where \(\mu\) is a scalar (vector with identical elements). This will indicate how much probe-level observation deviates from the estimated signal shape \(\d\). This can be decomposed as \(\mu = \mu_{real} + \mu_{probe}\), where \(\mu_{real}\) describes the 'real' signal level, common for all probes and \(\mu_{probe}\) describes probe affinity effect. Let us assume that \(\mu_{probe} ~ N(0, \sigma^2_{probe})\). This encodes the assumption that in general the affinity effect of each probe tends to be close to zero. Then ML estimates of \(\mu_{real}\) and \(\mu_{probe}\) are calculated based on these particular assumptions. This part of the algorithm has not been defined in full probabilistic terms, we are only providing the point estimates. If an identical prior is used for all probes in affinity estimation then \(\mu_{real}\) is estimated as the average of the probe effects \(\mu\) and the probe-specific affinities \(\mu_{probe}\) would sum to exactly zero, analogous to RMA. We suggest an alternative approach where probes are weighted during affinity estimation according to their noisiness \(\sigma^2\). Then noisy probes that have little effect on the signal shape will contribute less to the absolute signal estimate while the expected sum of probe affinities remains at zero. \section{Validation} The model can reveal noisy probes independently of the error source noise; noise estimates have been validated by comparisons to known probe-level error sources \cite{Lahti11}. RPA has been shown to outperform other popular preprocessing methods in cross-platform studies \cite{Elo05, Lahti11} and spike-in data sets. RPA has also been compared to other preprocessing methods through the AffyCompII benchmarking website\footnote{http://affycomp.jhsph.edu/AFFY2/TABLES.hgu/0.html as of March 13, 2011} \cite{Cope04}. RPA outperformed some widely-used preprocessing algorithms such as RMA, measured in terms of average rank over the different test statistics. Overall the results indicate that RPA has a comparable preprocessing performance with standard preprocessing algorithms, which supports the validity of the probe-level affinity and noise estimates in RPA. Note that while RPA can be used for preprocessing, its primary focus is on probe performance analysis. \section{Citing RPA} Please cite \cite{Lahti11}. \section{Details} This document was written using: <
>= sessionInfo() @ %\bibliographystyle{abbrv} %\bibliography{RPA} \begin{thebibliography}{10} \bibitem{affy5} Affymetrix. \newblock {\em Affymetrix Microarray Suite User Guide}. \newblock Affymetrix, Santa Clara, CA, version 5 edition, 2001. \bibitem{Bolstad03} B.~M. Bolstad, R.~A. Irizarry, M.~Astrand, and T.~P. Speed. \newblock {A comparison of normalization methods for high density oligonucleotide array data based on variance and bias}. \newblock {\em Bioinformatics}, 19(2):185--193, 2003. \bibitem{Cope04} L.~M. Cope, R.~A. Irizarry, H.~A. Jaffee, Z.~Wu, and T.~P. Speed. \newblock {A benchmark for Affymetrix GeneChip expression measures}. \newblock {\em Bioinformatics}, 20:323--331, 2004. \bibitem{Elo05} L.~L. Elo, L.~Lahti, H.~Skottman, M.~Kyl{\"a}niemi, R.~Lahesmaa, and T.~Aittokallio. \newblock {Integrating probe-level expression changes across generations of Affymetrix arrays}. \newblock {\em Nucleic Acids Research}, 33(22):e193, 2005. \bibitem{Gautier04a} L.~Gautier, L.~Cope, B.~M. Bolstad, and R.~A. Irizarry. \newblock {affy--analysis of Affymetrix GeneChip data at the probe level}. \newblock {\em Bioinformatics}, 20(3):307--315, 2004. \bibitem{Hochreiter06} S.~Hochreiter, D.-A. Clevert, and K.~Obermayer. \newblock {A new summarization method for affymetrix probe level data}. \newblock {\em Bioinformatics}, 22(8):943--949, 2006. \bibitem{Irizarry03rma} R.~A. Irizarry, B.~M. Bolstad, F.~Collin, L.~M. Cope, B.~Hobbs, and T.~P. Speed. \newblock {Summaries of Affymetrix GeneChip probe level data}. \newblock {\em Nucl. Acids Res.}, 31(4):e15, 2003. \bibitem{Irizarry03} R.~A. Irizarry, B.~Hobbs, F.~Collin, Y.~D. Beazer-Barclay, K.~J. Antonellis, U.~Scherf, and T.~P. Speed. \newblock Exploration, normalization, and summaries of high density oligonucleotide array probe level data. \newblock {\em Biostatistics}, 4(2):249--264, 2003. \bibitem{Lahti11} L.~Lahti, L.~L. Elo, T.~Aittokallio, and S.~Kaski. \newblock Probabilistic analysis of probe reliability in differential gene expression studies with short oligonucleotide arrays. \newblock {\em {IEEE/ACM Transactions on Computational Biology and Bioinformatics}}, 8(1):217--225, 2011. \bibitem{Li01pnas} C.~Li and W.~H. Wong. \newblock Model-based analysis of oligonucleotide arrays: Expression index computation and outlier detection. \newblock {\em Proc. Natl. Acad. Sci.}, 98:31--36, 2001. \end{thebibliography} \end{document} %To investigate a particular probeset, preprocess data before the %analysis: % %<>= %prep <- RPA.preprocess(Dilution, cind = 1) %@ % %Pick probe-level data for a probe set (arrays x probes matrix): % %<>= %probe.indices <- prep$set.inds[["1000_at"]] %S <- prep$q[probe.indices, ] %@ % %Estimate probeset-level summary and probe-specific variance and %affinity terms: