%\VignetteIndexEntry{RPA}
%The above line is needed to remove a warning in R CMD check
\documentclass[12pt]{article}

\usepackage{amsmath,amssymb,amsfonts}
\usepackage{hyperref}
%\usepackage[authoryear,round]{natbib}
\usepackage{Sweave}

\textwidth=6.2in
\textheight=8.5in
\oddsidemargin=.1in
\evensidemargin=.1in
\headheight=-.3in

\newcommand{\Rfunction}[1]{{\texttt{#1}}}
\newcommand{\Robject}[1]{{\texttt{#1}}}
\newcommand{\Rpackage}[1]{{\textit{#1}}}

\def\muj{\mathbf{\mu}_j}
\def\mujs{\{\mathbf{\mu}_j\}}

\def\s{\{\mathbf{s}\}}
\def\st{\mathbf{s}^t}
\def\sj{\mathbf{s}_j}
\def\sij{s_{ij}}
\def\sjs{\mathbf{s}_{j=1}^J}
\def\sjt{\mathbf{s}_j^t}
\def\stj{s_{tj}}
\def\scj{s_{cj}}
\def\stjs{\{s_{tj}\}_t}
\def\sjI{\mathbf{s}_j^1}
\def\sjT{\mathbf{s}_j^T}

\def\m{\mathbf{m}}
\def\mj{\mathbf{m}_j}
\def\mjs{\{\mathbf{m}_j\}}
\def\mtj{m_{tj}}

\def\gj{\{\mathbf{g}_j\}}
\def\gi{g_i}
\def\gt{g_t}
\def\gc{g_c}
\def\g{\mathbf{g}}

\def\d{\mathbf{d}}
\def\dt{d_t}
\def\djt{\mathbf{d}_j^t}
\def\djr{\mathbf{s}_{j}}

\def\TauSq{\boldsymbol{\tau}^2}
\def\tauj{\mathbf{\tau}_j}
\def\taus{\{\tau_j^2\}}
\def\taujSq{\mathbf{\tau}_j^2}
\def\tauOneSq{\mathbf{\tau}_1^2}
\def\tauPSq{\mathbf{\tau}_J^2}

\def\epsilonj{\varepsilon_j}
\def\epsilonij{\mathbf{\varepsilon}_{ij}}
\def\epsiloncj{\varepsilon_{cj}}
\def\epsilontj{\varepsilon_{tj}}
\def\epsilontjs{\{\varepsilon_{tj}\}_t}
\def\epsiloncjs{\{\varepsilon_{cj}\}_j}
\def\Epsilonj{\boldsymbol{\varepsilon}_j}
\def\Epsiloncj{\boldsymbol{\varepsilon}_{cj}}

\def\alphaj{\alpha_j}
\def\betaj{\beta_j}
\def\alphahatj{\hat{\alpha}_j}
\def\betahatj{\hat{\beta}_j}

\def\zi{z_i}
\def\j{\mathbf{j}}
\def\sigmaj{\sigma_j}
\def\lambdaj{\lambda_j}

\author{Leo Lahti\footnote{http://www.iki.fi/Leo.Lahti}\\leo.lahti@iki.fi}
  
%  \\Helsinki Institute for Information Technology HIIT\\
%  and Adaptive Informatics Research Centre,\\Department of Information
%  and Computer Science,\\Aalto University P.O. Box 15400, FI-00076 Aalto, Finland\\{\tt
%    leo.lahti@iki.fi}}

\begin{document}
\title{RPA: analysis of probe reliability and gene expression on short
  oligonucleotide arrays}

\maketitle 

\section{Introduction}
 
\Rpackage{RPA} (Robust Probabilistic
Averaging)\footnote{\url{http://bioconductor.org/packages/release/bioc/html/RPA.html}}
provides tools for probe reliability analysis and gene expression
preprocessing for (Affymetrix) short oligonucleotide arrays. It can
also be used more generally to summarize multivariate observations
that target the same objects with varying degree of reliability.

RPA can be used for standard preprocessing tasks in gene expression
studies; it has been shown to outperform other popular preprocessing
methods in differential gene expression analysis. In addition, the
method provides explicit, data-driven estimates of probe reliability;
poorly performing probes are downweighted in the model, which yields
more accurate estimates of gene expression and can reveal noisy probes
independently of the error source. The noise estimates have been
validated by comparisons to known probe-level error sources. The
probabilistic formulation allows also incorporation of prior
information concerning probe reliability into gene expression analysis
\cite{Lahti11}.


\section{Preprocessing gene expression data with RPA}

RPA provides a wrapper ('rpa') for convenient preprocessing of
Affymetrix arrays. Alternative CDF environments are also supported
(see help(rpa) for details). Here is a preprocessing example with an
example data set:

<<rpa, results = hide, eval = FALSE>>=
library(affydata)
data(Dilution)
eset <- rpa(Dilution)
@

The input is an AffyBatch object. CEL files can be read in as
affybatch with the ReadAffy function of the affy package. The output
is an ExpressionSet object, which allows downstream analysis of the
results using standard R/BioC tools for gene expression data.


\section{Probe reliability analysis}

RPA operates on affybatch objects \cite{Gautier04a}. An affybatch
contains the probe-level data of Affymetrix arrays. Our toy examples
use the \Robject{Dilution} dataset provided by \Rpackage{affydata}
package. Load example data (the 'Dilution' affybatch):

<<startchunk,results=hide>>=
require(affy)
require(affydata)
data(Dilution)
@

{\it RPA.pointestimate} is the main function. Let us perform the
analysis for particular probesets in the Dilution data (the whole data
set will be analyzed by default if 'sets' is not given).

<<RPA.pointestimate, results = hide>>=
require(RPA)
sets <- geneNames(Dilution)[1:2]
rpa.results <- RPA.pointestimate(Dilution, sets)
@

The 'rpa2eset' function can be used to coerce the probeset-level
expression estimates into an ExpressionSet object. 


The results for a particular probeset are visualized with

<<visu, fig = FALSE, echo = TRUE, print = FALSE, results = hide, eval = FALSE>>=
plot(rpa.results, set = "1000_at")
@

The output is shown in Figure \ref{fig:illustration}. See
help('rpa.plot') for details.

\begin{figure}[htbp]
\begin{center}
<<barplots, fig = TRUE, echo = FALSE, print = FALSE, results = hide>>=
dat <- plot(rpa.results, set = "1000_at")
@
\caption{Estimated probe-specific variances and gene expression signal
  for an example probe set.}
\label{fig:illustration}
\end{center}
\end{figure}


\subsection{Estimating probe-specific noise and probe reliability}

RPA estimates the noise level of each individual probe through the
probe-specific variance parameter (\(\taujSq\)). These can be obtained
with

<<noise,results=hide>>=
noise <- get.probe.noise.estimates(rpa.results)
@ 

The higher the variance, the more noisy the probe. Inverse of the
variance, \(\frac{1}{\taujSq}\), can be used to quantitate probe
reliability. Note that the relative weight of a probe within probeset
is determined by the relative noise of the probe with respect to the
other probes in the same probeset. Comparison of probe-specific
variances across probesets may benefit from normalization of this
effect.  The get.probe.noise.estimates function can optionally provide
normalized versions of the noise estimates.


\subsection{Setting probe-specific priors}

Prior information of probe reliability can be set by tuning the shape
(\(\alpha\)) and scale (\(\beta\)) parameters of the inverse Gamma
distribution, which is the conjugate prior for the variances. Set
priors for a particular probeset. If the 'priors' parameter is not
given, non-informative priors will be given for the other probesets:

<<setpriors, results = hide>>=
alpha <- beta <- rep(1, 16)
probe.index <- 5
alpha[[probe.index]] <- 3
beta[[probe.index]] <- 1
priors <- set.priors(Dilution, set = "1000_at", alpha, beta)
@

Run RPA with priors:

<<pe, results=hide, eval = FALSE>>=
rpa.results <- RPA.pointestimate(Dilution, sets, priors = priors)
@                                  

\subsection{General usage}

RPA can be used more generally to summarize multivariate observations
of the same object with varying noise levels, see function rpa.fit:

<<iter, results=hide, eval = FALSE>>=
res <- rpa.fit(S)
@


\section{The probabilistic model}

\subsection{Relation to other probe-level models}

RPA differs from other popular preprocessing algorithms in two key
respects.  First, it utilizes probe-level estimates of differential
expression; these are calculated {\it before} probeset-level
summarization, which avoids certain probe-level effects that obscure
the results in other preprocessing methods where probes with various
affinities and contamination levels are combined into a probeset-level
summary prior to differential expression analyses. In particular, our
procedure avoids the modeling of unidentifiable probe affinities,
which is the key probe-specific parameter in many preprocessing
methods. Second, RPA provides tools for investigating the reliability
of individual probes in terms of a probe-specific variance. This can
be used in microarray design and in confirming the end results of a
microarray study. These properties distinguish RPA from other
probe-level preprocessing methods such as dChip's MBEI
\cite{Li01pnas}, RMA \cite{Irizarry03rma}, or FARMS
\cite{Hochreiter06}.

\subsection{Summary of RPA model}

\subsubsection{Background correction and normalization}

The probe-level data is background corrected, normalized, and
log2-transformed before the analysis. By default, RPA uses the
background correction model of RMA \cite{Irizarry03} and quantile
normalization \cite{Bolstad03}. Our implementation utilizes the
\Rpackage{affy} package \cite{Gautier04a} to handle probe-level data.
For details about short oligonucleotide arrays and the design of the
Affymetrix GeneChip arrays, see the Affymetrix MAS manual
\cite{affy5}. 

\subsubsection{Probe reliability estimation and summarization}

The RPA algorithm is used to obtain probeset-level summaries for gene
expression and to estimate probe-specific noise. RPA assumes a
Gaussian model for probe effects.  Let us consider a probe set
targeted at measuring the expression level of target transcript
\(g\). Probe-level observation \(\sij\) of probe \(j\) on array \(i\)
is modeled as a sum of the true expression signal (common for all
probes in the probeset), and probe-specific Gaussian noise: \(\sij =
\gi + \muj + \epsilonij\). The stochastic noise component is
probe-specific, distributed as \(\epsilonij \sim N(0,\taujSq)\). The
variance parameters \(\taus\) are of interest in probe reliability
analysis; the inverse variance \(1/\taujSq\) can be used to measure of
probe reliability (see get.probe.noise.estimates function).

The mean parameter \(\muj\) of the noise model describes systematic
probe affinity effect, which is unidentifiable. These parameters
cancel out in RPA when the signal log-ratio between a user-specified
'reference' array and the remaining arrays is calculated at probe
level: the differential expression signal between arrays \(t = \{1,
\dots, T\} \) and the reference array \(c\) for probe \(j\) is given
by \(\mtj = \stj - \scj = \gt - \gc + \epsilontj - \epsiloncj = \dt +
\epsilontj - \epsiloncj\). In vector notation the differential
expression profile of probe \(j\) across the arrays can be written as
\(\mj = \d + \Epsilonj\). In practice, \(\d\) and the probe-specific
variances \(\{\tau_j\}_{j=1}^P\) for the \(P\) probes within the
probeset are estimated simultaneously based on the probabilistic
model. With large sample sizes the solution will converge to
estimating the mean of the probe-level observations weighted by probe
reliability.  Note that the algorithm is robust to choice of the
reference array since the reference effect is marginalized out in the
probabilistic treatment; our experiments confirm that the probe-level
noise estimates are not affected by the choice of the reference array.

\subsubsection{Estimation of probe affinity terms}

Probe affinity terms and the original signal level are estimated after
summarizing the probe-level differential gene expression estimates.
First an estimate of the absolute signal level is calculated based on
particular modeling assumptions. Then probe-specific affinities are
calculated by comparing each probe to the probeset-level signal
estimate.

Let us write the probe-level observation in terms of differential
expression signal, absolute signal level, and stochastic noise as
\(\sj = \d + \mu + \varepsilon\), where \(\mu\) is a scalar (vector
with identical elements). This will indicate how much probe-level
observation deviates from the estimated signal shape \(\d\).  This can
be decomposed as \(\mu = \mu_{real} + \mu_{probe}\), where
\(\mu_{real}\) describes the 'real' signal level, common for all
probes and \(\mu_{probe}\) describes probe affinity effect. Let us
assume that \(\mu_{probe} ~ N(0, \sigma^2_{probe})\).  This encodes
the assumption that in general the affinity effect of each probe tends
to be close to zero.  Then ML estimates of \(\mu_{real}\) and
\(\mu_{probe}\) are calculated based on these particular assumptions.
This part of the algorithm has not been defined in full probabilistic
terms, we are only providing the point estimates.

If an identical prior is used for all probes in affinity estimation
then \(\mu_{real}\) is estimated as the average of the probe effects
\(\mu\) and the probe-specific affinities \(\mu_{probe}\) will sum to
exactly zero ('zeromean' option). This is analogous to the model used
in RMA, which uses medianpolish algorithm to fit this assumption. In
contrast to our model the stochastic probe effects are not
probe-specific in RMA.  We suggest an alternative approach where
probes are weighted during affinity estimation ('rpa' option).  While
\(\sigma^2\) estimated by RPA measures stochastic noise, not the
affinity effect, we utilize them to give a heuristic weigh for the
probes in affinity estimation according to how much they contribute to
the overall signal shape. Intuitively, probes that have little effect
on the signal shape (i.e. are very noisy and likely to be contaminated
by many unrelated signals) should also contribute less to the absolute
signal estimate. The probe affinities are expected to sum to zero but
the model allows some flexibility.

%Note that in contrast to the other prepreprocessing methods, the
%differential expression levels between the conditions are estimated
%first, based on probe-level analysis. Probeset-level summarization and
%estimation of probe-specific affinities and the original signal level
%is performed in the second step.


\section{Citing RPA}

Please cite \cite{Lahti11} when using the package.

\section{Details}

This document was written using:

<<details>>=
sessionInfo()
@

\bibliographystyle{abbrv}
\bibliography{RPA}

\end{document}

%To investigate a particular probeset, preprocess data before the
%analysis:
%
%<<Smat,results=hide, eval = FALSE>>=
%prep <- RPA.preprocess(Dilution, cind = 1)
%@
%
%Pick probe-level data for a probe set (arrays x probes matrix):
%
%<<pick, results=hide, eval = FALSE>>=
%probe.indices <- prep$set.inds[["1000_at"]]
%S <- prep$q[probe.indices, ]
%@
%
%Estimate probeset-level summary and probe-specific variance and
%affinity terms: