%\VignetteIndexEntry{Molecule Identification with CAMERA}\\
%\VignetteKeywords{CAMERA}\\
%\VignettePackage{CAMERA}\\

\documentclass[a4paper,12pt]{article}
\usepackage{hyperref}
\usepackage[table]{xcolor}
\setlength{\parindent}{0cm}

\newcommand{\Robject}[1]{{\texttt{#1}}}
\newcommand{\Rfunction}[1]{{\texttt{#1}}}
\newcommand{\Rpackage}[1]{{\textit{#1}}}
\newcommand{\Rclass}[1]{{\textit{#1}}}
\newcommand{\Rmethod}[1]{{\textit{#1}}}
\newcommand{\Rfunarg}[1]{{\textit{#1}}}

\newcommand{\denovo}{{\em de-Novo{}}}

\begin{document}
\title{LC-MS Peak Annotation and Identification with \Rpackage{CAMERA}}
\author{Carsten Kuhl, Ralf Tautenhahn and Steffen Neumann}
\maketitle

\section*{Introduction}
%{{{

The R package \Rpackage{CAMERA} ({\bf C}ollection of {\bf
  A}lgorithms for {\bf ME}tabolite p{\bf R}ofile {\bf A}nnotation) is primarily
used to annotate LC-MS data. It is designed to interact
directly with processed data from \Rpackage{xcms} and to support additional analyses with \Rpackage{Rdisop}.

It includes the annotation of isotope peaks,
adducts and fragments in peak lists generated by \Rpackage{xcms}. A
set of annotation methods is used, which group together mass
signals measured from a single metabolite, based on rules for mass
differences and peak shape comparison~\cite{annobird07}.

Based on this annotation, the molecular composition can be calculated
if the mass spectrometer has a high-enough accuracy for both the mass
and the isotope pattern intensities.

%}}}

\section{Peak Annotation}
%{{{

\subsection{Adduct list and molecular mass estimation}
For soft ionisation methods such as LC/ESI-MS, different adducts
(e.g.\ $[M+K]^+$, $[M+Na]^+$) and fragments (e.g.\ $[M-C_3H_9N]^+$,
$[M+H-H_2O]^+$) occur. If the molecule has an intrinsic
charge, $[M]^+$ may be observed as well. An estimate of the molecular mass of $[M]$ can be calculated
from at least two annotated adduct ions. To scan for adducts, every theoretically possible
combination of adducts from a given list of ions is calculated.
For a small example see Tab.~\ref{tab:anno}.
\begin{table}[ht]
   \parbox{0.60\textwidth}{%
	\small
     \begin{tabular}{|l|r|}\hline
       Formula & Mass difference in amu \\ \hline
       $[M+H]^+$  & 1.007276 \\
       $[M+Na]^+$ & 22.98977 \\
       $[M+K]^+$  & 38.963708 \\
       $[2M+Na]^+$ & 22.98977 \\
       $[M+H+Na]^{2+}$ & 23.9976 \\
	... & ... \\ \hline
     \end{tabular}
   }
   \parbox{0.38\textwidth}{%
     \caption{{\footnotesize{Examples of calculated adducts for the cations (K, H, Na)
           with their mass differences occurring in positive ion
           mode. The actual difference is calculated considering the
           charge and the number of molecules $M$ in the observed ion.}}}  \label{tab:anno}}
\end{table}
Every group of peaks is scanned to check whether these combinations fit the observed mass differences, and the molecular masses are then computed.

\section{Processing with \Rpackage{CAMERA}}

\subsection{Preprocessing with \Rpackage{xcms}}

First, create an \Rclass{xcmsSet} with your favourite parameters, e.g.:
\begin{verbatim}
  library(CAMERA)
  file <- system.file('mzdata/MM14.mzdata', package = "CAMERA")
  xs <- xcmsSet(file,method="centWave",ppm=30,peakwidth=c(5,10))
\end{verbatim}

\subsection{Annotation}
The annotation aims to answer the question of which peaks belong together and to compute the exact mass of the molecule
from which the ions originate.
An annotation of an \Rclass{xcmsSet} with one sample can be done quickly with
\begin{verbatim}
 library(CAMERA)
 an <- annotate(xs)
 peaklist <- getPeaklist(an)
 \end{verbatim}
Note: if xs is a grouped xcmsSet, CAMERA needs additional parameters.\\
The annotation workflow consists of the following steps:
\begin{enumerate}
 \item peak grouping by retention time
 \item peak group verification with EIC correlation
 \item annotation of possible isotopes
 \item annotation of adducts and calculating hypothetical masses for the group
\end{enumerate}

The result of an annotation is a data frame similar to a peak table and can easily be stored in a
comma-separated table (Excel-readable).

\begin{verbatim}
 write.csv(peaklist,file='xsannotated.csv')
\end{verbatim}

\subsubsection{Parameters for \Rfunction{annotate}}
\Rfunction{annotate} is a wrapper function for many \Rpackage{CAMERA} S4 methods, so every parameter of these methods can be passed via \Rfunction{annotate}. The parameters of these functions
 are briefly summarised here. For additional information see the manual page of \Rfunction{annotate}.

For the peak grouping, a retention time window is derived from the FWHM (full width at half maximum) of the locally highest peak.
Therefore one additional parameter can be passed for the FWHM calculation (\Rfunarg{perfwhm = 0.3}, which corresponds to 30\% of the FWHM).

The peak group verification step uses a Pearson correlation with the parameter \Rfunarg{cor\_eic\_th = 0.75}, which is the correlation threshold that two peaks must reach to be considered to originate from the same molecule.

The annotation of isotopes and adducts share the parameters (\Rfunarg{ppm = 5}) and (\Rfunarg{mzabs = 0.01}), which are the relative and absolute error for m/z differences.

Further parameters for isotope finding are the maximum charge (\Rfunarg{maxcharge = 3}) and the maximum number of isotopes (\Rfunarg{maxiso = 4}) that are expected to occur.
For additional information see \Rfunction{findIsotopes}.

The adduct annotation has one additional parameter (\Rfunarg{multiplier = 3}), which is the maximum number \textit{n} of molecules in cluster ions (e.g.\ [nM+H]).
For information about creating a rule set for annotation see \Rfunction{findAdducts}.

If the xcmsSet contains more than one sample or several different classes, e.g.\ ``wildtype'' and ``knockout'', you must choose which one should be annotated.
For this purpose the parameters \Rfunarg{sample} and \Rfunarg{category} exist. For an example see section~\ref{section:example}.

With more parameters, a call of \Rfunction{annotate} looks like:
\begin{verbatim}
 an <- annotate(xs,sigma=6, perfwhm=0.3, cor_eic_th=0.75, 
 maxcharge=3, maxiso=3, ppm=5, mzabs=0.01,polarity="positive")
\end{verbatim}

\subsection{Annotation without verification by correlation}
A short note for former \Rpackage{esi} users: this step is now obsolete and no longer supported.
All annotations use the peak correlation if possible.

\subsection{Interpretation of the Results}
\begin{table}[ht]
\begin{tabular}{|c|cc|c|c|l|}\hline
 id & mz & rt & isotopes & adduct & pc \\
\hline
		     65	& 176.04 & 280.09 &		& 			&  \\
\rowcolor{blue!20}   76	& 136.05 & 280.43 &[14][M+1]1+ 	& 			& 5\\
\rowcolor{blue!20}   77	& 135.05 & 280.43 &[14][M]1+ 	& 			& 5\\
\rowcolor{blue!20}   74	& 153.06 & 280.43 & 		&[M+H]+ 152.05437 	& 5\\
\rowcolor{blue!20}   75	& 175.04 & 280.43 & 		&[M+Na]+ 152.05437 	& 5\\
\rowcolor{blue!20}   73	& 197.02 & 280.76 & 		&[M+2Na-H]+ 152.05437 	& 5\\
		     78	& 377.74 & 286.15 &		&			&  \\
  		     79	& 732.5  & 286.49 &		&			&  \\
\rowcolor{red!20}    83	& 488.32 & 286.82 &		&[M+Na]+ 465.33205 	& 7\\
\rowcolor{red!20}    82	& 466.34 & 286.82 &		&[M+H]+ 465.33205 	& 7\\
...&&&&&\\
\hline
\end{tabular}
     \caption{{\footnotesize{Example of annotation results. Columns with intensity values are omitted. \newline blue-line: annotated group 5, red-line: annotated group 7}}} \label{tab:int}
\end{table}

%\rowcolor{blue!20} 1594	& 149.02 & 746.02 & &\small [36] [M+1]1+ & 7 \\
%\rowcolor{red!20} 36	& 150.03 & 746.02 & & &	7 \\
%155	& 205.09 & 745.69 & & & 7 \\
%323	& 279.05 & 745.69 & & &	7 \\
%\rowcolor{blue!20} 359	& 279.16 & 746.02 &\small [M+H]+ 278.15 &\small [419] [M+1]1+ & 7 \\
%\rowcolor{red!20} 419	& 280.16 & 746.02 & & &	7 \\
%459	& 297.06 & 746.02 & & &	7 \\
%\rowcolor{blue!20} 1592	& 301.14 & 746.02 &\small [M+Na]+ 278.15 &\small [1593] [M+1]1+ [426] [M+2]1+ & 7\\
%\rowcolor{red!20} 1593	& 302.14 & 746.02 & & & 7\\
%\rowcolor{red!20} 426	& 303.15 & 746.02 & & & 7\\
%\rowcolor{blue!20} 446	& 317.12 & 745.69 &\small [M+K]+ 278.15 &\small [500] [M+1]1+ & 7\\
%\rowcolor{red!20} 500	& 318.12 & 746.02 & & &	7 \\
%\rowcolor{blue!20} 623	& 363.11 & 746.02 & &\small [642] [M+1]1+ & 7\\
%\rowcolor{red!20} 642	& 364.11 & 746.36 & & &	7 \\
%597	& 381.12 & 746.02 & & &	7 \\
%\rowcolor{blue!20} 756	& 579.29 & 746.02 &\small [2M+Na]+ 278.15 &\small [758] [M+1]1+ [760] [M+2]1+ & 7\\
%\rowcolor{red!20} 758	& 580.29 & 746.02 & & & 7\\
%\rowcolor{red!20} 760	& 581.30 & 745.69 & & &	7 \\
Table~\ref{tab:int} shows an example of annotation results. A small
excerpt of the result table is displayed; the columns with the
intensity values are omitted and the rows are ordered by their rt values
 for better readability. The column \textit{pc} shows the result of the peak correlation based
annotation (independent of the annotations \textit{isotopes} and
\textit{adduct}). Peaks with the same label are supposed to belong to
the same spectrum. The column \textit{adduct} shows the annotation hypotheses for the ions. The value after the brackets
is the estimated molecular mass.

The column \textit{isotopes} contains the annotated isotopes for a monoisotopic peak. The value in the first square brackets denotes the
isotope-group-id (column \textit{id}), the second is the isotope annotation and the number after the brackets is the charge of the isotope.

% TODO: NYI
% \subsection{Visualisation}
%
% After the inspection of the annotated peaklist one might be interested
% in visualising a group of peaks, using the row numbers as indices.
% \begin{verbatim}
% plot.pspcetra(xs_annotate,idx=c(289,287,314,320))
% \end{verbatim}

%}}}

% \section{Metabolite Identification}
%{{{

% Sooner or later you want to know which metabolites have been
% measured. The package supports two approaches with the help of the \Rpackage{Rdisop}:


% TODO JOE: Bitte mal deine beiden Funktionen collectIsotopes und von der anderen fällt mir gerade der Name nicht ein,
% beschreiben und deren Handling mit Rdisop

% \begin{description}
% \item{A targeted} search based on a list of compounds or spectra. You
%  can search for the occurence of any compounds known in e.g. KEGG,
%  or an in-house library of known spectra.
% \item{An untargeted} approach which collects as many isotope patterns
%  as possible for a compound, and performs a \denovo{}
%  interpretation. Optionally followed by a check whether the compound
%  is known in e.g. PubChem.
% \end{description}
%
% \subsection{Compound library lookup}
%
%If you have a set of metabolites of interest, you might want to know
%whether they are present in an acquired profile run, and if so, their
%intensities.
%
%The best source for {\em recognition} would be a library of known
%spectra obtained specifically for your experimental setting. This
%might even include retention times. The second best approach is to
%calculate a set of expected pseudo-spectra resembling a molecule of
%interest.
%
%Finally, the (pseudo-) spectra have to be matched against the XCMS
%peak list.
%
%\subsection{\denovo{} interpretation}
%
%The \denovo{} identification is based on the package \Rpackage{Rdisop}.
%Based on the exact mass and optionally the intensities of the isotopes,
%hypotheses for the molecular formula can be calculated:
%
%<<>>=
%library(Rdisop)
%decomposeMass(46.042)
%@
%
%However, this works only for the molecular mass, and not the ions
%measured in ESI-MS. So we first annotate the xcmsSet, and obtain
%the adduct and isotope list:
%
%<<>>=
%# library(xcms)
%# library(esi)
%# cdfpath <- system.file("cdf", package = "faahKO")
%# cdffiles <- list.files(cdfpath, recursive = TRUE,full=T)
%
%# xset <- xcmsSet(cdffiles,snthresh=3,max=10)
%# xsg <- group(xset)
%# xsg <- retcor(xsg)
%# xsg <- group(xsg,bw=10)
%
%# xsann <- annotate(xsg)
%@
%
%We now extract those peaks, for which we can guesstimate the [M] Mass,
%and collect their isotope peaks:
%
%<<>>=
%# createPseudoSpectra(xs, xsann)
%@
%
%For some of the monoisotopic peaks we might not have the higher
%isotope peaks. So optionally we check back in the raw data if they
%actually do exist:
%
%<<>>=
%# fillSpectrumPeaks(xs, expectedPeaks)
%@
%
%
%If they are known adducts, we should remove these adducts first:
%
%<<>>=
%# subMolecule(, adduct)
%@
%
%Now we are ready to ask \Rpackage{Rdisop} for the molecular formulae:
%
%<<>>=
%# decomposeIsotope(isotopesMatrix)
%@
%
%Optionally, we can check whether these molecule are known in some
%compound library. A few lookup-functions are predefined:
%
%<<>>=
%# lookupPubchem()
%@
%
%and also
%
%<<>>=
%# library(KEGGSOAP)
%# lookupKEGG()
%@
%
%You might now ant to use these compounds as legend for your exported
%data matrix, in your clustered heatmap, or whereever.
%
%}}}

\section{Examples using CAMERA test dataset}
\label{section:example}
%{{{

\textbf{Example 1 }
Fast annotation of the MM14 dataset without further use of the xsAnnotate object.
\begin{verbatim}
  library(CAMERA)
  file <- system.file('mzdata/MM14.mzdata', package = "CAMERA")
  xs <- xcmsSet(file,method="centWave",ppm=30,peakwidth=c(5,10))
  an <- annotate(xs)
  peaklist <- getPeaklist(an)
  write.csv(peaklist,'/tmp/mm14.csv')
\end{verbatim}
%xset <- xcmsSet(cdffiles,snthresh=6,method="centWave",ppm=30,peakwidth=c(5,10))
There are 126 peaks in 10 groups, of which 48 peaks get isotope annotations and 25
peaks are annotated as adducts. The number of adduct-annotated peaks can be obtained with \texttt{length(which(peaklist[,"adduct"]!=""))}.

\bigskip
\textbf{Example 2 }
Annotation with explicit use of an xsAnnotate object.
\begin{verbatim}
 library(CAMERA)
 cdfpath <- system.file("cdf", package = "faahKO")
 cdffiles <- list.files(cdfpath, recursive = TRUE,full=T)

 xset <- xcmsSet(cdffiles,snthresh=3,max=10)
 xsg <- group(xset)
 xsg <- retcor(xsg)
 xsg <- group(xsg,bw=10)
 
 #create xsAnnotate object
 xanno<-xsAnnotate(xsg,sample=1,category="WT")
 
 #group according to retention time
 xanno<-groupFWHM(xanno)
 
 #check grouping with EIC correlation, when indicated regroup
 xanno<-groupCorr(xanno)

 #search for isotopes
 xanno<-findIsotopes(xanno)

 #calculate possible adducts
 xanno<-findAdducts(xanno,polarity="positive")

 #get annotated peaklist
 an<-getPeaklist(xanno)
 write.csv(an,'/tmp/faah-an2.csv')
\end{verbatim}
There are 1829 peaks in 221 groups, of which 126 peaks get isotope annotations and
126 peaks are annotated as adducts.

%\bigskip
%\textbf{Example 3 }
%Annotation with preceding EIC extraction. The most comprehensive
%verification method is used and the peak-correlation-only based
%annotations can be computed.
%\begin{verbatim}
% library(esi)
% cdfpath <- system.file("cdf", package = "faahKO")
% cdffiles <- list.files(cdfpath, recursive = TRUE,full=T)
%
% xset <- xcmsSet(cdffiles,snthresh=3,max=10)
% xsg <- group(xset)
% xsg <- retcor(xsg)
% xsg <- group(xsg,bw=10)
%
% an <- annotate(xsg, gcmfrac=.4, cor_exp_th=0.5, cor_eic_th=0.7, na.ok=F,pc=TRUE)
% write.csv(an$annotated,'/tmp/faah-an3-gc.csv')
%\end{verbatim}
%More verifications are performed, 200 peaks get isotope annotations
%and 221 peaks are annotated as adducts/fragments.  76 peak correlation
%groups are found.

%}}}

\begin{thebibliography}{t1}
\bibitem{annobird07} Ralf Tautenhahn, Christoph B\"ottcher, Steffen
  Neumann: Annotation of LC/ESI--MS Mass Signals, Proc. of
  BIRD 2007 -- 1st International Conference on Bioinformatics Research
  and Development, 2007.
  \url{http://www.springerlink.com/content/473l404001787974/}
  and \url{http://msbi.ipb-halle.de/~rtautenh/bird07.pdf}
%\bibitem{disopwabi06} B\"ocker
\end{thebibliography}

\end{document}