\name{anomDetectBAF}
\alias{anomSegmentBAF}
\alias{anomFilterBAF}
\alias{anomDetectBAF}
\title{ BAF Method for Chromosome Anomaly Detection }
\description{ \code{anomSegmentBAF}, for each sample and chromosome, breaks the chromosome up into segments marked by change points of a metric based on B Allele Frequency (BAF) values. \code{anomFilterBAF} selects segments which are likely to be anomalous. \code{anomDetectBAF} is a wrapper to run \code{anomSegmentBAF} and \code{anomFilterBAF} in one step. }
\usage{
anomSegmentBAF(intenData, genoData, scan.ids, chrom.ids, snp.ids,
  smooth = 50, min.width = 5, nperm = 10000, alpha = 0.001,
  verbose = TRUE)
anomFilterBAF(intenData, genoData, segments, snp.ids, centromere,
  low.qual.ids = NULL, num.mark.thresh = 15, long.num.mark.thresh = 200,
  sd.reg = 2, sd.long = 1, low.frac.used = 0.1, run.size = 10,
  inter.size = 2, low.frac.used.num.mark = 30, very.low.frac.used = 0.01,
  low.qual.frac.num.mark = 150, lrr.cut = -2, ct.thresh = 10,
  frac.thresh = 0.1, verbose=TRUE)
anomDetectBAF(intenData, genoData, scan.ids, chrom.ids, snp.ids,
  centromere, low.qual.ids = NULL, ...)
}
\arguments{
\item{intenData}{ An \code{\link{IntensityData}} object containing the B Allele Frequency. The order of the rows of intenData and the snp annotation are expected to be by chromosome and then by position within chromosome. The scan annotation should contain sex, coded as "M" for male and "F" for female. }
\item{genoData}{ A \code{\link{GenotypeData}} object. The order of the rows of genoData and the snp annotation are expected to be by chromosome and then by position within chromosome. }
\item{scan.ids}{ vector of scan ids (sample numbers) to process }
\item{chrom.ids}{ vector of (unique) chromosomes to process. Should correspond to integer chromosome codes in \code{intenData}. Recommended to include all autosomes, and optionally X (males will be ignored) and the pseudoautosomal (XY) region. }
\item{snp.ids}{ vector of eligible snp ids.
Usually exclude failed and intensity-only SNPs. Also recommended to exclude an HLA region on chromosome 6 and XTR region on X chromosome. See \code{\link{HLA}} and \code{\link{pseudoautosomal}}. If there are SNPs annotated in the centromere gap, exclude these as well (see \code{\link{centromeres}}). }
\item{smooth}{ number of markers for smoothing region. See \code{\link{smooth.CNA}} in the \code{\link{DNAcopy}} package. }
\item{min.width}{ minimum number of markers for a segment. See \code{\link{segment}} in the \code{\link{DNAcopy}} package. }
\item{nperm}{ number of permutations for deciding significance in segmentation. See \code{\link{segment}} in the \code{\link{DNAcopy}} package. }
\item{alpha}{ significance level. See \code{\link{segment}} in the \code{\link{DNAcopy}} package. }
\item{verbose}{ logical indicator whether to print information about the scan id currently being processed. anomSegmentBAF prints each scan id; anomFilterBAF prints a message after every 10 samples: "processing ith scan id out of n" where "ith" will be 10, 20, etc. and "n" is the total number of samples. }
\item{segments}{ data.frame of segments from \code{anomSegmentBAF}. Names must include "scanID", "chromosome", "num.mark", "left.index", "right.index", "seg.mean". Here "left.index" and "right.index" are row indices of intenData. Left and right refer to start and end of anomaly, respectively, in position order. }
\item{centromere}{ data.frame with centromere position information. Names must include "chrom", "left.base", "right.base". Valid values for "chrom" are 1:22, "X", "Y", "XY". Here "left.base" and "right.base" are base positions of start and end of centromere location in position order. Centromere data tables are provided in \code{\link{centromeres}}. }
\item{low.qual.ids}{ scan ids determined to be low quality for which some segments are filtered based on more stringent criteria. Default is NULL. The usual choice is the set of scan ids for which the median BAF standard deviation across autosomes > 0.05.
See \code{\link{sdByScanChromWindow}} and \code{\link{medianSdOverAutosomes}}. }
\item{num.mark.thresh}{ minimum number of SNP markers in a segment to be considered for anomaly }
\item{long.num.mark.thresh}{ minimum number of markers for "long" segment to be considered for anomaly for which significance threshold criterion is allowed to be less stringent }
\item{sd.reg}{ number of baseline standard deviations of segment mean from a baseline mean for "normal" needed to declare segment anomalous. This number is given by abs(mean of segment - baseline mean)/(baseline standard deviation) }
\item{sd.long}{ same meaning as \code{sd.reg} but applied to "long" segments }
\item{low.frac.used}{ if fraction of heterozygous or missing SNP markers compared with number of eligible SNP markers in segment is below this, more stringent criteria are applied to declare the segment anomalous. }
\item{run.size}{ minimum length of run of missing or heterozygous SNP markers for possible determination of homozygous deletions }
\item{inter.size}{ number of homozygotes allowed to "interrupt" run for possible determination of homozygous deletions }
\item{low.frac.used.num.mark}{ number of markers threshold for \code{low.frac.used} segments (which are not declared homozygous deletions) }
\item{very.low.frac.used}{ any segments with (num.mark)/(number of markers in interval) less than this are filtered out since they tend to be false positives }
\item{low.qual.frac.num.mark}{ minimum num.mark threshold for low quality scans (\code{low.qual.ids}) for segments that are also below \code{low.frac.used} threshold }
\item{lrr.cut}{ look for runs of LRR values below \code{lrr.cut} to adjust homozygous deletion endpoints }
\item{ct.thresh}{ minimum number of LRR values below \code{lrr.cut} needed in order to adjust }
\item{frac.thresh}{ investigate interval for homozygous deletion only if \code{lrr.cut} and \code{ct.thresh} thresholds met and (# LRR values below \code{lrr.cut})/(# eligible SNPs in segment) >
\code{frac.thresh} }
\item{...}{ arguments to pass to \code{anomFilterBAF} }
}
\details{ \code{anomSegmentBAF} uses the function \code{\link{segment}} from the \code{DNAcopy} package to perform circular binary segmentation on a metric based on BAF values. The metric for a given sample/chromosome is \code{sqrt(min(BAF, 1-BAF, abs(BAF-median(BAF))))} where the median is across BAF values on the chromosome. Only BAF values for heterozygous or missing SNPs are used.
\code{anomFilterBAF} determines anomalous segments based on a combination of thresholds for number of SNP markers in the segment and on deviation from a "normal" baseline. (See \code{num.mark.thresh}, \code{long.num.mark.thresh}, \code{sd.reg}, and \code{sd.long}.) The "normal" baseline metric mean and standard deviation are found across all autosomes not segmented by \code{anomSegmentBAF}. This is why it is recommended to include all autosomes for the argument \code{chrom.ids} to ensure a more accurate baseline. Some initial filtering is done, including possible merging of consecutive segments meeting \code{sd.reg} threshold along with other criteria (such as not spanning the centromere) and adjustment for accurate break points for possible homozygous deletions (see \code{lrr.cut}, \code{ct.thresh}, \code{frac.thresh}, \code{run.size}, and \code{inter.size}). Male samples for X chromosome are not processed. More stringent criteria are applied to some segments (see \code{low.frac.used}, \code{low.frac.used.num.mark}, \code{very.low.frac.used}, \code{low.qual.ids}, and \code{low.qual.frac.num.mark}).
\code{anomDetectBAF} runs \code{anomSegmentBAF} with default values and then runs \code{anomFilterBAF}. Additional parameters for \code{anomFilterBAF} may be passed as arguments. }
\value{ \code{anomSegmentBAF} returns a data.frame with the following elements: Left and right refer to start and end of anomaly, respectively, in position order.
\item{scanID}{integer id of scan} \item{chromosome}{chromosome as integer code} \item{left.index}{row index of intenData indicating left endpoint of segment} \item{right.index}{row index of intenData indicating right endpoint of segment} \item{num.mark}{number of heterozygous or missing SNPs in the segment } \item{seg.mean}{mean of the BAF metric over the segment } \code{anomFilterBAF} and \code{anomDetectBAF} return a list with the following elements: \item{raw}{data.frame of raw segmentation data, with same output as \code{anomSegmentBAF} as well as: \itemize{ \item \code{left.base}: base position of left endpoint of segment \item \code{right.base}: base position of right endpoint of segment \item \code{sex}: sex of scan.id coded as "M" or "F" \item \code{sd.fac}: measure of deviation from baseline equal to abs(mean of segment - baseline mean)/(baseline standard deviation); used in determining anomalous segments } } \item{filtered}{data.frame of the segments identified as anomalies, with the same columns as \code{raw} as well as: \itemize{ \item \code{merge}: TRUE if segment was a result of merging. Consecutive segments from output of \code{anomSegmentBAF} that meet certain criteria are merged. \item \code{homodel.adjust}: TRUE if original segment was adjusted to narrow in on a homozygous deletion \item \code{frac.used}: fraction of (eligible) heterozygous or missing SNP markers compared with total number of eligible SNP markers in segment } } \item{base.info}{data frame with columns: \itemize{ \item \code{scanID}: integer id of scan \item \code{base.mean}: mean of non-anomalous baseline. This is the mean of the BAF metric for heterozygous and missing SNPs over all unsegmented autosomes that were considered. 
\item \code{base.sd}: standard deviation of non-anomalous baseline
\item \code{chr.ct}: number of unsegmented chromosomes used in determining the non-anomalous baseline } }
\item{seg.info}{data frame with columns: \itemize{
\item \code{scanID}: integer id of scan
\item \code{chromosome}: chromosome as integer
\item \code{num.segs}: number of segments produced by \code{anomSegmentBAF} } }
}
\references{
%% ~put references to the literature/web site here ~
See references in \code{\link{segment}} in the package \code{\link{DNAcopy}}. The BAF metric used is modified from Itsara, A., \emph{et al.} (2009) Population Analysis of Large Copy Number Variants and Hotspots of Human Genetic Disease. \emph{American Journal of Human Genetics}, \bold{84}, 148--161.
}
\author{ Cecelia Laurie }
\note{ It is recommended to include all autosomes as input. This ensures a more accurate determination of baseline information. }
\seealso{ \code{\link{segment}} and \code{\link{smooth.CNA}} in the package \code{\link{DNAcopy}}, also \code{\link{findBAFvariance}}, \code{\link{anomDetectLOH}} }
\examples{
library(GWASdata)
data(illuminaScanADF)
data(illuminaSnpADF)
blfile <- system.file("extdata", "illumina_bl.nc", package="GWASdata")
blnc <- NcdfIntensityReader(blfile)
blData <- IntensityData(blnc, scanAnnot=illuminaScanADF, snpAnnot=illuminaSnpADF)
genofile <- system.file("extdata", "illumina_geno.nc", package="GWASdata")
genonc <- NcdfGenotypeReader(genofile)
genoData <- GenotypeData(genonc, scanAnnot=illuminaScanADF, snpAnnot=illuminaSnpADF)

# segment BAF
scan.ids <- illuminaScanADF$scanID[1:2]
chrom.ids <- unique(illuminaSnpADF$chromosome)
snp.ids <- illuminaSnpADF$snpID[illuminaSnpADF$missing.n1 < 1]
seg <- anomSegmentBAF(blData, genoData, scan.ids=scan.ids,
  chrom.ids=chrom.ids, snp.ids=snp.ids)

# filter segments to detect anomalies
data(centromeres.hg18)
filt <- anomFilterBAF(blData, genoData, segments=seg, snp.ids=snp.ids,
  centromere=centromeres.hg18)

# alternatively, run both steps at once
anom <- anomDetectBAF(blData, genoData, scan.ids=scan.ids,
  chrom.ids=chrom.ids, snp.ids=snp.ids, centromere=centromeres.hg18)
}
\keyword{manip}