### R code from vignette source 'D4_CalledVariants.Rnw'

###################################################
### code chunk number 1: setup
###################################################
options(digits=2)
library(VariantAnnotation)
library(ggplot2)
library(SNPlocs.Hsapiens.dbSNP.20101109)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
library(BSgenome.Hsapiens.UCSC.hg19)
library(SIFT.Hsapiens.dbSNP132)


###################################################
### code chunk number 2: readVcf
###################################################
library(VariantAnnotation)
fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
(hdr <- scanVcfHeader(fl))
info(hdr)[c("VT", "RSQ"),]


###################################################
### code chunk number 3: readVcf
###################################################
(vcf <- readVcf(fl, "hg19"))
head(rowData(vcf), 3)


###################################################
### code chunk number 4: renameSeqlevels
###################################################
seqlevels(vcf, force=TRUE)  <- c("22"="ch22")


###################################################
### code chunk number 5: vcf-AMI (eval = FALSE)
###################################################
## vcfDir <- "~/BigData/CompleteGenomics"
## dir(vcfDir)
## vcfFile <- dir(vcfDir, ".gz$", full=TRUE)
## stopifnot(length(vcfFile) == 1)


###################################################
### code chunk number 6: D4_CalledVariants.Rnw:214-215 (eval = FALSE)
###################################################
## hdr <- scanVcfHeader(vcfFile)


###################################################
### code chunk number 7: D4_CalledVariants.Rnw:221-223 (eval = FALSE)
###################################################
## info(hdr)
## geno(hdr)


###################################################
### code chunk number 8: chunks (eval = FALSE)
###################################################
## tbx <- TabixFile(vcfFile, yieldSize=10000)
## open(tbx)
## while (len <- nrow(readVcf(tbx, "hg19")))
##     cat("read", len, "rows\n")


###################################################
### code chunk number 9: germline-1 (eval = FALSE)
###################################################
## grepl("Germline", x, fixed=TRUE)


###################################################
### code chunk number 10: FitlerRules
###################################################
isGermline <- function(x)
    grepl("Germline", x, fixed=TRUE)
filters <- FilterRules(list(isGermline=isGermline))


###################################################
### code chunk number 11: prefilter (eval = FALSE)
###################################################
## destination <- tempfile()               # temporary location
## filterVcf(vcfFile, "hg19", destination, prefilters=filters)


###################################################
### code chunk number 12: allelicDepth
###################################################
allelicDepth <- function(x) 
{
    ##  ratio of AD of the 'alternate allele' for the tumor sample
    ##  OR 'reference allele' for normal samples to total reads for
    ##  the sample should be greater than some threshold (say 0.1,
    ##  that is: at least 10% of the sample should have the allele
    ##  of interest)
    ad <- geno(x)[["AD"]]
    tumorPct <- ad[,1,2,drop=FALSE] / rowSums(ad[,1,,drop=FALSE])
    normPct <- ad[,2,1, drop=FALSE] / rowSums(ad[,2,,drop=FALSE])
    test <- (tumorPct > 0.1) | (normPct > 0.1)
    !is.na(test) & test
}


###################################################
### code chunk number 13: filters
###################################################
filters <- FilterRules(list(isGermline=isGermline, 
                            allelicDepth=allelicDepth))


###################################################
### code chunk number 14: filterVcf (eval = FALSE)
###################################################
## destination <- tempfile()
## filterVcf(vcfFile, "hg19", destination, prefilters=filters[1],
##           filters=filters[2])


###################################################
### code chunk number 15: filterVariant-final (eval = FALSE)
###################################################
## vcf <- readVcf(destination, "hg19")
## all(info(vcf)$SS == "Germline")
## table(allelicDepth(vcf))


###################################################
### code chunk number 16: isInDbSNP
###################################################
library(Morgan2013)
isInDbSNP


###################################################
### code chunk number 17: dbSNP (eval = FALSE)
###################################################
## library(SNPlocs.Hsapiens.dbSNP.20101109)
## inDbSNP <- .isInDbSNP(vcf, "ch22")
## table(inDbSNP)


###################################################
### code chunk number 18: SNP-quality (eval = FALSE)
###################################################
## metrics <-
##     data.frame(inDbSNP=inDbSNP, RSQ=info(vcf)$RSQ)


###################################################
### code chunk number 19: RSQ-plot (eval = FALSE)
###################################################
## library(ggplot2)
## ggplot(metrics, aes(RSQ, fill=inDbSNP)) +
##     geom_density(alpha=0.5) +
##     scale_x_continuous(name="MaCH / Thunder Imputation Quality") +
##     scale_y_continuous(name="Density") +
##     theme(legend.position="top")


###################################################
### code chunk number 20: seqlevels_rename
###################################################
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene 

fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
vcf <- readVcf(fl, "hg19")
seqlevels(vcf, force=TRUE) <- c("22"="chr22")


###################################################
### code chunk number 21: locateVariants_find
###################################################
rd <- rowData(vcf)
loc <- locateVariants(rd, txdb, CodingVariants())
head(loc, 3)


###################################################
### code chunk number 22: locateVariants_example
###################################################
## Did any coding variants match more than one gene?
splt <- split(loc$GENEID, loc$QUERYID)
table(sapply(splt, function(x) length(unique(x)) > 1))

## Summarize the number of coding variants by gene ID
splt <- split(loc$QUERYID, loc$GENEID)
head(sapply(splt, function(x) length(unique(x))), 3)


###################################################
### code chunk number 23: predictCoding
###################################################
library(BSgenome.Hsapiens.UCSC.hg19)
coding <- predictCoding(vcf, txdb, seqSource=Hsapiens)
coding[5:9]


###################################################
### code chunk number 24: predictCoding_frameshift
###################################################
coding[coding$CONSEQUENCE == "frameshift"]


###################################################
### code chunk number 25: nonsynonymous
###################################################
nms <- names(coding)
idx <- coding$CONSEQUENCE == "nonsynonymous"
nonsyn <- coding[idx]
names(nonsyn) <- nms[idx]
rsids <- unique(names(nonsyn)[grep("rs", names(nonsyn), fixed=TRUE)])


###################################################
### code chunk number 26: sift
###################################################
library(SIFT.Hsapiens.dbSNP132)

## rsids in the package 
head(keys(SIFT.Hsapiens.dbSNP132), 3)
## list available columns
cols(SIFT.Hsapiens.dbSNP132)
## select a subset of columns
## a warning is thrown when a key is not found in the database
subst <- c("RSID", "PREDICTION", "SCORE", "AACHANGE", "PROTEINID")
sift <- select(SIFT.Hsapiens.dbSNP132, keys=rsids, cols=subst)
head(sift, 3)


###################################################
### code chunk number 27: ensemblVEP-setup
###################################################
CONNECTION_OK <- tryCatch({ 
    con <- url("http://google.com", "r"); close(con); TRUE
}, error=function(...) FALSE)

VEP_OK <- CONNECTION_OK && require(ensemblVEP) && tryCatch({
    file.exists(ensemblVEP:::.getVepPath())
}, error=function(...) FALSE)
stopifnot(VEP_OK)