## ----setup, echo=FALSE--------------------------------------------------------
options(width=80)

## ----library_install, message=FALSE, cache=FALSE, eval=FALSE------------------
#  source("http://bioconductor.org/biocLite.R")
#  biocLite("GenomicScores")

## ----library_upload, message=FALSE, warning=FALSE, cache=FALSE----------------
library(GenomicScores)

## ---- message=FALSE, warning=FALSE, cache=FALSE-------------------------------
library(phastCons100way.UCSC.hg19)
gsco <- phastCons100way.UCSC.hg19
class(gsco)

## -----------------------------------------------------------------------------
scores(gsco, GRanges(seqnames="chr22", IRanges(start=50967020:50967025, width=1)))

## -----------------------------------------------------------------------------
gsco

## -----------------------------------------------------------------------------
citation(gsco)

## -----------------------------------------------------------------------------
provider(gsco)
providerVersion(gsco)
organism(gsco)
seqlevelsStyle(gsco)

## ---- echo=FALSE--------------------------------------------------------------
avgs <- readRDS(system.file("extdata", "avgs.rds", package="GenomicScores"))

## ----retrieve2, message=FALSE, cache=FALSE, eval=FALSE------------------------
#  availableGScores()

## ---- echo=FALSE--------------------------------------------------------------
avgs

## ----retrieve3, message=FALSE, cache=FALSE, eval=FALSE------------------------
#  gsco <- getGScores("phastCons100way.UCSC.hg19")

## ----retrieve4, message=FALSE, cache=FALSE------------------------------------
scores(gsco, GRanges(seqnames="chr22", IRanges(start=50967020:50967025, width=1)))

## ----eval=FALSE---------------------------------------------------------------
#  makeGScoresPackage(gsco, maintainer="Me <me@example.com>", author="Me", version="1.0.0")

## ----echo=FALSE---------------------------------------------------------------
cat("Creating package in ./phastCons100way.UCSC.hg19\n")

## ---- echo=FALSE--------------------------------------------------------------
obj <- readRDS(system.file("extdata", "mcap.v1.0.hg19.chr22.rds", package="GenomicScores"))
mdobj <- metadata(obj)
mcap <- GScores(provider=mdobj$provider,
                provider_version=mdobj$provider_version,
                download_url=mdobj$download_url,
                download_date=mdobj$download_date,
                reference_genome=mdobj$reference_genome,
                data_pkgname=mdobj$data_pkgname,
                data_dirpath="../inst/extdata",
                data_serialized_objnames=c(mcap.v1.0.hg19.chr22.rds="mcap.v1.0.hg19.chr22.rds"))
scorlelist <- get(mdobj$data_pkgname, envir=mcap@.data_cache)
scorlelist[[mdobj$seqname]] <- obj
assign(mdobj$data_pkgname, scorlelist, envir=mcap@.data_cache)

## ---- eval=FALSE--------------------------------------------------------------
#  mcap <- getGScores("mcap.v1.0.hg19")

## -----------------------------------------------------------------------------
mcap
citation(mcap)
gr <- GRanges(seqnames="chr22", IRanges(50967020:50967025, width=1))
scores(mcap, gr)

## ---- message=FALSE-----------------------------------------------------------
library(BSgenome.Hsapiens.UCSC.hg19)

refAlleles <- as.character(getSeq(Hsapiens, gr))
altAlleles <- DNA_BASES[(match(refAlleles, DNA_BASES)) %% 4 + 1]
cbind(REF=refAlleles, ALT=altAlleles)
scores(mcap, gr, ref=refAlleles, alt=altAlleles)

## -----------------------------------------------------------------------------
gr1 <- GRanges(seqnames="chr22", IRanges(start=50967020:50967025, width=1))
gr1 <- scores(gsco, gr1)
gr1
mean(gr1$scores)
gr2 <- GRanges(seqnames="chr22", IRanges(start=50967020, width=6))
scores(gsco, gr2)

## -----------------------------------------------------------------------------
scores(gsco, gr2, summaryFun=max)
scores(gsco, gr2, summaryFun=min)
scores(gsco, gr2, summaryFun=median)

## -----------------------------------------------------------------------------
phastqfun <- qfun(gsco)
phastqfun
phastdqfun <- dqfun(gsco)
phastdqfun

## -----------------------------------------------------------------------------
gr1 <- scores(gsco, gr1, quantized=TRUE)
gr1

## -----------------------------------------------------------------------------
phastdqfun(gr1$scores)

## ---- message=FALSE-----------------------------------------------------------
library(MafDb.1Kgenomes.phase1.hs37d5)

mafdb <- MafDb.1Kgenomes.phase1.hs37d5
mafdb

populations(mafdb)

## -----------------------------------------------------------------------------
citation(mafdb)
provider(mafdb)
providerVersion(mafdb)
organism(mafdb)
seqlevelsStyle(mafdb)

## ---- message=FALSE-----------------------------------------------------------
library(gwascat)
data(ebicat37)
eyersids <- subsetByTraits(ebicat37, tr="Eye color")$SNPS
eyersids

## ---- message=FALSE-----------------------------------------------------------
library(SNPlocs.Hsapiens.dbSNP144.GRCh37)
rng <- snpsById(SNPlocs.Hsapiens.dbSNP144.GRCh37, ids=eyersids)
rng

## -----------------------------------------------------------------------------
eyecolormafs <- mafByOverlaps(mafdb, rng, c("AF", "EUR_AF", "AFR_AF"))
eyecolormafs

## ----eyecolormafs, fig.cap = "Eye color MAFs. Minor allele frequencies (MAFs) of variants associated with eye color for global, european and african populations of the Phase 1 data from the 1000 Genomes Project.", fig.height=5, fig.wide = TRUE, echo=TRUE----
par(mar=c(3, 5, 1, 1))
bp <- barplot(t(as.matrix(mcols(eyecolormafs))), beside=TRUE,
              col=c("darkblue", "darkgreen", "darkorange"),
              ylim=c(0, 0.55), las=1, ylab="Minor allele frequency")
axis(1, bp[2, ], 1:length(eyecolormafs))
mtext("Variant", side=1, line=2)
legend("topright", colnames(mcols(eyecolormafs)), fill=c("darkblue", "darkgreen", "darkorange"))

## ---- message=FALSE-----------------------------------------------------------
library(VariantAnnotation)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)

## -----------------------------------------------------------------------------
fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
vcf <- readVcf(fl, "hg19")
seqlevelsStyle(vcf)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
seqlevelsStyle(txdb)

## -----------------------------------------------------------------------------
seqlevelsStyle(vcf) <- seqlevelsStyle(txdb)

## ---- message=FALSE-----------------------------------------------------------
loc <- locateVariants(vcf, txdb, AllVariants())
loc[1:3]
table(loc$LOCATION)

## -----------------------------------------------------------------------------
loc$PHASTCONS <- scores(gsco, loc, scores.only=TRUE)
loc[1:3]

## ----plot1, fig.cap = "Distribution of phastCons conservation scores in variants across different annotated regions. Diamonds indicate mean values.", echo = FALSE, fig.height=5, fig.wide = TRUE, echo=TRUE----
x <- split(loc$PHASTCONS, loc$LOCATION)
mask <- elementNROWS(x) > 0
boxplot(x[mask], ylab="phastCons score", las=1, cex.axis=1.2, cex.lab=1.5, col="gray")
points(1:length(x[mask])+0.25, sapply(x[mask], mean, na.rm=TRUE), pch=23, bg="black")

## -----------------------------------------------------------------------------
maskSNVs <- isSNV(vcf)[loc$QUERYID]
loc$MCAP <- rep(NA_real_, length(loc))
loc$MCAP[maskSNVs] <- scores(mcap, loc[maskSNVs],
                             ref=ref(vcf)[loc$QUERYID[maskSNVs]],
                             alt=alt(vcf)[loc$QUERYID[maskSNVs]],
                             scores.only=TRUE)

## ---- echo=FALSE--------------------------------------------------------------
obj <- readRDS(system.file("extdata", "cadd.v1.3.hg19.chr22sub.rds", package="GenomicScores"))
mdobj <- metadata(obj)
cadd <- GScores(provider=mdobj$provider,
                provider_version=mdobj$provider_version,
                download_url=mdobj$download_url,
                download_date=mdobj$download_date,
                reference_genome=mdobj$reference_genome,
                data_pkgname=mdobj$data_pkgname,
                data_dirpath="../inst/extdata",
                data_serialized_objnames=c(cadd.v1.3.hg19.chr22.rds="cadd.v1.3.hg19.chr22sub.rds"))
scorlelist <- get(mdobj$data_pkgname, envir=cadd@.data_cache)
scorlelist[[mdobj$seqname]] <- obj
assign(mdobj$data_pkgname, scorlelist, envir=cadd@.data_cache)

## -----------------------------------------------------------------------------
maskSNVs <- isSNV(vcf)[loc$QUERYID]
loc$CADD <- rep(NA_real_, length(loc))
loc$CADD[maskSNVs] <- scores(cadd, loc[maskSNVs],
                             ref=ref(vcf)[loc$QUERYID[maskSNVs]],
                             alt=alt(vcf)[loc$QUERYID[maskSNVs]],
                             scores.only=TRUE)

## ----mcapvscadd, fig.cap = "Comparison of M-CAP and CADD scores. Values on the x- and y-axis are jittered to facilitate visualization.", echo = FALSE, fig.height=5, fig.width=7, dpi=100, echo=TRUE----
library(RColorBrewer)
par(mar=c(4, 5, 1, 1))
hmcol <- colorRampPalette(brewer.pal(nlevels(loc$LOCATION), "Set1"))(nlevels(loc$LOCATION))
plot(jitter(loc$MCAP, factor=2), jitter(loc$CADD, factor=2), pch=19,
     col=hmcol, xlab="M-CAP scores", ylab="CADD scores",
     las=1, cex.axis=1.2, cex.lab=1.5, panel.first=grid())
legend("bottomright", levels(loc$LOCATION), pch=19, col=hmcol, inset=0.01)

## -----------------------------------------------------------------------------
seqlevelsStyle(loc) <- seqlevelsStyle(mafdb)[1]
seqinfo(loc, new2old=match(seqlevels(mafdb), seqlevels(loc))) <- seqinfo(mafdb)
loc$MAF[width(loc) == 1] <- mafByOverlaps(mafdb, loc[width(loc) == 1])$AF
loc$MAF[width(loc) > 1] <- mafByOverlaps(mafdb, loc[width(loc) > 1], type="nonsnvs")$AF
loc[1:3]

## ----showpositions, message=FALSE, cache=FALSE--------------------------------
origpcscoCDS <- readRDS(system.file("extdata", "origphastCons100wayhg19CDS.rds", package="GenomicScores"))
origpcscoCDS

length(unique(origpcscoCDS$score))

## -----------------------------------------------------------------------------
numDecimals <- function(x) {
  spl <- strsplit(as.character(x+1), "\\.")
  spl <- sapply(spl, "[", 2)
  spl[is.na(spl)] <- ""
  nchar(spl)
}

nd1 <- numDecimals(origpcscoCDS$score)
table(nd1)

## ----showpositions2, message=FALSE, cache=FALSE-------------------------------
origpcsco3UTRs <- readRDS(system.file("extdata", "origphastCons100wayhg193UTR.rds", package="GenomicScores"))
origpcsco3UTRs

length(table(origpcsco3UTRs$score))

nd2 <- numDecimals(origpcsco3UTRs$score)
table(nd2)

## -----------------------------------------------------------------------------
pkgpcscoCDS <- scores(gsco, origpcscoCDS, scores.only=TRUE)
pkgpcsco3UTRs <- scores(gsco, origpcsco3UTRs, scores.only=TRUE)

## ----plot2, fig.height=9, fig.width=8, dpi=100, fig.cap = "Original and lossy-compressed phastCons scores. Top panels (a, b): comparison of the distribution of values. Bottom panels (c, d): comparison of the cumulative distribution", echo = FALSE----
labelPanel <- function(lab, font=2, cex=2, offsetx=0.05, offsety=0.05) {
  par(xpd=TRUE)
  w <- par("usr")[2] - par("usr")[1]
  h <- par("usr")[4] - par("usr")[3]
  text(par("usr")[1]-w*offsetx, par("usr")[4]+h*offsety, lab, font=font, cex=cex)
  par(xpd=FALSE)
}

par(mfrow=c(2, 2), mar=c(4, 5, 2, 1))
plot(origpcscoCDS$score, jitter(pkgpcscoCDS), pch=19, cex=1, cex.lab=1.2,
     xaxt="n", yaxt="n", xlab="Original phastCons scores (CDS)",
     ylab="Compressed phastCons scores (CDS)")
axis(1, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1)
axis(2, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1)
abline(h=seq(0, 1, by=0.1), v=seq(0, 1, by=0.1), lty=3, col="gray")
abline(0, 1)
labelPanel(letters[1])
plot(origpcsco3UTRs$score, jitter(pkgpcsco3UTRs), pch=19, cex=1, cex.lab=1.2,
     xaxt="n", yaxt="n", xlab="Original phastCons scores (3' UTR)",
     ylab="Compressed phastCons scores (3' UTR)")
axis(1, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1)
axis(2, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1)
abline(h=seq(0, 1, by=0.1), v=seq(0, 1, by=0.1), lty=3, col="gray")
abline(0, 1)
labelPanel(letters[2])
ForigCDS <- ecdf(origpcscoCDS$score)
FpkgCDS <- ecdf(pkgpcscoCDS)
plot(sort(origpcscoCDS$score), ForigCDS(sort(origpcscoCDS$score)), xaxt="n", yaxt="n", cex.lab=1.2,
     pch=".", cex=4, xlab="phastCons scores (CDS)", ylab="F(x)", ylim=c(0, 1))
axis(1, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1)
axis(2, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1)
abline(h=seq(0, 1, by=0.1), v=seq(0, 1, by=0.1), lty=3, col="gray")
points(sort(pkgpcscoCDS), FpkgCDS(sort(pkgpcscoCDS)), pch=19, cex=1)
legend("topleft", c("Original score", "Rounded score"), pch=c(46, 19),
       pt.cex=c(4, 1), inset=0.01, bg="white")
labelPanel(letters[3])
Forig3UTRs <- ecdf(origpcsco3UTRs$score)
Fpkg3UTRs <- ecdf(pkgpcsco3UTRs)
plot(sort(origpcsco3UTRs$score), Forig3UTRs(sort(origpcsco3UTRs$score)), xaxt="n", yaxt="n", cex.lab=1.2,
     pch=".", cex=4, xlab="phastCons scores (3'UTR)", ylab="F(x)", ylim=c(0, 1))
axis(1, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1)
axis(2, at=seq(0, 1, by=0.1), labels=seq(0, 1, by=0.1), las=1)
abline(h=seq(0, 1, by=0.1), v=seq(0, 1, by=0.1), lty=3, col="gray")
points(sort(pkgpcsco3UTRs), Fpkg3UTRs(sort(pkgpcsco3UTRs)), pch=19, cex=1)
legend("topleft", c("Original score", "Rounded score"), pch=c(46, 19),
       pt.cex=c(4, 1), inset=0.01, bg="white")
labelPanel(letters[4])

## ----session_info, cache=FALSE------------------------------------------------
sessionInfo()