## ----install_h5vcData, eval=FALSE---------------------------------------------
#  if (!requireNamespace("BiocManager", quietly=TRUE))
#      install.packages("BiocManager")
#  BiocManager::install("h5vcData")

## ----loadPackages-------------------------------------------------------------
suppressPackageStartupMessages(library(h5vc))
suppressPackageStartupMessages(library(rhdf5))

tallyFile <- system.file( "extdata", "example.tally.hfs5", package = "h5vcData" )

## ----listTallyFile------------------------------------------------------------
h5ls(tallyFile)

## ----getSampleData------------------------------------------------------------
sampleData <- getSampleData( tallyFile, "/ExampleStudy/16" )
sampleData

## ----setSampleData------------------------------------------------------------
sampleData$ClinicalVariable <- rnorm(nrow(sampleData))
setSampleData( tallyFile, "/ExampleStudy/16", sampleData )
sampleData

## ----h5readBlockExample-------------------------------------------------------
data <- h5readBlock(
  filename = tallyFile,
  group = "/ExampleStudy/16",
  names = c( "Coverages", "Counts" ),
  range = c(29000000,29001000)
  )
str(data)

## ----h5dapplyGRanges----------------------------------------------------------
suppressPackageStartupMessages(require(GenomicRanges))
data <- h5dapply(
  filename = tallyFile,
  group = "/ExampleStudy",
  names = c( "Coverages" ),
  dims = c(3),
  range = GRanges("16", ranges = IRanges(start = seq(29e6, 30e6, 5e6), width = 1000))
  )
str(data)

## ----h5dapplyExample----------------------------------------------------------
#rangeA <- GRanges("16", ranges = IRanges(start = seq(29e6, 29.5e6, 1e5), width = 1000))
#rangeB <- GRanges("22", ranges = IRanges(start = seq(39e6, 39.5e6, 1e5), width = 1000))
range <- GRanges(
  rep(c("16", "22"), each = 6),
  ranges = IRanges(
    start = c(seq(29e6, 29.5e6, 1e5),seq(39e6, 39.5e6, 1e5)),
    width = 1000
  ))
coverages <- h5dapply(
  filename = tallyFile,
  group = "/ExampleStudy",
  names = c( "Coverages" ),
  dims = c(3),
  range = range,
  FUN = binnedCoverage,
  sampledata = sampleData
  )
#options(scipen=10)
coverages <- do.call( rbind, lapply( coverages, function(x) do.call(rbind, x) ))
#rownames(coverages) <- NULL #remove block-ids used as row-names
coverages

## ----callVariantsExample------------------------------------------------------
variants <- h5dapply(
  filename = tallyFile,
  group = "/ExampleStudy/16",
  names = c( "Coverages", "Counts", "Deletions", "Reference" ),
  range = c(29950000,30000000),
  blocksize = 10000,
  FUN = callVariantsPaired,
  sampledata = sampleData,
  cl = vcConfParams(returnDataPoints = TRUE)
  )
variants <- do.call( rbind, variants )
variants$AF <- (variants$caseCountFwd + variants$caseCountRev) / (variants$caseCoverageFwd + variants$caseCoverageRev)
variants <- variants[variants$AF > 0.2,]
rownames(variants) <- NULL # remove rownames to save some space on output :D
variants

## ----mismatchPlotExample, fig.width=10.5, fig.height=8.5, dpi=300, out.width="750px", fig.retina=1----
windowsize <- 35
position <- variants$Start[2]
data <- h5readBlock(
    filename = tallyFile,
    group = paste( "/ExampleStudy", variants$Chrom[2], sep="/" ),
    names = c("Coverages","Counts","Deletions", "Reference"),
    range = c( position - windowsize, position + windowsize)
  )
patient <- sampleData$Patient[sampleData$Sample == variants$Sample[2]]
samples <- sampleData$Sample[sampleData$Patient == patient]
p <- mismatchPlot(
  data = data,
  sampledata = sampleData,
  samples = samples,
  windowsize = windowsize,
  position = position
  )
print(p)

## ----mismatchPlotExamplesTheming, fig.width=10.5, fig.height=8.5, dpi=300, out.width="750px", fig.show='asis'----
print(p + theme(strip.text.y = element_text(family="serif", size=16, angle=0)))

## -----------------------------------------------------------------------------
suppressPackageStartupMessages(require(IRanges))
suppressPackageStartupMessages(require(biomaRt))
mart <- useDataset(dataset = "hsapiens_gene_ensembl", mart = useMart("ENSEMBL_MART_ENSEMBL", host = "www.ensembl.org"))
exons <- getBM(attributes = c("ensembl_exon_id", "exon_chrom_start", "exon_chrom_end"), filters = c("chromosome_name"), values = c("16"), mart)
exons <- subset(exons, exon_chrom_start > 29e6 & exon_chrom_end < 30e6)
ranges <- IRanges(start = exons$exon_chrom_start, end = exons$exon_chrom_end)
coverages <- h5dapply(
  filename = tallyFile,
  group = "/ExampleStudy/16",
  names = c( "Coverages" ),
  dims = c(3),
  range = ranges,
  FUN = binnedCoverage,
  sampledata = sampleData
  )
options(scipen=10)
coverages <- do.call( rbind, coverages )
rownames(coverages) <- NULL #remove block-ids used as row-names
coverages$ExonID <- exons$ensembl_exon_id
head(coverages)

## ----mismatchPlotRangesExample, fig.width=10.5, fig.height=8.5, dpi=300, out.width="750px", fig.retina=1----
windowsize <- 35
position <- variants$Start[2]
data <- h5dapply(
    filename = tallyFile,
    group = paste( "/ExampleStudy", variants$Chrom[2], sep="/" ),
    names = c("Coverages","Counts","Deletions", "Reference"),
    range = flank( IRanges(start = c(position - 10, position, position + 10), width = 1), width = 15, both = TRUE )
  )
p <- mismatchPlot(
  data = data,
  sampledata = sampleData,
  samples <- c("PT5ControlDNA", "PT5PrimaryDNA", "PT5RelapseDNA", "PT8ControlDNA", "PT8EarlyStageDNA", "PT8PrimaryDNA")
  )
print(p)

## ----loadFiles, eval = .Platform$OS.type == "unix", results="hide"------------
suppressPackageStartupMessages(library("h5vc"))
suppressPackageStartupMessages(library("rhdf5"))
files <- list.files( system.file("extdata", package = "h5vcData"), "Pt.*bam$" )
files
bamFiles <- file.path( system.file("extdata", package = "h5vcData"), files)

## ----chromDim, eval = (.Platform$OS.type == "unix")---------------------------
suppressPackageStartupMessages(library("Rsamtools"))
chromdim <- sapply( scanBamHeader(bamFiles), function(x) x$targets )
colnames(chromdim) <- files
head(chromdim)

## ----hdf5SetUp, eval = .Platform$OS.type == "unix"----------------------------
chrom <- "2"
chromlength <- chromdim[chrom,1]
study <- "/DNMT3A"
tallyFile <- file.path( tempdir(), "DNMT3A.tally.hfs5" )
if( file.exists(tallyFile) ){
  file.remove(tallyFile)
}
if( prepareTallyFile( tallyFile, study, chrom, chromlength, nsamples = length(files) ) ){
  h5ls(tallyFile)
}else{
  message( paste( "Preparation of:", tallyFile, "failed" ) )
}

## ----sampleDataSetUp, eval = .Platform$OS.type == "unix"----------------------
sampleData <- data.frame(
  File = files,
  Type = "Control",
  stringsAsFactors = FALSE
  )
sampleData$Sample <- gsub(x = sampleData$File, pattern = ".bam", replacement = "")
sampleData$Patient <- substr(sampleData$Sample, start = 1, 4)
sampleData$Column <- seq_along(files)
sampleData$Type[grep(pattern = "Cancer", x = sampleData$Sample)] <- "Case"
group <- paste( study, chrom, sep = "/" )
setSampleData( tallyFile, group, sampleData )
getSampleData( tallyFile, group )

## ----tallyingTheBamFiles, eval = .Platform$OS.type == "unix"------------------
suppressPackageStartupMessages(require(BSgenome.Hsapiens.NCBI.GRCh38))
suppressPackageStartupMessages(require(GenomicRanges))
dnmt3a <- read.table(system.file("extdata", "dnmt3a.txt", package = "h5vcData"), header=TRUE, stringsAsFactors = FALSE)
dnmt3a <- with( dnmt3a, GRanges(seqname, ranges = IRanges(start = start, end = end)))
dnmt3a <- reduce(dnmt3a)
require(BiocParallel)
register(MulticoreParam())
theData <- tallyRanges( bamFiles, ranges = dnmt3a, reference = Hsapiens )
str(theData[[1]])

## ----writingToTallyFile, eval = .Platform$OS.type == "unix"-------------------
writeToTallyFile(theData, tallyFile, study = "/DNMT3A", ranges = dnmt3a)

## ----extractingData, eval = .Platform$OS.type == "unix"-----------------------
data <- h5dapply(
    filename = tallyFile,
    group = "/DNMT3A",
    range = dnmt3a
  )
str(data[["2"]][[1]])

## ----callingVariants, eval = .Platform$OS.type == "unix"----------------------
vars <- h5dapply(
    filename = tallyFile,
    group = "/DNMT3A",
    FUN = callVariantsPaired,
    sampledata = getSampleData(tallyFile,group),
    cl = vcConfParams(),
    range = dnmt3a
  )
vars <- do.call(rbind, vars[["2"]])
vars

## ----plottingVariant, eval = .Platform$OS.type == "unix"----------------------
position <- vars$End[1]
windowsize <- 30
data <- h5readBlock(
    filename = tallyFile,
    group = group,
    range = c(position - windowsize, position + windowsize)
  )
sampleData <- getSampleData(tallyFile,group)
p <- mismatchPlot( data, sampleData, samples = c("Pt17Control", "Pt17Cancer"), windowsize=windowsize, position=position )
print(p)

## -----------------------------------------------------------------------------
tallyFileMS <- system.file( "extdata", "example.tally.hfs5", package = "h5vcData" )
data( "example.variants", package = "h5vcData" ) #example variant calls
head(variantCalls)
ms = mutationSpectrum( variantCalls, tallyFileMS, "/ExampleStudy" )
head(ms)

## ----plottingMS, fig.width=10.5, fig.height=7, dpi=300, out.width="750px"-----
plotMutationSpectrum(ms) + theme(
  strip.text.y = element_text(angle=0, size=10),
  axis.text.x = element_text(size = 7),
  axis.text.y = element_text(size = 10)) + scale_y_continuous(breaks = c(0,5,10,15))

## ----parallelTally, eval = .Platform$OS.type == "unix"------------------------
register(MulticoreParam())
multicore.time <- system.time(theData <- tallyRanges( bamFiles, ranges = dnmt3a, reference = Hsapiens ))
register(SerialParam())
serial.time <- system.time(theData <- tallyRanges( bamFiles, ranges = dnmt3a, reference = Hsapiens ))
serial.time["elapsed"]
multicore.time["elapsed"]

## ----parallelTallyToFile, eval = .Platform$OS.type == "unix"------------------
register(MulticoreParam())
multicore.time <- system.time(tallyRangesToFile( tallyFile, study, bamFiles, ranges = dnmt3a, reference = Hsapiens ))
register(SerialParam())
serial.time <- system.time(tallyRangesToFile( tallyFile, study, bamFiles, ranges = dnmt3a, reference = Hsapiens ))
serial.time["elapsed"]
multicore.time["elapsed"]

## ---- eval = .Platform$OS.type == "unix"--------------------------------------
tallyFile <- system.file( "extdata", "example.tally.hfs5", package = "h5vcData" )
sampleData <- getSampleData(tallyFile, "/ExampleStudy/16")
theRanges <- GRanges("16", ranges = IRanges(start = seq(29e6,29.2e6,1000), width = 1000))
register(SerialParam())
system.time(
  coverages_serial <- h5dapply(
    filename = tallyFile,
    group = "/ExampleStudy",
    names = c( "Coverages" ),
    dims = c(3),
    range = theRanges,
    FUN = binnedCoverage,
    sampledata = sampleData
  )
)
register(MulticoreParam())
system.time(
  coverages_parallel <- h5dapply(
    filename = tallyFile,
    group = "/ExampleStudy",
    names = c( "Coverages" ),
    dims = c(3),
    range = theRanges,
    FUN = binnedCoverage,
    sampledata = sampleData
  )
)

## ---- eval = FALSE------------------------------------------------------------
#  tallyRangesBatch( tallyFile, study = "/DNMT3A", bamfiles = bamFiles, ranges = dnmt3a, reference = Hsapiens )

## ----batchjobsconf, eval=FALSE------------------------------------------------
#  cluster.functions <- makeClusterFunctionsLSF("/home/pyl/batchjobs/lsf.tmpl")
#  mail.start <- "first"
#  mail.done <- "last"
#  mail.error <- "all"
#  mail.from <- "<paul.theodor.pyl@embl.de>"
#  mail.to <- "<pyl@embl.de>"
#  mail.control <- list(smtpServer="smtp.embl.de")

## ----eval=FALSE---------------------------------------------------------------
#  ## Default resources can be set in your .BatchJobs.R by defining the variable
#  ## 'default.resources' as a named list.
#  
#  ## remove everthing in [] if your cluster does not support arrayjobs
#  #BSUB-J <%= job.name %>[1-<%= arrayjobs %>]         # name of the job / array jobs
#  #BSUB-o <%= log.file %>                             # output is sent to logfile, stdout + stderr by default
#  #BSUB-n <%= resources$ncpus %>                      # Number of CPUs on the node
#  #BSUB-q <%= resources$queue %>                      # Job queue
#  #BSUB-W <%= resources$walltime %>                   # Walltime in minutes
#  #BSUB-M <%= resources$memory %>                     # Memory requirements in Kbytes
#  
#  # we merge R output with stdout from LSF, which gets then logged via -o option
#  R CMD BATCH --no-save --no-restore "<%= rscript %>" /dev/stdout

## ----eval=FALSE---------------------------------------------------------------
#  library("BiocParallel")
#  library("BatchJobs")
#  cf <- makeClusterFunctionsLSF("/home/pyl/batchjobs/lsf.tmpl")
#  bjp <- BatchJobsParam( cluster.functions=cf, resources = list("queue" = "medium_priority", "memory"="4000", "ncpus"="4", walltime="00:30") )
#  bplapply(1:10, sqrt)
#  bplapply(1:10, sqrt, BPPARAM=bjp)