Martin Morgan
February 3, 2015
system.time()
, Rprof()
, microbenchmarkopen()
, read chunk(s), close()
.yieldSize
argument to Rsamtools::BamFile()
Rsamtools::ScanBamParam()
ShortRead::FastqSampler()
lapply()
-like operationsbplapply()
for lapply()
-like functions,
increasingly used by package developers to provide easy, standard
way of gaining parallel evaluation.Write the following as a function. Use system.time()
to explore how
long this takes to execute as n
increases from 100 to 10000. Use
identical()
and microbenchmark to compare
alternatives f1()
, f2()
, and f3()
for both correctness and performance of
these three different functions. What strategies are these functions
using?
f0 <- function(n) {
## inefficient!
ans <- numeric()
for (i in seq_len(n))
ans <- c(ans, exp(i))
ans
}
f1 <- function(n) {
ans <- numeric(n)
for (i in seq_len(n))
ans[[i]] <- exp(i)
ans
}
f2 <- function(n)
sapply(seq_len(n), exp)
f3 <- function(n)
exp(seq_len(n))
Go to sleep for 1 second, then return i
. This takes 8 seconds.
library(BiocParallel)
fun <- function(i) {
Sys.sleep(1)
i
}
## serial
f0 <- function(n)
lapply(seq_len(n), fun)
## parallel
f1 <- function(n)
bplapply(seq_len(n), fun)
Regions of interest, named like the chromosomes in the bam file.
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
exByTx <- exonsBy(TxDb.Hsapiens.UCSC.hg19.knownGene, "tx")
map0 <- read.delim("~/igv/genomes/hg19_alias.tab", header=FALSE,
stringsAsFactors=FALSE)
map <- setNames(map0$V1, map0$V2)
seqlevels(exByTx, force=TRUE) <- map
A function to iterate through a bam file
count1 <- function(filename, roi) {
## Create and open BAM file
bf <- BamFile(filename, yieldSize=1000000)
open(bf)
## initialize variables
n <- 0 # number of reads examined
count <- integer(length(roi)) # running count of reads overlapping roi
names(counts) <- names(roi)
## read in and count chunks of data, until done
repeat {
## input
aln <- readGAlignments(bf) # input next chunk
if (length(aln) == 0) # stopping condition
break
n <- n + length(aln) # how are we doing?
message(n)
## overlaps
olaps <- findOverlaps(aln, roi, type="within", ignore.strand=TRUE)
count <- count + tabulate(subjectHits(olaps), subjectLength(olaps))
}
## finish and return result
close(bf)
count
}
An improvement: GenomicFiles::reduceByYield()
replaces repeat {}
section
suppressPackageStartupMessages({
library(GenomicFiles)
})
## how to input the next chunk of data
yield <- function(X, ...)
readGAlignments(X)
}
## what to do to each chunk
map <- function(VALUE, ..., roi) {
olaps <- findOverlaps(VALUE, roi, type="within", ignore.strand=TRUE)
tabulate(subjectHits(olaps), subjectLength(olaps))
}
## how to combine mapped chunks
reduce <- `+`
reduceByYield(bf, yield, map, reduce, roi=roi)
In action
filename <- "~/bam/SRR1039508_sorted.bam"
count <- count1(filename, exByTx)
Parallelize
library(BiocParallel)
## all bam files
filenames <- dir("~/bam", pattern="bam$", full=TRUE)
names(filenames) <- sub("_sorted.bam", "", basename(filenames))
## iterate
counts <- bplapply(filenames, count1, exByTx)
counts <- simplify2array(counts)
head(counts)