?select
?exonsBy
page to retrieve all exons grouped by gene or transcript.Static packages
org.*: identifier mappings
select()
, columns()
, keys()
mapIds()
library(org.Hs.eg.db)
org <- org.Hs.eg.db
select(org, "BRCA1", c("ENSEMBL", "GENENAME"), "SYMBOL")
## 'select()' returned 1:1 mapping between keys and columns
## SYMBOL ENSEMBL GENENAME
## 1 BRCA1 ENSG00000012048 BRCA1, DNA repair associated
TxDb.*: gene models
exons()
, transcripts()
, genes()
, promoters()
, …exonsBy()
, transcriptsBy()
select()
, etc.library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
promoters(txdb)
## GRanges object with 82960 ranges and 2 metadata columns:
## seqnames ranges strand | tx_id tx_name
## <Rle> <IRanges> <Rle> | <integer> <character>
## [1] chr1 [ 9874, 12073] + | 1 uc001aaa.3
## [2] chr1 [ 9874, 12073] + | 2 uc010nxq.1
## [3] chr1 [ 9874, 12073] + | 3 uc010nxr.1
## [4] chr1 [ 67091, 69290] + | 4 uc001aal.1
## [5] chr1 [319084, 321283] + | 5 uc001aaq.2
## ... ... ... ... . ... ...
## [82956] chrUn_gl000237 [ 2487, 4686] - | 82956 uc011mgu.1
## [82957] chrUn_gl000241 [36676, 38875] - | 82957 uc011mgv.2
## [82958] chrUn_gl000243 [ 9501, 11700] + | 82958 uc011mgw.1
## [82959] chrUn_gl000243 [11608, 13807] + | 82959 uc022brq.1
## [82960] chrUn_gl000247 [ 5617, 7816] - | 82960 uc022brr.1
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
Web-based resources, e.g., biomaRt, PSICQUIC, GEOquery, …
Genome-scale resources via AnnotationHub
library(AnnotationHub)
hub = AnnotationHub()
## snapshotDate(): 2016-11-15
hub
## AnnotationHub with 43905 records
## # snapshotDate(): 2016-11-15
## # $dataprovider: BroadInstitute, UCSC, Ensembl, EncodeDCC, ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/...
## # $species: Homo sapiens, Mus musculus, Bos taurus, Pan troglodytes, Danio rerio, Rattus norvegi...
## # $rdataclass: GRanges, BigWigFile, FaFile, TwoBitFile, ChainFile, OrgDb, Inparanoid8Db, data.fr...
## # additional mcols(): taxonomyid, genome, description, coordinate_1_based, maintainer,
## # rdatadateadded, preparerclass, tags, sourceurl, sourcetype
## # retrieve records with, e.g., 'object[["AH2"]]'
##
## title
## AH2 | Ailuropoda_melanoleuca.ailMel1.69.dna.toplevel.fa
## AH3 | Ailuropoda_melanoleuca.ailMel1.69.dna_rm.toplevel.fa
## AH4 | Ailuropoda_melanoleuca.ailMel1.69.dna_sm.toplevel.fa
## AH5 | Ailuropoda_melanoleuca.ailMel1.69.ncrna.fa
## AH6 | Ailuropoda_melanoleuca.ailMel1.69.pep.all.fa
## ... ...
## AH51083 | Pteropus_vampyrus.pteVam1.85.gtf
## AH51084 | Rattus_norvegicus.Rnor_6.0.85.abinitio.gtf
## AH51085 | Rattus_norvegicus.Rnor_6.0.85.chr.gtf
## AH51086 | Rattus_norvegicus.Rnor_6.0.85.gtf
## AH51087 | Saccharomyces_cerevisiae.R64-1-1.85.abinitio.gtf
query(hub, c("ensembl", "81.gtf"))
## AnnotationHub with 69 records
## # snapshotDate(): 2016-11-15
## # $dataprovider: Ensembl
## # $species: Ailuropoda melanoleuca, Anas platyrhynchos, Anolis carolinensis, Astyanax mexicanus,...
## # $rdataclass: GRanges
## # additional mcols(): taxonomyid, genome, description, coordinate_1_based, maintainer,
## # rdatadateadded, preparerclass, tags, sourceurl, sourcetype
## # retrieve records with, e.g., 'object[["AH47937"]]'
##
## title
## AH47937 | Ailuropoda_melanoleuca.ailMel1.81.gtf
## AH47938 | Anas_platyrhynchos.BGI_duck_1.0.81.gtf
## AH47939 | Anolis_carolinensis.AnoCar2.0.81.gtf
## AH47940 | Astyanax_mexicanus.AstMex102.81.gtf
## AH47941 | Bos_taurus.UMD3.1.81.gtf
## ... ...
## AH48001 | Tupaia_belangeri.TREESHREW.81.gtf
## AH48002 | Tursiops_truncatus.turTru1.81.gtf
## AH48003 | Vicugna_pacos.vicPac1.81.gtf
## AH48004 | Xenopus_tropicalis.JGI_4.2.81.gtf
## AH48005 | Xiphophorus_maculatus.Xipmac4.4.2.81.gtf
hub[["AH48004"]]
## loading from cache '/home/mtmorgan//.AnnotationHub/54310'
## using guess work to populate seqinfo
## GRanges object with 581787 ranges and 19 metadata columns:
## seqnames ranges strand | source type score phase
## <Rle> <IRanges> <Rle> | <factor> <factor> <numeric> <integer>
## [1] GL172637.1 [ 34, 148] - | ensembl gene <NA> <NA>
## [2] GL172637.1 [ 34, 148] - | ensembl transcript <NA> <NA>
## [3] GL172637.1 [ 34, 148] - | ensembl exon <NA> <NA>
## [4] GL172637.1 [606, 720] - | ensembl gene <NA> <NA>
## [5] GL172637.1 [606, 720] - | ensembl transcript <NA> <NA>
## ... ... ... ... . ... ... ... ...
## [581783] GL180121.1 [ 865, 867] + | ensembl start_codon <NA> 0
## [581784] GL180121.1 [ 992, 1334] + | ensembl exon <NA> <NA>
## [581785] GL180121.1 [ 992, 1334] + | ensembl CDS <NA> 2
## [581786] GL180121.1 [1817, 1835] + | ensembl exon <NA> <NA>
## [581787] GL180121.1 [1817, 1835] + | ensembl CDS <NA> 1
## gene_id gene_version gene_name gene_source gene_biotype
## <character> <numeric> <character> <character> <character>
## [1] ENSXETG00000030486 1 U5 ensembl snRNA
## [2] ENSXETG00000030486 1 U5 ensembl snRNA
## [3] ENSXETG00000030486 1 U5 ensembl snRNA
## [4] ENSXETG00000031766 1 U5 ensembl snRNA
## [5] ENSXETG00000031766 1 U5 ensembl snRNA
## ... ... ... ... ... ...
## [581783] ENSXETG00000033193 1 <NA> ensembl protein_coding
## [581784] ENSXETG00000033193 1 <NA> ensembl protein_coding
## [581785] ENSXETG00000033193 1 <NA> ensembl protein_coding
## [581786] ENSXETG00000033193 1 <NA> ensembl protein_coding
## [581787] ENSXETG00000033193 1 <NA> ensembl protein_coding
## transcript_id transcript_version transcript_name transcript_source
## <character> <numeric> <character> <character>
## [1] <NA> <NA> <NA> <NA>
## [2] ENSXETT00000065882 1 U5-201 ensembl
## [3] ENSXETT00000065882 1 U5-201 ensembl
## [4] <NA> <NA> <NA> <NA>
## [5] ENSXETT00000061796 1 U5-201 ensembl
## ... ... ... ... ...
## [581783] ENSXETT00000053735 2 <NA> ensembl
## [581784] ENSXETT00000053735 2 <NA> ensembl
## [581785] ENSXETT00000053735 2 <NA> ensembl
## [581786] ENSXETT00000053735 2 <NA> ensembl
## [581787] ENSXETT00000053735 2 <NA> ensembl
## transcript_biotype exon_number exon_id exon_version protein_id
## <character> <numeric> <character> <numeric> <character>
## [1] <NA> <NA> <NA> <NA> <NA>
## [2] snRNA <NA> <NA> <NA> <NA>
## [3] snRNA 1 ENSXETE00000393193 1 <NA>
## [4] <NA> <NA> <NA> <NA> <NA>
## [5] snRNA <NA> <NA> <NA> <NA>
## ... ... ... ... ... ...
## [581783] protein_coding 1 <NA> <NA> <NA>
## [581784] protein_coding 2 ENSXETE00000303775 2 <NA>
## [581785] protein_coding 2 <NA> <NA> ENSXETP00000053735
## [581786] protein_coding 3 ENSXETE00000416553 1 <NA>
## [581787] protein_coding 3 <NA> <NA> ENSXETP00000053735
## protein_version
## <numeric>
## [1] <NA>
## [2] <NA>
## [3] <NA>
## [4] <NA>
## [5] <NA>
## ... ...
## [581783] <NA>
## [581784] <NA>
## [581785] 2
## [581786] <NA>
## [581787] 2
## -------
## seqinfo: 2375 sequences from JGI_4 genome; no seqlengths
Load the org package for Homo sapiens.
library(org.Hs.eg.db)
Use select()
to annotate the HNRNPC gene symbol with its Entrez identifier and less formal gene name. Create a map between SYMBOL and ENTREZID using mapIds()
.
select(org.Hs.eg.db, "HNRNPC", c("ENTREZID", "GENENAME"), "SYMBOL")
## 'select()' returned 1:1 mapping between keys and columns
## SYMBOL ENTREZID GENENAME
## 1 HNRNPC 3183 heterogeneous nuclear ribonucleoprotein C (C1/C2)
sym2eg <- mapIds(org.Hs.eg.db, "HNRNPC", "ENTREZID", "SYMBOL")
## 'select()' returned 1:1 mapping between keys and columns
Load the TxDb package for the UCSC hg19 knownGene track
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
Extract coordinates of genes, and of exons grouped by gene for the HNRNPC gene.
gns <- genes(txdb)
exonsBy(txdb, "gene")[sym2eg]
## GRangesList object of length 1:
## $3183
## GRanges object with 19 ranges and 2 metadata columns:
## seqnames ranges strand | exon_id exon_name
## <Rle> <IRanges> <Rle> | <integer> <character>
## [1] chr14 [21677296, 21679465] - | 184100 <NA>
## [2] chr14 [21678927, 21679725] - | 184101 <NA>
## [3] chr14 [21679565, 21679672] - | 184102 <NA>
## [4] chr14 [21679565, 21679725] - | 184103 <NA>
## [5] chr14 [21679969, 21680062] - | 184104 <NA>
## ... ... ... ... . ... ...
## [15] chr14 [21702237, 21702388] - | 184114 <NA>
## [16] chr14 [21730760, 21730927] - | 184115 <NA>
## [17] chr14 [21731470, 21731495] - | 184116 <NA>
## [18] chr14 [21731826, 21731988] - | 184117 <NA>
## [19] chr14 [21737457, 21737638] - | 184118 <NA>
##
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
The RNAseqData.HNRNPC.bam.chr14 package is an example of an experiment data package. It contains a subset of BAM files used in a gene knock-down experiment, as described in ?RNAseqData.HNRNPC.bam.chr14
. Load the package and get the path to the BAM files.
library(RNAseqData.HNRNPC.bam.chr14)
fls = RNAseqData.HNRNPC.bam.chr14_BAMFILES
basename(fls)
## [1] "ERR127306_chr14.bam" "ERR127307_chr14.bam" "ERR127308_chr14.bam" "ERR127309_chr14.bam"
## [5] "ERR127302_chr14.bam" "ERR127303_chr14.bam" "ERR127304_chr14.bam" "ERR127305_chr14.bam"
Create BamFileList()
, basically telling R that these are paths to BAM files rather than, say, text files from a spreadsheet.
library(GenomicAlignments)
bfls = BamFileList(fls)
bfl = bfls[[1]]
Use the gene coordinates to query the BAM file for a specific genomic region; see ?ScanBamParam()
for other ways of restricting data input.
library(Rsamtools)
param <- ScanBamParam(which=gns[sym2eg])
readGAlignments(bfl, param=param)
## GAlignments object with 5422 alignments and 0 metadata columns:
## seqnames strand cigar qwidth start end width njunc
## <Rle> <Rle> <character> <integer> <integer> <integer> <integer> <integer>
## [1] chr14 + 72M 72 21677347 21677418 72 0
## [2] chr14 + 72M 72 21677352 21677423 72 0
## [3] chr14 + 72M 72 21677354 21677425 72 0
## [4] chr14 + 72M 72 21677355 21677426 72 0
## [5] chr14 + 72M 72 21677373 21677444 72 0
## ... ... ... ... ... ... ... ... ...
## [5418] chr14 - 72M 72 21737512 21737583 72 0
## [5419] chr14 - 72M 72 21737520 21737591 72 0
## [5420] chr14 - 72M 72 21737520 21737591 72 0
## [5421] chr14 - 72M 72 21737521 21737592 72 0
## [5422] chr14 - 72M 72 21737534 21737605 72 0
## -------
## seqinfo: 93 sequences from an unspecified genome
Load the airway experiment data package
library(airway)
data(airway)
airway
## class: RangedSummarizedExperiment
## dim: 64102 8
## metadata(1): ''
## assays(1): counts
## rownames(64102): ENSG00000000003 ENSG00000000005 ... LRG_98 LRG_99
## rowData names(0):
## colnames(8): SRR1039508 SRR1039509 ... SRR1039520 SRR1039521
## colData names(9): SampleName cell ... Sample BioSample
The row names are Ensembl gene identifiers. Use mapIds()
to map from these to gene symbols.
symid <- mapIds(org.Hs.eg.db, rownames(airway), "SYMBOL", "ENSEMBL")
## 'select()' returned 1:many mapping between keys and columns
Add the gene symbols to the summarized experiment object.
mcols(rowRanges(airway))$symid <- symid
The Roadmap Epigenomics Project generated genome-wide maps of regulatory marks across a number of cell lines.
Retrieve the Epigenome Roadmap table from AnnotationHub…
library(AnnotationHub)
hub <- AnnotationHub()
## snapshotDate(): 2016-11-15
query(hub, c("epigenome", "metadata"))
## AnnotationHub with 1 record
## # snapshotDate(): 2016-11-15
## # names(): AH41830
## # $dataprovider: BroadInstitute
## # $species: Homo sapiens
## # $rdataclass: data.frame
## # $title: EID_metadata.tab
## # $description: Metadata for EpigenomeRoadMap Project
## # $taxonomyid: 9606
## # $genome: hg19
## # $sourcetype: tab
## # $sourceurl: http://egg2.wustl.edu/roadmap/data/byFileType/metadata/EID_metadata.tab
## # $sourcelastmodifieddate: 2015-02-15
## # $sourcesize: 18035
## # $tags: c("EpigenomeRoadMap", "Metadata")
## # retrieve record with 'object[["AH41830"]]'
meta <- hub[["AH41830"]]
## loading from cache '/home/mtmorgan//.AnnotationHub/47270'
Explore the metadata to identify a cell line of interest to you; see also the metadata spreadsheet version of the data made available by the Epigenome roadmap project.
table(meta$ANATOMY)
##
## ADRENAL BLOOD BONE BRAIN BREAST
## 1 27 1 13 3
## CERVIX ESC ESC_DERIVED FAT GI_COLON
## 1 8 9 3 3
## GI_DUODENUM GI_ESOPHAGUS GI_INTESTINE GI_RECTUM GI_STOMACH
## 2 1 3 3 4
## HEART IPSC KIDNEY LIVER LUNG
## 4 5 1 2 5
## MUSCLE MUSCLE_LEG OVARY PANCREAS PLACENTA
## 7 1 1 2 2
## SKIN SPLEEN STROMAL_CONNECTIVE THYMUS VASCULAR
## 8 1 2 2 2
meta[meta$ANATOMY == "LIVER",]
## EID GROUP COLOR MNEMONIC STD_NAME
## 64 E066 Other #999999 LIV.ADLT Liver
## 116 E118 ENCODE2012 #000000 LIV.HEPG2.CNCR HepG2 Hepatocellular Carcinoma Cell Line
## EDACC_NAME ANATOMY TYPE AGE SEX SOLID_LIQUID ETHNICITY
## 64 Adult_Liver LIVER PrimaryTissue Unknown Mixed SOLID Unknown
## 116 HepG2_Hepatocellular_Carcinoma LIVER CellLine Male
## SINGLEDONOR_COMPOSITE
## 64 C
## 116 SD
Use the ‘EID’ to query for and retrieve the ‘mnemonic’ file summarizing chromatin state
query(hub, c("E118", "mnemonic"))
## AnnotationHub with 1 record
## # snapshotDate(): 2016-11-15
## # names(): AH46971
## # $dataprovider: BroadInstitute
## # $species: Homo sapiens
## # $rdataclass: GRanges
## # $title: E118_15_coreMarks_mnemonics.bed.gz
## # $description: 15 state chromatin segmentations from EpigenomeRoadMap Project
## # $taxonomyid: 9606
## # $genome: hg19
## # $sourcetype: BED
## # $sourceurl: http://egg2.wustl.edu/roadmap/data/byFileType/chromhmmSegmentations/ChmmModels/cor...
## # $sourcelastmodifieddate: 2013-10-11
## # $sourcesize: 3231313
## # $tags: c("EpigenomeRoadMap", "chromhmmSegmentations", "ChmmModels", "coreMarks",
## # "E118", "ENCODE2012", "LIV.HEPG2.CNCR", "HepG2 Hepatocellular Carcinoma Cell Line")
## # retrieve record with 'object[["AH46971"]]'
E118 <- hub[["AH46971"]]
## loading from cache '/home/mtmorgan//.AnnotationHub/52411'
E118
## GRanges object with 561497 ranges and 4 metadata columns:
## seqnames ranges strand | abbr name
## <Rle> <IRanges> <Rle> | <character> <character>
## [1] chr10 [ 1, 113200] * | 15_Quies Quiescent/Low
## [2] chr10 [113201, 119600] * | 14_ReprPCWk Weak Repressed PolyComb
## [3] chr10 [119601, 120000] * | 10_TssBiv Bivalent/Poised TSS
## [4] chr10 [120001, 120200] * | 1_TssA Active TSS
## [5] chr10 [120201, 120400] * | 2_TssAFlnk Flanking Active TSS
## ... ... ... ... . ... ...
## [561493] chrY [58907201, 58967400] * | 15_Quies Quiescent/Low
## [561494] chrY [58967401, 58972000] * | 9_Het Heterochromatin
## [561495] chrY [58972001, 58997400] * | 8_ZNF/Rpts ZNF genes & repeats
## [561496] chrY [58997401, 59033600] * | 9_Het Heterochromatin
## [561497] chrY [59033601, 59373400] * | 15_Quies Quiescent/Low
## color_name color_code
## <character> <character>
## [1] White #FFFFFF
## [2] Gainsboro #C0C0C0
## [3] IndianRed #CD5C5C
## [4] Red #FF0000
## [5] Orange Red #FF4500
## ... ... ...
## [561493] White #FFFFFF
## [561494] PaleTurquoise #8A91D0
## [561495] Medium Aquamarine #66CDAA
## [561496] PaleTurquoise #8A91D0
## [561497] White #FFFFFF
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
Explore the object, e.g., tabulating the different chromatin state classifications (in the name
column). Subset the object to return, e.g., just those regions marked as ‘Heterochromatin’
table(E118$name)
##
## Active TSS Bivalent Enhancer Bivalent/Poised TSS
## 20010 23155 13214
## Enhancers Flanking Active TSS Flanking Bivalent TSS/Enh
## 110260 45115 15844
## Genic enhancers Heterochromatin Quiescent/Low
## 14995 31193 61759
## Repressed PolyComb Strong transcription Transcr. at gene 5' and 3'
## 44013 32522 2515
## Weak Repressed PolyComb Weak transcription ZNF genes & repeats
## 60867 83738 2297
E118[E118$name %in% "Heterochromatin"]
## GRanges object with 31193 ranges and 4 metadata columns:
## seqnames ranges strand | abbr name color_name
## <Rle> <IRanges> <Rle> | <character> <character> <character>
## [1] chr10 [ 140201, 143800] * | 9_Het Heterochromatin PaleTurquoise
## [2] chr10 [ 806201, 807800] * | 9_Het Heterochromatin PaleTurquoise
## [3] chr10 [ 842001, 843800] * | 9_Het Heterochromatin PaleTurquoise
## [4] chr10 [1024601, 1027200] * | 9_Het Heterochromatin PaleTurquoise
## [5] chr10 [1191601, 1192600] * | 9_Het Heterochromatin PaleTurquoise
## ... ... ... ... . ... ... ...
## [31189] chrY [58883001, 58885400] * | 9_Het Heterochromatin PaleTurquoise
## [31190] chrY [58890001, 58891000] * | 9_Het Heterochromatin PaleTurquoise
## [31191] chrY [58906401, 58907200] * | 9_Het Heterochromatin PaleTurquoise
## [31192] chrY [58967401, 58972000] * | 9_Het Heterochromatin PaleTurquoise
## [31193] chrY [58997401, 59033600] * | 9_Het Heterochromatin PaleTurquoise
## color_code
## <character>
## [1] #8A91D0
## [2] #8A91D0
## [3] #8A91D0
## [4] #8A91D0
## [5] #8A91D0
## ... ...
## [31189] #8A91D0
## [31190] #8A91D0
## [31191] #8A91D0
## [31192] #8A91D0
## [31193] #8A91D0
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
Can you, using a TxDb package and the genes()
and subsetByOverlaps()
functions, determine how many genes overlap heterochromatic states, or the genes nearest()
each enhancer?
Visit the biomart website and figure out how to browse data to retreive, e.g., genes on chromosmes 21 and 22. You’ll need to browse to the ensembl mart, Homo spaiens data set, establish filters for chromosomes 21 and 22, and then specify that you’d like the Ensembl gene id attribute returned.
Now do the same process in biomaRt:
library(biomaRt)
head(listMarts(), 3) ## list marts
head(listDatasets(useMart("ensembl")), 3) ## mart datasets
ensembl <- ## fully specified mart
useMart("ensembl", dataset = "hsapiens_gene_ensembl")
head(listFilters(ensembl), 3) ## filters
myFilter <- "chromosome_name"
substr(filterOptions(myFilter, ensembl), 1, 50) ## return values
myValues <- c("21", "22")
head(listAttributes(ensembl), 3) ## attributes
myAttributes <- c("ensembl_gene_id","chromosome_name")
## assemble and query the mart
res <- getBM(attributes = myAttributes, filters = myFilter,
values = myValues, mart = ensembl)
The goal of this section is to highlight practices for writing correct, robust and efficient R code.
identical()
, all.equal()
)NA
values, …system.time()
or the microbenchmark package.Rprof()
function, or packages such as lineprof or aprofVectorize – operate on vectors, rather than explicit loops
x <- 1:10
log(x) ## NOT for (i in seq_along(x)) x[i] <- log(x[i])
## [1] 0.0000000 0.6931472 1.0986123 1.3862944 1.6094379 1.7917595 1.9459101 2.0794415 2.1972246
## [10] 2.3025851
Pre-allocate memory, then fill in the result
result <- numeric(10)
result[1] <- runif(1)
for (i in 2:length(result))
result[i] <- runif(1) * result[i - 1]
result
## [1] 3.441333e-01 6.069639e-02 2.268026e-02 1.419491e-03 1.792039e-04 1.242687e-04 4.268918e-05
## [8] 4.236969e-05 4.437225e-06 3.167490e-07
for
looplm.fit()
rather than repeatedly fitting the same design matrix.tabulate()
, rowSums()
and friends, %in%
, …Here’s an obviously inefficient function:
f0 <- function(n, a=2) {
## stopifnot(is.integer(n) && (length(n) == 1) &&
## !is.na(n) && (n > 0))
result <- numeric()
for (i in seq_len(n))
result[[i]] <- a * log(i)
result
}
Use system.time()
to investigate how this algorithm scales with n
, focusing on elapsed time.
system.time(f0(10000))
## user system elapsed
## 0.116 0.000 0.114
n <- 1000 * seq(1, 20, 2)
t <- sapply(n, function(i) system.time(f0(i))[[3]])
plot(t ~ n, type="b")
Remember the current ‘correct’ value, and an approximate time
n <- 10000
system.time(expected <- f0(n))
## user system elapsed
## 0.112 0.000 0.112
head(expected)
## [1] 0.000000 1.386294 2.197225 2.772589 3.218876 3.583519
Revise the function to hoist the common multiplier, a
, out of the loop. Make sure the result of the ‘optimization’ and the original calculation are the same. Use the microbenchmark package to compare the two versions
f1 <- function(n, a=2) {
result <- numeric()
for (i in seq_len(n))
result[[i]] <- log(i)
a * result
}
identical(expected, f1(n))
## [1] TRUE
library(microbenchmark)
microbenchmark(f0(n), f1(n), times=5)
## Unit: milliseconds
## expr min lq mean median uq max neval cld
## f0(n) 101.6763 141.9907 134.4311 142.1036 142.8454 143.5394 5 a
## f1(n) 102.2081 141.9909 134.3802 142.3994 142.5739 142.7289 5 a
Adopt a ‘pre-allocate and fill’ strategy
f2 <- function(n, a=2) {
result <- numeric(n)
for (i in seq_len(n))
result[[i]] <- log(i)
a * result
}
identical(expected, f2(n))
## [1] TRUE
microbenchmark(f0(n), f2(n), times=5)
## Unit: milliseconds
## expr min lq mean median uq max neval cld
## f0(n) 104.145114 142.395721 135.414263 142.434957 143.815981 144.279540 5 b
## f2(n) 6.604505 6.707636 6.862423 6.908245 6.950165 7.141563 5 a
Use an *apply()
function to avoid having to explicitly pre-allocate, and make opportunities for vectorization more apparent.
f3 <- function(n, a=2)
a * sapply(seq_len(n), log)
identical(expected, f3(n))
## [1] TRUE
microbenchmark(f0(n), f2(n), f3(n), times=10)
## Unit: milliseconds
## expr min lq mean median uq max neval cld
## f0(n) 101.765217 141.698686 181.256096 142.881494 145.519752 565.655864 10 b
## f2(n) 6.476120 6.575258 7.189349 6.704238 7.097990 9.449406 10 a
## f3(n) 2.975692 3.113096 3.342878 3.305314 3.562432 3.828177 10 a
Now that the code is presented in a single line, it is apparent that it could be easily vectorized. Seize the opportunity to vectorize it:
f4 <- function(n, a=2)
a * log(seq_len(n))
identical(expected, f4(n))
## [1] TRUE
microbenchmark(f0(n), f3(n), f4(n), times=10)
## Unit: microseconds
## expr min lq mean median uq max neval cld
## f0(n) 99705.017 102091.968 129560.0144 141659.351 142267.701 142727.076 10 b
## f3(n) 3000.622 3148.994 7250.2057 3218.886 3305.377 43790.584 10 a
## f4(n) 205.374 209.281 228.5647 229.648 246.669 259.864 10 a
f4()
definitely seems to be the winner. How does it scale with n
? (Repeat several times)
n <- 10 ^ (5:8) # 100x larger than f0
t <- sapply(n, function(i) system.time(f4(i))[[3]])
plot(t ~ n, log="xy", type="b")
Any explanations for the different pattern of response?
Lessons learned:
*apply()
functions help avoid need for explicit pre-allocation and make opportunities for vectorization more apparent. This may come at a small performance cost, but is generally worth itWhen data are too large to fit in memory, we can iterate through files in chunks or subset the data by fields or genomic positions.
Iteration
open()
, read chunk(s), close()
.yieldSize
argument to Rsamtools::BamFile()
GenomicFiles::reduceByYield()
Restriction
Rsamtools::ScanBamParam()
Rsamtools::PileupParam()
VariantAnnotation::ScanVcfParam()
Parallel evalutation
BiocParallel provides a standardized interface for simple parallel evaluation. The package builds provides access to the snow
and multicore
functionality in the parallel
package as well as BatchJobs
for running cluster jobs.
General ideas:
bplapply()
instead of lapply()
Argument BPPARAM
influences how parallel evaluation occurs
MulticoreParam()
: threads on a single (non-Windows) machineSnowParam()
: processes on the same or different machinesBatchJobsParam()
: resource scheduler on a clusterDoparParam()
: parallel execution with foreach()
This small example motivates the use of parallel execution and demonstrates how bplapply()
can be a drop in for lapply
.
Use system.time()
to explore how long this takes to execute as n
increases from 1 to 10. Use identical()
and microbenchmark to compare alternatives f0()
and f1()
for both correctness and performance.
fun
sleeps for 1 second, then returns i
.
library(BiocParallel)
fun <- function(i) {
Sys.sleep(1)
i
}
## serial
f0 <- function(n)
lapply(seq_len(n), fun)
## parallel
f1 <- function(n)
bplapply(seq_len(n), fun)