The material in this course requires R version 3.2 and Bioconductor version 3.2
stopifnot(
getRversion() >= '3.2' && getRversion() < '3.3',
BiocInstaller::biocVersion() == "3.2"
)
TxDb
packagese.g., `{r Biocpkg(“TxDb.Hsapiens.UCSC.hg19.knownGene”)}
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
txdb
## TxDb object:
## # Db type: TxDb
## # Supporting package: GenomicFeatures
## # Data source: UCSC
## # Genome: hg19
## # Organism: Homo sapiens
## # Taxonomy ID: 9606
## # UCSC Table: knownGene
## # Resource URL: http://genome.ucsc.edu/
## # Type of Gene ID: Entrez Gene ID
## # Full dataset: yes
## # miRBase build ID: GRCh37
## # transcript_nrow: 82960
## # exon_nrow: 289969
## # cds_nrow: 237533
## # Db created by: GenomicFeatures package from Bioconductor
## # Creation time: 2015-10-07 18:11:28 +0000 (Wed, 07 Oct 2015)
## # GenomicFeatures version at creation time: 1.21.30
## # RSQLite version at creation time: 1.0.0
## # DBSCHEMAVERSION: 1.1
methods(class=class(txdb))
## [1] annotatedDataFrameFrom asBED asGFF as.list
## [5] assayData assayData<- cdsByOverlaps cdsBy
## [9] cds coerce columns combine
## [13] contents dbconn dbfile dbInfo
## [17] dbmeta dbschema disjointExons distance
## [21] $<- $ exonsByOverlaps exonsBy
## [25] exons ExpressionSet extractUpstreamSeqs featureNames<-
## [29] featureNames fiveUTRsByTranscript genes initialize
## [33] intronsByTranscript isActiveSeq<- isActiveSeq isNA
## [37] keys keytypes locateVariants mapIds
## [41] mappedkeys mapToTranscripts metadata microRNAs
## [45] nhit organism predictCoding promoters
## [49] revmap sample sampleNames<- sampleNames
## [53] saveDb select seqinfo seqlevels0
## [57] seqlevels<- show species storageMode<-
## [61] storageMode summarizeVariants taxonomyId threeUTRsByTranscript
## [65] transcriptsByOverlaps transcriptsBy transcripts tRNAs
## [69] updateObject
## see '?methods' for accessing help and source code
TxDb
objects
dbfile(txdb)
GenomicFeatures::makeTxDbFrom*()
Accessing gene models
exons()
, transcripts()
, genes()
, cds()
(coding sequence)promoters()
& friendsexonsBy()
& friends – exons by gene, transcript, …keytypes()
, columns()
, keys()
, select()
, mapIds()
exons(txdb)
## GRanges object with 289969 ranges and 1 metadata column:
## seqnames ranges strand | exon_id
## <Rle> <IRanges> <Rle> | <integer>
## [1] chr1 [11874, 12227] + | 1
## [2] chr1 [12595, 12721] + | 2
## [3] chr1 [12613, 12721] + | 3
## [4] chr1 [12646, 12697] + | 4
## [5] chr1 [13221, 14409] + | 5
## ... ... ... ... ... ...
## [289965] chrUn_gl000241 [35706, 35859] - | 289965
## [289966] chrUn_gl000241 [36711, 36875] - | 289966
## [289967] chrUn_gl000243 [11501, 11530] + | 289967
## [289968] chrUn_gl000243 [13608, 13637] + | 289968
## [289969] chrUn_gl000247 [ 5787, 5816] - | 289969
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
exonsBy(txdb, "tx")
## GRangesList object of length 82960:
## $1
## GRanges object with 3 ranges and 3 metadata columns:
## seqnames ranges strand | exon_id exon_name exon_rank
## <Rle> <IRanges> <Rle> | <integer> <character> <integer>
## [1] chr1 [11874, 12227] + | 1 <NA> 1
## [2] chr1 [12613, 12721] + | 3 <NA> 2
## [3] chr1 [13221, 14409] + | 5 <NA> 3
##
## $2
## GRanges object with 3 ranges and 3 metadata columns:
## seqnames ranges strand | exon_id exon_name exon_rank
## [1] chr1 [11874, 12227] + | 1 <NA> 1
## [2] chr1 [12595, 12721] + | 2 <NA> 2
## [3] chr1 [13403, 14409] + | 6 <NA> 3
##
## $3
## GRanges object with 3 ranges and 3 metadata columns:
## seqnames ranges strand | exon_id exon_name exon_rank
## [1] chr1 [11874, 12227] + | 1 <NA> 1
## [2] chr1 [12646, 12697] + | 4 <NA> 2
## [3] chr1 [13221, 14409] + | 5 <NA> 3
##
## ...
## <82957 more elements>
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
BSgenome
packagese.g., `{r Biocpkg(“BSgenome.Hsapiens.UCSC.hg19”)}
library(BSgenome.Hsapiens.UCSC.hg19)
genome <- BSgenome.Hsapiens.UCSC.hg19
getSeq(genome, exons(txdb)[1:100])
## A DNAStringSet instance of length 100
## width seq
## [1] 354 CTTGCCGTCAGCCTTTTCTTTGACCTCTTCTTTCTGTTCATGT...AAAGATGAGTGAGAGCATCAACTTCTCTCACAACCTAGGCCA
## [2] 127 GCTCCTGTCTCCCCCCAGGTGTGTGGTGATGCCAGGCATGCCC...GACTTGGATCACACTCTTGTGAGTGTCCCCAGTGTTGCAGAG
## [3] 109 GTGTGTGGTGATGCCAGGCATGCCCTTCCCCAGCATCAGGTCT...GACTTGGATCACACTCTTGTGAGTGTCCCCAGTGTTGCAGAG
## [4] 52 CATCAGGTCTCCAGAGCTGCAGAAGACGACGGCCGACTTGGATCACACTCTT
## [5] 1189 GCAGGGCCATCAGGCACCAAAGGGATTCTGCCAGCATAGTGCT...ACAGCTAGAGATCCTTTATTAAAAGCACACTGTTGGTTTCTG
## ... ... ...
## [96] 251 CCCGTCCCCGGCGCGGCCCGCGCGCTCCTCCGCCGCCTCTCGC...ATCCTCAACGTGGACCCGGTGCAGCACACGTACTCCTGCAAG
## [97] 262 GTTCGGGTCTGGCGGTACTTGAAGGGCAAAGACCTGGTGGCCC...TCACCCTGCGGAACCTGGAGGAGGTGGAGTTCTGTGTGGAAG
## [98] 48 ATAAACCCGGGACCCACTTCACTCCAGTGCCTCCGACGCCTCCTGATG
## [99] 216 CGTGCCGGGGAATGCTGTGCGGCTTCGGCGCCGTGTGCGAGCC...GCCAGCAGCGCCGCATCCGCCTGCTCAGCCGCGGGCCGTGCG
## [100] 225 GCTCGCGGGACCCCTGCTCCAACGTGACCTGCAGCTTCGGCAG...CCCGCCAGGAGAATGTCTTCAAGAAGTTCGACGGCCCTTGTG
OrgDb
library(org.Hs.eg.db)
org.Hs.eg.db
## OrgDb object:
## | DBSCHEMAVERSION: 2.1
## | Db type: OrgDb
## | Supporting package: AnnotationDbi
## | DBSCHEMA: HUMAN_DB
## | ORGANISM: Homo sapiens
## | SPECIES: Human
## | EGSOURCEDATE: 2015-Sep27
## | EGSOURCENAME: Entrez Gene
## | EGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
## | CENTRALID: EG
## | TAXID: 9606
## | GOSOURCENAME: Gene Ontology
## | GOSOURCEURL: ftp://ftp.geneontology.org/pub/go/godatabase/archive/latest-lite/
## | GOSOURCEDATE: 20150919
## | GOEGSOURCEDATE: 2015-Sep27
## | GOEGSOURCENAME: Entrez Gene
## | GOEGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
## | KEGGSOURCENAME: KEGG GENOME
## | KEGGSOURCEURL: ftp://ftp.genome.jp/pub/kegg/genomes
## | KEGGSOURCEDATE: 2011-Mar15
## | GPSOURCENAME: UCSC Genome Bioinformatics (Homo sapiens)
## | GPSOURCEURL: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19
## | GPSOURCEDATE: 2010-Mar22
## | ENSOURCEDATE: 2015-Jul16
## | ENSOURCENAME: Ensembl
## | ENSOURCEURL: ftp://ftp.ensembl.org/pub/current_fasta
## | UPSOURCENAME: Uniprot
## | UPSOURCEURL: http://www.UniProt.org/
## | UPSOURCEDATE: Fri Oct 9 19:54:03 2015
##
## Please see: help('select') for usage information
OrgDb
objects
TxDb
keytypes()
, columns()
, keys()
, select()
, mapIds()
select()
Specification of key type
select(org.Hs.eg.db, c("BRCA1", "PTEN"), c("ENTREZID", "GENENAME"), "SYMBOL")
## 'select()' returned 1:1 mapping between keys and columns
## SYMBOL ENTREZID GENENAME
## 1 BRCA1 672 breast cancer 1, early onset
## 2 PTEN 5728 phosphatase and tensin homolog
keytypes(org.Hs.eg.db)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS" "ENTREZID"
## [7] "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME" "GO" "GOALL"
## [13] "IPI" "MAP" "OMIM" "ONTOLOGY" "ONTOLOGYALL" "PATH"
## [19] "PFAM" "PMID" "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"
## [25] "UNIGENE" "UNIPROT"
columns(org.Hs.eg.db)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS" "ENTREZID"
## [7] "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME" "GO" "GOALL"
## [13] "IPI" "MAP" "OMIM" "ONTOLOGY" "ONTOLOGYALL" "PATH"
## [19] "PFAM" "PMID" "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"
## [25] "UNIGENE" "UNIPROT"
Related functionality
mapIds()
– special case for mapping from 1 identifier to anotherOrganismDb
objects: combined org.*
, TxDb.*
, and other annotation resources for easy access
library(Homo.sapiens)
## Loading required package: OrganismDbi
## Loading required package: GO.db
##
## Now getting the GODb Object directly
## Now getting the OrgDb Object directly
## Now getting the TxDb Object directly
select(Homo.sapiens, c("BRCA1", "PTEN"),
c("TXNAME", "TXCHROM", "TXSTART", "TXEND"),
"SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
## SYMBOL TXNAME TXCHROM TXSTART TXEND
## 1 BRCA1 uc010whl.2 chr17 41196312 41276132
## 2 BRCA1 uc002icp.4 chr17 41196312 41277340
## 3 BRCA1 uc010whm.2 chr17 41196312 41277340
## 4 BRCA1 uc002icu.3 chr17 41196312 41277468
## 5 BRCA1 uc010cyx.3 chr17 41196312 41277468
## 6 BRCA1 uc002icq.3 chr17 41196312 41277500
## 7 BRCA1 uc002ict.3 chr17 41196312 41277500
## 8 BRCA1 uc010whn.2 chr17 41196312 41277500
## 9 BRCA1 uc010who.3 chr17 41196312 41277500
## 10 BRCA1 uc010whp.2 chr17 41196312 41322420
## 11 BRCA1 uc010whq.1 chr17 41215350 41256973
## 12 BRCA1 uc002idc.1 chr17 41215350 41277468
## 13 BRCA1 uc010whr.1 chr17 41215350 41277468
## 14 BRCA1 uc002idd.3 chr17 41243117 41276132
## 15 BRCA1 uc002ide.1 chr17 41243452 41256973
## 16 BRCA1 uc010cyy.1 chr17 41243452 41277340
## 17 BRCA1 uc010whs.1 chr17 41243452 41277468
## 18 BRCA1 uc010cyz.2 chr17 41243452 41277500
## 19 BRCA1 uc010cza.2 chr17 41243452 41277500
## 20 BRCA1 uc010wht.1 chr17 41243452 41277500
## 21 PTEN uc001kfb.3 chr10 89623195 89728532
## 22 PTEN uc021pvw.1 chr10 89623195 89728532
biomaRt
, AnnotationHub
http://biomart.org; Bioconductor package biomaRt
## NEEDS INTERNET ACCESS !!
library(biomaRt)
head(listMarts(), 3) ## list marts
head(listDatasets(useMart("ensembl")), 3) ## mart datasets
ensembl <- ## fully specified mart
useMart("ensembl", dataset = "hsapiens_gene_ensembl")
head(listFilters(ensembl), 3) ## filters
myFilter <- "chromosome_name"
substr(filterOptions(myFilter, ensembl), 1, 50) ## return values
myValues <- c("21", "22")
head(listAttributes(ensembl), 3) ## attributes
myAttributes <- c("ensembl_gene_id","chromosome_name")
## assemble and query the mart
res <- getBM(attributes = myAttributes, filters = myFilter,
values = myValues, mart = ensembl)
Other internet resources
Example: Ensembl ‘GTF’ files to R / Bioconductor GRanges and TxDb
library(AnnotationHub)
hub <- AnnotationHub()
hub
query(hub, c("Ensembl", "80", "gtf"))
## ensgtf = display(hub) # visual choice
hub["AH47107"]
gtf <- hub[["AH47107"]]
gtf
txdb <- GenomicFeatures::makeTxDbFromGRanges(gtf)
Example: non-model organism OrgDb
packages
library(AnnotationHub)
hub <- AnnotationHub()
query(hub, "OrgDb")
Example: Map Roadmap epigenomic marks to hg38
Roadmap BED file as GRanges
library(AnnotationHub)
hub <- AnnotationHub()
query(hub , c("EpigenomeRoadMap", "E126", "H3K4ME2"))
E126 <- hub[["AH29817"]]
UCSC ‘liftOver’ file to map coordinates
query(hub , c("hg19", "hg38", "chainfile"))
chain <- hub[["AH14150"]]
lift over – possibly one-to-many mapping, so GRanges to GRangesList
library(rtracklayer)
E126hg38 <- liftOver(E126, chain)
E126hg38
Example: read variants from a VCF file, and annotate with respect to a known gene model
## input variants
library(VariantAnnotation)
fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
vcf <- readVcf(fl, "hg19")
seqlevels(vcf) <- "chr22"
## known gene model
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
coding <- locateVariants(rowRanges(vcf),
TxDb.Hsapiens.UCSC.hg19.knownGene,
CodingVariants())
head(coding)
## GRanges object with 6 ranges and 9 metadata columns:
## seqnames ranges strand | LOCATION LOCSTART LOCEND QUERYID TXID
## <Rle> <IRanges> <Rle> | <factor> <integer> <integer> <integer> <character>
## 1 chr22 [50301422, 50301422] - | coding 939 939 24 75253
## 2 chr22 [50301476, 50301476] - | coding 885 885 25 75253
## 3 chr22 [50301488, 50301488] - | coding 873 873 26 75253
## 4 chr22 [50301494, 50301494] - | coding 867 867 27 75253
## 5 chr22 [50301584, 50301584] - | coding 777 777 28 75253
## 6 chr22 [50302962, 50302962] - | coding 698 698 57 75253
## CDSID GENEID PRECEDEID FOLLOWID
## <IntegerList> <character> <CharacterList> <CharacterList>
## 1 218562 79087
## 2 218562 79087
## 3 218562 79087
## 4 218562 79087
## 5 218562 79087
## 6 218563 79087
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths
Acknowledgements
Core (Seattle): Sonali Arora, Marc Carlson, Nate Hayden, Jim Hester, Valerie Obenchain, Hervé Pagès, Paul Shannon, Dan Tenenbaum.
The research reported in this presentation was supported by the National Cancer Institute and the National Human Genome Research Institute of the National Institutes of Health under Award numbers U24CA180996 and U41HG004059, and the National Science Foundation under Award number 1247813. The content is solely the responsibility of the authors and does not necessarily represent the official views of the National Institutes of Health or the National Science Foundation.
sessionInfo()
sessionInfo()
## R version 3.2.2 (2015-08-14)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux stretch/sid
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C LC_TIME=en_US.UTF-8
## [4] LC_COLLATE=en_US.UTF-8 LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] Homo.sapiens_1.3.1 GO.db_3.2.2
## [3] OrganismDbi_1.11.43 biomaRt_2.25.3
## [5] AnnotationHub_2.1.45 VariantAnnotation_1.15.34
## [7] RNAseqData.HNRNPC.bam.chr14_0.7.0 GenomicAlignments_1.5.18
## [9] Rsamtools_1.21.21 ALL_1.11.0
## [11] org.Hs.eg.db_3.2.3 RSQLite_1.0.0
## [13] DBI_0.3.1 ggplot2_1.0.1
## [15] airway_0.103.1 limma_3.25.18
## [17] DESeq2_1.9.51 RcppArmadillo_0.6.100.0.0
## [19] Rcpp_0.12.1 BSgenome.Hsapiens.UCSC.hg19_1.4.0
## [21] BSgenome_1.37.6 rtracklayer_1.29.28
## [23] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2 GenomicFeatures_1.21.33
## [25] AnnotationDbi_1.31.19 SummarizedExperiment_0.3.11
## [27] Biobase_2.29.1 GenomicRanges_1.21.32
## [29] GenomeInfoDb_1.5.16 microbenchmark_1.4-2
## [31] Biostrings_2.37.8 XVector_0.9.4
## [33] IRanges_2.3.26 S4Vectors_0.7.23
## [35] BiocGenerics_0.15.11 BiocStyle_1.7.9
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-6 RColorBrewer_1.1-2 httr_1.0.0
## [4] tools_3.2.2 R6_2.1.1 rpart_4.1-10
## [7] Hmisc_3.17-0 colorspace_1.2-6 nnet_7.3-11
## [10] gridExtra_2.0.0 graph_1.47.2 formatR_1.2.1
## [13] sandwich_2.3-4 labeling_0.3 scales_0.3.0
## [16] mvtnorm_1.0-3 genefilter_1.51.1 RBGL_1.45.1
## [19] stringr_1.0.0 digest_0.6.8 foreign_0.8-66
## [22] rmarkdown_0.8.1 htmltools_0.2.6 BiocInstaller_1.19.14
## [25] shiny_0.12.2 zoo_1.7-12 BiocParallel_1.3.54
## [28] acepack_1.3-3.3 RCurl_1.95-4.7 magrittr_1.5
## [31] Formula_1.2-1 futile.logger_1.4.1 munsell_0.4.2
## [34] proto_0.3-10 stringi_0.5-5 multcomp_1.4-1
## [37] yaml_2.1.13 MASS_7.3-44 zlibbioc_1.15.0
## [40] plyr_1.8.3 grid_3.2.2 lattice_0.20-33
## [43] splines_3.2.2 annotate_1.47.4 locfit_1.5-9.1
## [46] knitr_1.11 geneplotter_1.47.0 reshape2_1.4.1
## [49] codetools_0.2-14 futile.options_1.0.0 XML_3.98-1.3
## [52] evaluate_0.8 latticeExtra_0.6-26 lambda.r_1.1.7
## [55] httpuv_1.3.3 gtable_0.1.2 mime_0.4
## [58] xtable_1.7-4 survival_2.38-3 cluster_2.0.3
## [61] TH.data_1.0-6 interactiveDisplayBase_1.7.3