## Now getting the GODb Object directly
## Now getting the OrgDb Object directly
## Now getting the TxDb Object directly
Author: Martin Morgan (mtmorgan@fredhutch.org)
Date: 7 September, 2015
Back to Workshop Outline
The material in this document requires R version 3.2 and Bioconductor version 3.1
stopifnot(
getRversion() >= '3.2' && getRversion() < '3.3',
BiocInstaller::biocVersion() >= "3.1"
)
TxDb
packagesTxDb.Hsapiens.UCSC.hg19.knownGene
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
txdb
## TxDb object:
## # Db type: TxDb
## # Supporting package: GenomicFeatures
## # Data source: UCSC
## # Genome: hg19
## # Organism: Homo sapiens
## # Taxonomy ID: 9606
## # UCSC Table: knownGene
## # Resource URL: http://genome.ucsc.edu/
## # Type of Gene ID: Entrez Gene ID
## # Full dataset: yes
## # miRBase build ID: GRCh37
## # transcript_nrow: 82960
## # exon_nrow: 289969
## # cds_nrow: 237533
## # Db created by: GenomicFeatures package from Bioconductor
## # Creation time: 2015-08-20 18:12:27 -0700 (Thu, 20 Aug 2015)
## # GenomicFeatures version at creation time: 1.21.16
## # RSQLite version at creation time: 1.0.0
## # DBSCHEMAVERSION: 1.1
methods(class=class(txdb))
## [1] $ $<- ExpressionSet annotatedDataFrameFrom
## [5] as.list asBED asGFF assayData
## [9] assayData<- cds cdsBy cdsByOverlaps
## [13] coerce columns combine contents
## [17] dbInfo dbconn dbfile dbmeta
## [21] dbschema disjointExons distance exons
## [25] exonsBy exonsByOverlaps extractUpstreamSeqs featureNames
## [29] featureNames<- fiveUTRsByTranscript genes initialize
## [33] intronsByTranscript isActiveSeq isActiveSeq<- isNA
## [37] keys keytypes locateVariants mapIds
## [41] mapToTranscripts mappedkeys metadata microRNAs
## [45] nhit organism predictCoding promoters
## [49] revmap sample sampleNames sampleNames<-
## [53] saveDb select seqinfo seqinfo<-
## [57] seqlevels0 show species storageMode
## [61] storageMode<- summarizeVariants tRNAs taxonomyId
## [65] threeUTRsByTranscript transcripts transcriptsBy transcriptsByOverlaps
## [69] updateObject
## see '?methods' for accessing help and source code
TxDb
objects
dbfile(txdb)
GenomicFeatures::makeTxDbFrom*()
Accessing gene models
exons()
, transcripts()
, genes()
, cds()
(coding sequence)promoters()
& friendsexonsBy()
& friends – exons by gene, transcript, …keytypes()
, columns()
, keys()
, select()
,
mapIds()
exons(txdb)
## GRanges object with 289969 ranges and 1 metadata column:
## seqnames ranges strand | exon_id
## <Rle> <IRanges> <Rle> | <integer>
## [1] chr1 [11874, 12227] + | 1
## [2] chr1 [12595, 12721] + | 2
## [3] chr1 [12613, 12721] + | 3
## [4] chr1 [12646, 12697] + | 4
## [5] chr1 [13221, 14409] + | 5
## ... ... ... ... ... ...
## [289965] chrY [27607404, 27607432] - | 277746
## [289966] chrY [27635919, 27635954] - | 277747
## [289967] chrY [59358329, 59359508] - | 277748
## [289968] chrY [59360007, 59360115] - | 277749
## [289969] chrY [59360501, 59360854] - | 277750
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
exonsBy(txdb, "tx")
## GRangesList object of length 82960:
## $1
## GRanges object with 3 ranges and 3 metadata columns:
## seqnames ranges strand | exon_id exon_name exon_rank
## <Rle> <IRanges> <Rle> | <integer> <character> <integer>
## [1] chr1 [11874, 12227] + | 1 <NA> 1
## [2] chr1 [12613, 12721] + | 3 <NA> 2
## [3] chr1 [13221, 14409] + | 5 <NA> 3
##
## $2
## GRanges object with 3 ranges and 3 metadata columns:
## seqnames ranges strand | exon_id exon_name exon_rank
## [1] chr1 [11874, 12227] + | 1 <NA> 1
## [2] chr1 [12595, 12721] + | 2 <NA> 2
## [3] chr1 [13403, 14409] + | 6 <NA> 3
##
## $3
## GRanges object with 3 ranges and 3 metadata columns:
## seqnames ranges strand | exon_id exon_name exon_rank
## [1] chr1 [11874, 12227] + | 1 <NA> 1
## [2] chr1 [12646, 12697] + | 4 <NA> 2
## [3] chr1 [13221, 14409] + | 5 <NA> 3
##
## ...
## <82957 more elements>
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
OrgDb
library(org.Hs.eg.db)
org.Hs.eg.db
## OrgDb object:
## | DBSCHEMAVERSION: 2.1
## | Db type: OrgDb
## | Supporting package: AnnotationDbi
## | DBSCHEMA: HUMAN_DB
## | ORGANISM: Homo sapiens
## | SPECIES: Human
## | EGSOURCEDATE: 2015-Aug11
## | EGSOURCENAME: Entrez Gene
## | EGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
## | CENTRALID: EG
## | TAXID: 9606
## | GOSOURCENAME: Gene Ontology
## | GOSOURCEURL: ftp://ftp.geneontology.org/pub/go/godatabase/archive/latest-lite/
## | GOSOURCEDATE: 20150808
## | GOEGSOURCEDATE: 2015-Aug11
## | GOEGSOURCENAME: Entrez Gene
## | GOEGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
## | KEGGSOURCENAME: KEGG GENOME
## | KEGGSOURCEURL: ftp://ftp.genome.jp/pub/kegg/genomes
## | KEGGSOURCEDATE: 2011-Mar15
## | GPSOURCENAME: UCSC Genome Bioinformatics (Homo sapiens)
## | GPSOURCEURL: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19
## | GPSOURCEDATE: 2010-Mar22
## | ENSOURCEDATE: 2015-Jul16
## | ENSOURCENAME: Ensembl
## | ENSOURCEURL: ftp://ftp.ensembl.org/pub/current_fasta
## | UPSOURCENAME: Uniprot
## | UPSOURCEURL: http://www.UniProt.org/
## | UPSOURCEDATE: Thu Aug 20 15:34:08 2015
##
## Please see: help('select') for usage information
OrgDb
objects
TxDb
keytypes()
, columns()
, keys()
, select()
,
mapIds()
select()
Specification of key type
select(org.Hs.eg.db, c("BRCA1", "PTEN"), c("ENTREZID", "GENENAME"), "SYMBOL")
## 'select()' returned 1:1 mapping between keys and columns
## SYMBOL ENTREZID GENENAME
## 1 BRCA1 672 breast cancer 1, early onset
## 2 PTEN 5728 phosphatase and tensin homolog
keytypes(org.Hs.eg.db)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS" "ENTREZID"
## [7] "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME" "GO" "GOALL"
## [13] "IPI" "MAP" "OMIM" "ONTOLOGY" "ONTOLOGYALL" "PATH"
## [19] "PFAM" "PMID" "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"
## [25] "UNIGENE" "UNIPROT"
columns(org.Hs.eg.db)
## [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS" "ENTREZID"
## [7] "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME" "GO" "GOALL"
## [13] "IPI" "MAP" "OMIM" "ONTOLOGY" "ONTOLOGYALL" "PATH"
## [19] "PFAM" "PMID" "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG"
## [25] "UNIGENE" "UNIPROT"
Related functionality
mapIds()
– special case for mapping from 1 identifier to anotherOrganismDb
objects: combined org.*
, TxDb.*
, and other
annotation resources for easy access
library(Homo.sapiens)
select(Homo.sapiens, c("BRCA1", "PTEN"),
c("TXNAME", "TXCHROM", "TXSTART", "TXEND"),
"SYMBOL")
## 'select()' returned 1:many mapping between keys and columns
## SYMBOL TXNAME TXCHROM TXSTART TXEND
## 1 BRCA1 uc010whl.2 chr17 41196312 41276132
## 2 BRCA1 uc002icp.4 chr17 41196312 41277340
## 3 BRCA1 uc010whm.2 chr17 41196312 41277340
## 4 BRCA1 uc002icu.3 chr17 41196312 41277468
## 5 BRCA1 uc010cyx.3 chr17 41196312 41277468
## 6 BRCA1 uc002icq.3 chr17 41196312 41277500
## 7 BRCA1 uc002ict.3 chr17 41196312 41277500
## 8 BRCA1 uc010whn.2 chr17 41196312 41277500
## 9 BRCA1 uc010who.3 chr17 41196312 41277500
## 10 BRCA1 uc010whp.2 chr17 41196312 41322420
## 11 BRCA1 uc010whq.1 chr17 41215350 41256973
## 12 BRCA1 uc002idc.1 chr17 41215350 41277468
## 13 BRCA1 uc010whr.1 chr17 41215350 41277468
## 14 BRCA1 uc002idd.3 chr17 41243117 41276132
## 15 BRCA1 uc002ide.1 chr17 41243452 41256973
## 16 BRCA1 uc010cyy.1 chr17 41243452 41277340
## 17 BRCA1 uc010whs.1 chr17 41243452 41277468
## 18 BRCA1 uc010cyz.2 chr17 41243452 41277500
## 19 BRCA1 uc010cza.2 chr17 41243452 41277500
## 20 BRCA1 uc010wht.1 chr17 41243452 41277500
## 21 PTEN uc001kfb.3 chr10 89623195 89728532
## 22 PTEN uc021pvw.1 chr10 89623195 89728532
biomaRt
, AnnotationHub
http://biomart.org; Bioconductor package biomaRt
## NEEDS INTERNET ACCESS !!
library(biomaRt)
head(listMarts(), 3) ## list marts
head(listDatasets(useMart("ensembl")), 3) ## mart datasets
ensembl <- ## fully specified mart
useMart("ensembl", dataset = "hsapiens_gene_ensembl")
head(listFilters(ensembl), 3) ## filters
myFilter <- "chromosome_name"
substr(filterOptions(myFilter, ensembl), 1, 50) ## return values
myValues <- c("21", "22")
head(listAttributes(ensembl), 3) ## attributes
myAttributes <- c("ensembl_gene_id","chromosome_name")
## assemble and query the mart
res <- getBM(attributes = myAttributes, filters = myFilter,
values = myValues, mart = ensembl)
Other internet resources
Example: Ensembl 'GTF' files to R / Bioconductor GRanges and TxDb
library(AnnotationHub)
hub <- AnnotationHub()
hub
query(hub, c("Ensembl", "80", "gtf"))
## ensgtf = display(hub) # visual choice
hub["AH47107"]
gtf <- hub[["AH47107"]]
gtf
txdb <- GenomicFeatures::makeTxDbFromGRanges(gtf)
Example: non-model organism OrgDb
packages
library(AnnotationHub)
hub <- AnnotationHub()
query(hub, "OrgDb")
Example: Map Roadmap epigenomic marks to hg38
Roadmap BED file as GRanges
library(AnnotationHub)
hub <- AnnotationHub()
query(hub , c("EpigenomeRoadMap", "E126", "H3K4ME2"))
E126 <- hub[["AH29817"]]
UCSC 'liftOver' file to map coordinates
query(hub , c("hg19", "hg38", "chainfile"))
chain <- hub[["AH14150"]]
lift over – possibly one-to-many mapping, so GRanges to GRangesList
library(rtracklayer)
E126hg38 <- liftOver(E126, chain)
E126hg38
Example: read variants from a VCF file, and annotate with respect to a known gene model
## input variants
library(VariantAnnotation)
fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
vcf <- readVcf(fl, "hg19")
seqlevels(vcf) <- "chr22"
## known gene model
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
coding <- locateVariants(rowRanges(vcf),
TxDb.Hsapiens.UCSC.hg19.knownGene,
CodingVariants())
head(coding)
## GRanges object with 6 ranges and 9 metadata columns:
## seqnames ranges strand | LOCATION LOCSTART LOCEND QUERYID TXID
## <Rle> <IRanges> <Rle> | <factor> <integer> <integer> <integer> <character>
## 1 chr22 [50301422, 50301422] - | coding 939 939 24 75253
## 2 chr22 [50301476, 50301476] - | coding 885 885 25 75253
## 3 chr22 [50301488, 50301488] - | coding 873 873 26 75253
## 4 chr22 [50301494, 50301494] - | coding 867 867 27 75253
## 5 chr22 [50301584, 50301584] - | coding 777 777 28 75253
## 6 chr22 [50302962, 50302962] - | coding 698 698 57 75253
## CDSID GENEID PRECEDEID FOLLOWID
## <IntegerList> <character> <CharacterList> <CharacterList>
## 1 218562 79087
## 2 218562 79087
## 3 218562 79087
## 4 218562 79087
## 5 218562 79087
## 6 218563 79087
## -------
## seqinfo: 1 sequence from an unspecified genome; no seqlengths