## Now getting the GODb Object directly
## Now getting the OrgDb Object directly
## Now getting the TxDb Object directly

Author: Martin Morgan (mtmorgan@fredhutch.org)
Date: 7 September, 2015
Back to Workshop Outline

The material in this document requires R version 3.2 and Bioconductor version 3.1

stopifnot(
    getRversion() >= '3.2' && getRversion() < '3.3',
    BiocInstaller::biocVersion() >= "3.1"
)

Annotation

Gene model annotation resources – `TxDb` packages

TxDb.Hsapiens.UCSC.hg19.knownGene

library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
txdb

## TxDb object:
## # Db type: TxDb
## # Supporting package: GenomicFeatures
## # Data source: UCSC
## # Genome: hg19
## # Organism: Homo sapiens
## # Taxonomy ID: 9606
## # UCSC Table: knownGene
## # Resource URL: http://genome.ucsc.edu/
## # Type of Gene ID: Entrez Gene ID
## # Full dataset: yes
## # miRBase build ID: GRCh37
## # transcript_nrow: 82960
## # exon_nrow: 289969
## # cds_nrow: 237533
## # Db created by: GenomicFeatures package from Bioconductor
## # Creation time: 2015-08-20 18:12:27 -0700 (Thu, 20 Aug 2015)
## # GenomicFeatures version at creation time: 1.21.16
## # RSQLite version at creation time: 1.0.0
## # DBSCHEMAVERSION: 1.1

methods(class=class(txdb))

##  [1] $                      $<-                    ExpressionSet          annotatedDataFrameFrom
##  [5] as.list                asBED                  asGFF                  assayData             
##  [9] assayData<-            cds                    cdsBy                  cdsByOverlaps         
## [13] coerce                 columns                combine                contents              
## [17] dbInfo                 dbconn                 dbfile                 dbmeta                
## [21] dbschema               disjointExons          distance               exons                 
## [25] exonsBy                exonsByOverlaps        extractUpstreamSeqs    featureNames          
## [29] featureNames<-         fiveUTRsByTranscript   genes                  initialize            
## [33] intronsByTranscript    isActiveSeq            isActiveSeq<-          isNA                  
## [37] keys                   keytypes               locateVariants         mapIds                
## [41] mapToTranscripts       mappedkeys             metadata               microRNAs             
## [45] nhit                   organism               predictCoding          promoters             
## [49] revmap                 sample                 sampleNames            sampleNames<-         
## [53] saveDb                 select                 seqinfo                seqinfo<-             
## [57] seqlevels0             show                   species                storageMode           
## [61] storageMode<-          summarizeVariants      tRNAs                  taxonomyId            
## [65] threeUTRsByTranscript  transcripts            transcriptsBy          transcriptsByOverlaps 
## [69] updateObject          
## see '?methods' for accessing help and source code

TxDb objects

Curatated annotation resources – http://bioconductor.org/packages/biocViews
Underlying sqlite database – dbfile(txdb)
Make your own: GenomicFeatures::makeTxDbFrom*()

Accessing gene models

exons(), transcripts(), genes(), cds() (coding sequence)
promoters() & friends
exonsBy() & friends – exons by gene, transcript, …
'select' interface: keytypes(), columns(), keys(), select(), mapIds()

exons(txdb)

## GRanges object with 289969 ranges and 1 metadata column:
##            seqnames               ranges strand   |   exon_id
##               <Rle>            <IRanges>  <Rle>   | <integer>
##        [1]     chr1       [11874, 12227]      +   |         1
##        [2]     chr1       [12595, 12721]      +   |         2
##        [3]     chr1       [12613, 12721]      +   |         3
##        [4]     chr1       [12646, 12697]      +   |         4
##        [5]     chr1       [13221, 14409]      +   |         5
##        ...      ...                  ...    ... ...       ...
##   [289965]     chrY [27607404, 27607432]      -   |    277746
##   [289966]     chrY [27635919, 27635954]      -   |    277747
##   [289967]     chrY [59358329, 59359508]      -   |    277748
##   [289968]     chrY [59360007, 59360115]      -   |    277749
##   [289969]     chrY [59360501, 59360854]      -   |    277750
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome

exonsBy(txdb, "tx")

## GRangesList object of length 82960:
## $1 
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand |   exon_id   exon_name exon_rank
##          <Rle>      <IRanges>  <Rle> | <integer> <character> <integer>
##   [1]     chr1 [11874, 12227]      + |         1        <NA>         1
##   [2]     chr1 [12613, 12721]      + |         3        <NA>         2
##   [3]     chr1 [13221, 14409]      + |         5        <NA>         3
## 
## $2 
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand | exon_id exon_name exon_rank
##   [1]     chr1 [11874, 12227]      + |       1      <NA>         1
##   [2]     chr1 [12595, 12721]      + |       2      <NA>         2
##   [3]     chr1 [13403, 14409]      + |       6      <NA>         3
## 
## $3 
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand | exon_id exon_name exon_rank
##   [1]     chr1 [11874, 12227]      + |       1      <NA>         1
##   [2]     chr1 [12646, 12697]      + |       4      <NA>         2
##   [3]     chr1 [13221, 14409]      + |       5      <NA>         3
## 
## ...
## <82957 more elements>
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome

Identifier mapping – `OrgDb`

library(org.Hs.eg.db)
org.Hs.eg.db

## OrgDb object:
## | DBSCHEMAVERSION: 2.1
## | Db type: OrgDb
## | Supporting package: AnnotationDbi
## | DBSCHEMA: HUMAN_DB
## | ORGANISM: Homo sapiens
## | SPECIES: Human
## | EGSOURCEDATE: 2015-Aug11
## | EGSOURCENAME: Entrez Gene
## | EGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
## | CENTRALID: EG
## | TAXID: 9606
## | GOSOURCENAME: Gene Ontology
## | GOSOURCEURL: ftp://ftp.geneontology.org/pub/go/godatabase/archive/latest-lite/
## | GOSOURCEDATE: 20150808
## | GOEGSOURCEDATE: 2015-Aug11
## | GOEGSOURCENAME: Entrez Gene
## | GOEGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
## | KEGGSOURCENAME: KEGG GENOME
## | KEGGSOURCEURL: ftp://ftp.genome.jp/pub/kegg/genomes
## | KEGGSOURCEDATE: 2011-Mar15
## | GPSOURCENAME: UCSC Genome Bioinformatics (Homo sapiens)
## | GPSOURCEURL: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19
## | GPSOURCEDATE: 2010-Mar22
## | ENSOURCEDATE: 2015-Jul16
## | ENSOURCENAME: Ensembl
## | ENSOURCEURL: ftp://ftp.ensembl.org/pub/current_fasta
## | UPSOURCENAME: Uniprot
## | UPSOURCEURL: http://www.UniProt.org/
## | UPSOURCEDATE: Thu Aug 20 15:34:08 2015

## 
## Please see: help('select') for usage information

OrgDb objects

Curated resources, underlying sqlite data base, like TxDb
make your own: AnnotationForge (but see the AnnotationHub, below!)
'select' interface: keytypes(), columns(), keys(), select(), mapIds()

select()

Vector of keys, desired columns

Specification of key type

select(org.Hs.eg.db, c("BRCA1", "PTEN"), c("ENTREZID", "GENENAME"), "SYMBOL")

## 'select()' returned 1:1 mapping between keys and columns

##   SYMBOL ENTREZID                       GENENAME
## 1  BRCA1      672   breast cancer 1, early onset
## 2   PTEN     5728 phosphatase and tensin homolog

keytypes(org.Hs.eg.db)

##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS" "ENTREZID"    
##  [7] "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"     "GO"           "GOALL"       
## [13] "IPI"          "MAP"          "OMIM"         "ONTOLOGY"     "ONTOLOGYALL"  "PATH"        
## [19] "PFAM"         "PMID"         "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"      
## [25] "UNIGENE"      "UNIPROT"

columns(org.Hs.eg.db)

##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS" "ENTREZID"    
##  [7] "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"     "GO"           "GOALL"       
## [13] "IPI"          "MAP"          "OMIM"         "ONTOLOGY"     "ONTOLOGYALL"  "PATH"        
## [19] "PFAM"         "PMID"         "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"      
## [25] "UNIGENE"      "UNIPROT"

Related functionality

mapIds() – special case for mapping from 1 identifier to another

OrganismDb objects: combined org.*, TxDb.*, and other annotation resources for easy access

library(Homo.sapiens)
select(Homo.sapiens, c("BRCA1", "PTEN"), 
       c("TXNAME", "TXCHROM", "TXSTART", "TXEND"), 
       "SYMBOL")

## 'select()' returned 1:many mapping between keys and columns

##    SYMBOL     TXNAME TXCHROM  TXSTART    TXEND
## 1   BRCA1 uc010whl.2   chr17 41196312 41276132
## 2   BRCA1 uc002icp.4   chr17 41196312 41277340
## 3   BRCA1 uc010whm.2   chr17 41196312 41277340
## 4   BRCA1 uc002icu.3   chr17 41196312 41277468
## 5   BRCA1 uc010cyx.3   chr17 41196312 41277468
## 6   BRCA1 uc002icq.3   chr17 41196312 41277500
## 7   BRCA1 uc002ict.3   chr17 41196312 41277500
## 8   BRCA1 uc010whn.2   chr17 41196312 41277500
## 9   BRCA1 uc010who.3   chr17 41196312 41277500
## 10  BRCA1 uc010whp.2   chr17 41196312 41322420
## 11  BRCA1 uc010whq.1   chr17 41215350 41256973
## 12  BRCA1 uc002idc.1   chr17 41215350 41277468
## 13  BRCA1 uc010whr.1   chr17 41215350 41277468
## 14  BRCA1 uc002idd.3   chr17 41243117 41276132
## 15  BRCA1 uc002ide.1   chr17 41243452 41256973
## 16  BRCA1 uc010cyy.1   chr17 41243452 41277340
## 17  BRCA1 uc010whs.1   chr17 41243452 41277468
## 18  BRCA1 uc010cyz.2   chr17 41243452 41277500
## 19  BRCA1 uc010cza.2   chr17 41243452 41277500
## 20  BRCA1 uc010wht.1   chr17 41243452 41277500
## 21   PTEN uc001kfb.3   chr10 89623195 89728532
## 22   PTEN uc021pvw.1   chr10 89623195 89728532

Other annotation resources – `biomaRt`, `AnnotationHub`

biomaRt & friends

http://biomart.org; Bioconductor package biomaRt

## NEEDS INTERNET ACCESS !!
library(biomaRt)
head(listMarts(), 3)                      ## list marts
head(listDatasets(useMart("ensembl")), 3) ## mart datasets
ensembl <-                                ## fully specified mart
    useMart("ensembl", dataset = "hsapiens_gene_ensembl")

head(listFilters(ensembl), 3)             ## filters
myFilter <- "chromosome_name"
substr(filterOptions(myFilter, ensembl), 1, 50) ## return values
myValues <- c("21", "22")
head(listAttributes(ensembl), 3)          ## attributes
myAttributes <- c("ensembl_gene_id","chromosome_name")

## assemble and query the mart
res <- getBM(attributes =  myAttributes, filters =  myFilter,
             values =  myValues, mart = ensembl)

Other internet resources

biomaRt Ensembl and other annotations
PSICQUIC Protein interactions
uniprot.ws Protein annotations
KEGGREST KEGG pathways
SRAdb Sequencing experiments
rtracklayer USCS genome tracks
GEOquery Array and other data
ArrayExpress Array and other data
…

AnnotationHub

Bioconductor package AnnotationHub
Meant to ease use of 'consortium' and other genome-scale resources
Simplify discovery, retrieval, local management, and import to standard Bioconductor representations

Example: Ensembl 'GTF' files to R / Bioconductor GRanges and TxDb

library(AnnotationHub)
hub <- AnnotationHub()
hub
query(hub, c("Ensembl", "80", "gtf"))
## ensgtf = display(hub)                   # visual choice
hub["AH47107"]
gtf <- hub[["AH47107"]]
gtf
txdb <- GenomicFeatures::makeTxDbFromGRanges(gtf)

Example: non-model organism OrgDb packages

library(AnnotationHub)
hub <- AnnotationHub()
query(hub, "OrgDb")

Example: Map Roadmap epigenomic marks to hg38

Roadmap BED file as GRanges

library(AnnotationHub)
hub <- AnnotationHub()
query(hub , c("EpigenomeRoadMap", "E126", "H3K4ME2"))
E126 <- hub[["AH29817"]]

UCSC 'liftOver' file to map coordinates

query(hub , c("hg19", "hg38", "chainfile"))
chain <- hub[["AH14150"]]

lift over – possibly one-to-many mapping, so GRanges to GRangesList
```
library(rtracklayer)
E126hg38 <- liftOver(E126, chain)
E126hg38
```

Annotating Variants

Example: read variants from a VCF file, and annotate with respect to a known gene model

## input variants
library(VariantAnnotation)
fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
vcf <- readVcf(fl, "hg19")
seqlevels(vcf) <- "chr22"
## known gene model
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
coding <- locateVariants(rowRanges(vcf),
    TxDb.Hsapiens.UCSC.hg19.knownGene,
    CodingVariants())
head(coding)

## GRanges object with 6 ranges and 9 metadata columns:
##     seqnames               ranges strand | LOCATION  LOCSTART    LOCEND   QUERYID        TXID
##        <Rle>            <IRanges>  <Rle> | <factor> <integer> <integer> <integer> <character>
##   1    chr22 [50301422, 50301422]      - |   coding       939       939        24       75253
##   2    chr22 [50301476, 50301476]      - |   coding       885       885        25       75253
##   3    chr22 [50301488, 50301488]      - |   coding       873       873        26       75253
##   4    chr22 [50301494, 50301494]      - |   coding       867       867        27       75253
##   5    chr22 [50301584, 50301584]      - |   coding       777       777        28       75253
##   6    chr22 [50302962, 50302962]      - |   coding       698       698        57       75253
##             CDSID      GENEID       PRECEDEID        FOLLOWID
##     <IntegerList> <character> <CharacterList> <CharacterList>
##   1        218562       79087                                
##   2        218562       79087                                
##   3        218562       79087                                
##   4        218562       79087                                
##   5        218562       79087                                
##   6        218563       79087                                
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths