Contents

The material in this course requires R version 3.2 and Bioconductor version 3.2

stopifnot(
    getRversion() >= '3.2' && getRversion() < '3.3',
    BiocInstaller::biocVersion() == "3.2"
)

1 Annotation

1.1 Model organisms

1.1.1 Gene model annotation resources – TxDb packages

e.g., `{r Biocpkg(“TxDb.Hsapiens.UCSC.hg19.knownGene”)}

library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
txdb
## TxDb object:
## # Db type: TxDb
## # Supporting package: GenomicFeatures
## # Data source: UCSC
## # Genome: hg19
## # Organism: Homo sapiens
## # Taxonomy ID: 9606
## # UCSC Table: knownGene
## # Resource URL: http://genome.ucsc.edu/
## # Type of Gene ID: Entrez Gene ID
## # Full dataset: yes
## # miRBase build ID: GRCh37
## # transcript_nrow: 82960
## # exon_nrow: 289969
## # cds_nrow: 237533
## # Db created by: GenomicFeatures package from Bioconductor
## # Creation time: 2015-10-07 18:11:28 +0000 (Wed, 07 Oct 2015)
## # GenomicFeatures version at creation time: 1.21.30
## # RSQLite version at creation time: 1.0.0
## # DBSCHEMAVERSION: 1.1
methods(class=class(txdb))
##  [1] annotatedDataFrameFrom asBED                  asGFF                  as.list               
##  [5] assayData              assayData<-            cdsByOverlaps          cdsBy                 
##  [9] cds                    coerce                 columns                combine               
## [13] contents               dbconn                 dbfile                 dbInfo                
## [17] dbmeta                 dbschema               disjointExons          distance              
## [21] $<-                    $                      exonsByOverlaps        exonsBy               
## [25] exons                  ExpressionSet          extractUpstreamSeqs    featureNames<-        
## [29] featureNames           fiveUTRsByTranscript   genes                  initialize            
## [33] intronsByTranscript    isActiveSeq<-          isActiveSeq            isNA                  
## [37] keys                   keytypes               locateVariants         mapIds                
## [41] mappedkeys             mapToTranscripts       metadata               microRNAs             
## [45] nhit                   organism               predictCoding          promoters             
## [49] revmap                 sample                 sampleNames<-          sampleNames           
## [53] saveDb                 select                 seqinfo                seqlevels0            
## [57] seqlevels<-            show                   species                storageMode<-         
## [61] storageMode            summarizeVariants      taxonomyId             threeUTRsByTranscript 
## [65] transcriptsByOverlaps  transcriptsBy          transcripts            tRNAs                 
## [69] updateObject          
## see '?methods' for accessing help and source code

TxDb objects

Accessing gene models

  • exons(), transcripts(), genes(), cds() (coding sequence)
  • promoters() & friends
  • exonsBy() & friends – exons by gene, transcript, …
  • ‘select’ interface: keytypes(), columns(), keys(), select(), mapIds()
exons(txdb)
## GRanges object with 289969 ranges and 1 metadata column:
##                  seqnames         ranges strand   |   exon_id
##                     <Rle>      <IRanges>  <Rle>   | <integer>
##        [1]           chr1 [11874, 12227]      +   |         1
##        [2]           chr1 [12595, 12721]      +   |         2
##        [3]           chr1 [12613, 12721]      +   |         3
##        [4]           chr1 [12646, 12697]      +   |         4
##        [5]           chr1 [13221, 14409]      +   |         5
##        ...            ...            ...    ... ...       ...
##   [289965] chrUn_gl000241 [35706, 35859]      -   |    289965
##   [289966] chrUn_gl000241 [36711, 36875]      -   |    289966
##   [289967] chrUn_gl000243 [11501, 11530]      +   |    289967
##   [289968] chrUn_gl000243 [13608, 13637]      +   |    289968
##   [289969] chrUn_gl000247 [ 5787,  5816]      -   |    289969
##   -------
##   seqinfo: 93 sequences (1 circular) from hg19 genome
exonsBy(txdb, "tx")
## GRangesList object of length 82960:
## $1 
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand |   exon_id   exon_name exon_rank
##          <Rle>      <IRanges>  <Rle> | <integer> <character> <integer>
##   [1]     chr1 [11874, 12227]      + |         1        <NA>         1
##   [2]     chr1 [12613, 12721]      + |         3        <NA>         2
##   [3]     chr1 [13221, 14409]      + |         5        <NA>         3
## 
## $2 
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand | exon_id exon_name exon_rank
##   [1]     chr1 [11874, 12227]      + |       1      <NA>         1
##   [2]     chr1 [12595, 12721]      + |       2      <NA>         2
##   [3]     chr1 [13403, 14409]      + |       6      <NA>         3
## 
## $3 
## GRanges object with 3 ranges and 3 metadata columns:
##       seqnames         ranges strand | exon_id exon_name exon_rank
##   [1]     chr1 [11874, 12227]      + |       1      <NA>         1
##   [2]     chr1 [12646, 12697]      + |       4      <NA>         2
##   [3]     chr1 [13221, 14409]      + |       5      <NA>         3
## 
## ...
## <82957 more elements>
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome

1.1.2 Whole-genome sequence – BSgenome packages

e.g., `{r Biocpkg(“BSgenome.Hsapiens.UCSC.hg19”)}

library(BSgenome.Hsapiens.UCSC.hg19)
genome <- BSgenome.Hsapiens.UCSC.hg19
getSeq(genome, exons(txdb)[1:100])
##   A DNAStringSet instance of length 100
##       width seq
##   [1]   354 CTTGCCGTCAGCCTTTTCTTTGACCTCTTCTTTCTGTTCATGT...AAAGATGAGTGAGAGCATCAACTTCTCTCACAACCTAGGCCA
##   [2]   127 GCTCCTGTCTCCCCCCAGGTGTGTGGTGATGCCAGGCATGCCC...GACTTGGATCACACTCTTGTGAGTGTCCCCAGTGTTGCAGAG
##   [3]   109 GTGTGTGGTGATGCCAGGCATGCCCTTCCCCAGCATCAGGTCT...GACTTGGATCACACTCTTGTGAGTGTCCCCAGTGTTGCAGAG
##   [4]    52 CATCAGGTCTCCAGAGCTGCAGAAGACGACGGCCGACTTGGATCACACTCTT
##   [5]  1189 GCAGGGCCATCAGGCACCAAAGGGATTCTGCCAGCATAGTGCT...ACAGCTAGAGATCCTTTATTAAAAGCACACTGTTGGTTTCTG
##   ...   ... ...
##  [96]   251 CCCGTCCCCGGCGCGGCCCGCGCGCTCCTCCGCCGCCTCTCGC...ATCCTCAACGTGGACCCGGTGCAGCACACGTACTCCTGCAAG
##  [97]   262 GTTCGGGTCTGGCGGTACTTGAAGGGCAAAGACCTGGTGGCCC...TCACCCTGCGGAACCTGGAGGAGGTGGAGTTCTGTGTGGAAG
##  [98]    48 ATAAACCCGGGACCCACTTCACTCCAGTGCCTCCGACGCCTCCTGATG
##  [99]   216 CGTGCCGGGGAATGCTGTGCGGCTTCGGCGCCGTGTGCGAGCC...GCCAGCAGCGCCGCATCCGCCTGCTCAGCCGCGGGCCGTGCG
## [100]   225 GCTCGCGGGACCCCTGCTCCAACGTGACCTGCAGCTTCGGCAG...CCCGCCAGGAGAATGTCTTCAAGAAGTTCGACGGCCCTTGTG

1.1.3 Identifier mapping – OrgDb

library(org.Hs.eg.db)
org.Hs.eg.db
## OrgDb object:
## | DBSCHEMAVERSION: 2.1
## | Db type: OrgDb
## | Supporting package: AnnotationDbi
## | DBSCHEMA: HUMAN_DB
## | ORGANISM: Homo sapiens
## | SPECIES: Human
## | EGSOURCEDATE: 2015-Sep27
## | EGSOURCENAME: Entrez Gene
## | EGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
## | CENTRALID: EG
## | TAXID: 9606
## | GOSOURCENAME: Gene Ontology
## | GOSOURCEURL: ftp://ftp.geneontology.org/pub/go/godatabase/archive/latest-lite/
## | GOSOURCEDATE: 20150919
## | GOEGSOURCEDATE: 2015-Sep27
## | GOEGSOURCENAME: Entrez Gene
## | GOEGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA
## | KEGGSOURCENAME: KEGG GENOME
## | KEGGSOURCEURL: ftp://ftp.genome.jp/pub/kegg/genomes
## | KEGGSOURCEDATE: 2011-Mar15
## | GPSOURCENAME: UCSC Genome Bioinformatics (Homo sapiens)
## | GPSOURCEURL: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19
## | GPSOURCEDATE: 2010-Mar22
## | ENSOURCEDATE: 2015-Jul16
## | ENSOURCENAME: Ensembl
## | ENSOURCEURL: ftp://ftp.ensembl.org/pub/current_fasta
## | UPSOURCENAME: Uniprot
## | UPSOURCEURL: http://www.UniProt.org/
## | UPSOURCEDATE: Fri Oct  9 19:54:03 2015
## 
## Please see: help('select') for usage information

OrgDb objects

  • Curated resources, underlying sqlite data base, like TxDb
  • make your own: AnnotationForge (but see the AnnotationHub, below!)
  • ‘select’ interface: keytypes(), columns(), keys(), select(), mapIds()

select()

  • Vector of keys, desired columns
  • Specification of key type

    select(org.Hs.eg.db, c("BRCA1", "PTEN"), c("ENTREZID", "GENENAME"), "SYMBOL")
    ## 'select()' returned 1:1 mapping between keys and columns
    ##   SYMBOL ENTREZID                       GENENAME
    ## 1  BRCA1      672   breast cancer 1, early onset
    ## 2   PTEN     5728 phosphatase and tensin homolog
    keytypes(org.Hs.eg.db)
    ##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS" "ENTREZID"    
    ##  [7] "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"     "GO"           "GOALL"       
    ## [13] "IPI"          "MAP"          "OMIM"         "ONTOLOGY"     "ONTOLOGYALL"  "PATH"        
    ## [19] "PFAM"         "PMID"         "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"      
    ## [25] "UNIGENE"      "UNIPROT"
    columns(org.Hs.eg.db)
    ##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT"  "ENSEMBLTRANS" "ENTREZID"    
    ##  [7] "ENZYME"       "EVIDENCE"     "EVIDENCEALL"  "GENENAME"     "GO"           "GOALL"       
    ## [13] "IPI"          "MAP"          "OMIM"         "ONTOLOGY"     "ONTOLOGYALL"  "PATH"        
    ## [19] "PFAM"         "PMID"         "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"      
    ## [25] "UNIGENE"      "UNIPROT"

Related functionality

  • mapIds() – special case for mapping from 1 identifier to another
  • OrganismDb objects: combined org.*, TxDb.*, and other annotation resources for easy access

    library(Homo.sapiens)
    ## Loading required package: OrganismDbi
    ## Loading required package: GO.db
    ## 
    ## Now getting the GODb Object directly
    ## Now getting the OrgDb Object directly
    ## Now getting the TxDb Object directly
    select(Homo.sapiens, c("BRCA1", "PTEN"), 
           c("TXNAME", "TXCHROM", "TXSTART", "TXEND"), 
           "SYMBOL")
    ## 'select()' returned 1:many mapping between keys and columns
    ##    SYMBOL     TXNAME TXCHROM  TXSTART    TXEND
    ## 1   BRCA1 uc010whl.2   chr17 41196312 41276132
    ## 2   BRCA1 uc002icp.4   chr17 41196312 41277340
    ## 3   BRCA1 uc010whm.2   chr17 41196312 41277340
    ## 4   BRCA1 uc002icu.3   chr17 41196312 41277468
    ## 5   BRCA1 uc010cyx.3   chr17 41196312 41277468
    ## 6   BRCA1 uc002icq.3   chr17 41196312 41277500
    ## 7   BRCA1 uc002ict.3   chr17 41196312 41277500
    ## 8   BRCA1 uc010whn.2   chr17 41196312 41277500
    ## 9   BRCA1 uc010who.3   chr17 41196312 41277500
    ## 10  BRCA1 uc010whp.2   chr17 41196312 41322420
    ## 11  BRCA1 uc010whq.1   chr17 41215350 41256973
    ## 12  BRCA1 uc002idc.1   chr17 41215350 41277468
    ## 13  BRCA1 uc010whr.1   chr17 41215350 41277468
    ## 14  BRCA1 uc002idd.3   chr17 41243117 41276132
    ## 15  BRCA1 uc002ide.1   chr17 41243452 41256973
    ## 16  BRCA1 uc010cyy.1   chr17 41243452 41277340
    ## 17  BRCA1 uc010whs.1   chr17 41243452 41277468
    ## 18  BRCA1 uc010cyz.2   chr17 41243452 41277500
    ## 19  BRCA1 uc010cza.2   chr17 41243452 41277500
    ## 20  BRCA1 uc010wht.1   chr17 41243452 41277500
    ## 21   PTEN uc001kfb.3   chr10 89623195 89728532
    ## 22   PTEN uc021pvw.1   chr10 89623195 89728532

1.2 Other annotation resources – biomaRt, AnnotationHub

1.2.1 biomaRt & friends

http://biomart.org; Bioconductor package biomaRt

## NEEDS INTERNET ACCESS !!
library(biomaRt)
head(listMarts(), 3)                      ## list marts
head(listDatasets(useMart("ensembl")), 3) ## mart datasets
ensembl <-                                ## fully specified mart
    useMart("ensembl", dataset = "hsapiens_gene_ensembl")

head(listFilters(ensembl), 3)             ## filters
myFilter <- "chromosome_name"
substr(filterOptions(myFilter, ensembl), 1, 50) ## return values
myValues <- c("21", "22")
head(listAttributes(ensembl), 3)          ## attributes
myAttributes <- c("ensembl_gene_id","chromosome_name")

## assemble and query the mart
res <- getBM(attributes =  myAttributes, filters =  myFilter,
             values =  myValues, mart = ensembl)

Other internet resources

1.2.2 AnnotationHub

  • Bioconductor package AnnotationHub
  • Meant to ease use of ‘consortium’ and other genome-scale resources
  • Simplify discovery, retrieval, local management, and import to standard Bioconductor representations

Example: Ensembl ‘GTF’ files to R / Bioconductor GRanges and TxDb

library(AnnotationHub)
hub <- AnnotationHub()
hub
query(hub, c("Ensembl", "80", "gtf"))
## ensgtf = display(hub)                   # visual choice
hub["AH47107"]
gtf <- hub[["AH47107"]]
gtf
txdb <- GenomicFeatures::makeTxDbFromGRanges(gtf)

Example: non-model organism OrgDb packages

library(AnnotationHub)
hub <- AnnotationHub()
query(hub, "OrgDb")

Example: Map Roadmap epigenomic marks to hg38

  • Roadmap BED file as GRanges

    library(AnnotationHub)
    hub <- AnnotationHub()
    query(hub , c("EpigenomeRoadMap", "E126", "H3K4ME2"))
    E126 <- hub[["AH29817"]]
  • UCSC ‘liftOver’ file to map coordinates

    query(hub , c("hg19", "hg38", "chainfile"))
    chain <- hub[["AH14150"]]
  • lift over – possibly one-to-many mapping, so GRanges to GRangesList

    library(rtracklayer)
    E126hg38 <- liftOver(E126, chain)
    E126hg38

2 Annotating Variants

Example: read variants from a VCF file, and annotate with respect to a known gene model

## input variants
library(VariantAnnotation)
fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
vcf <- readVcf(fl, "hg19")
seqlevels(vcf) <- "chr22"
## known gene model
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
coding <- locateVariants(rowRanges(vcf),
    TxDb.Hsapiens.UCSC.hg19.knownGene,
    CodingVariants())
head(coding)
## GRanges object with 6 ranges and 9 metadata columns:
##     seqnames               ranges strand | LOCATION  LOCSTART    LOCEND   QUERYID        TXID
##        <Rle>            <IRanges>  <Rle> | <factor> <integer> <integer> <integer> <character>
##   1    chr22 [50301422, 50301422]      - |   coding       939       939        24       75253
##   2    chr22 [50301476, 50301476]      - |   coding       885       885        25       75253
##   3    chr22 [50301488, 50301488]      - |   coding       873       873        26       75253
##   4    chr22 [50301494, 50301494]      - |   coding       867       867        27       75253
##   5    chr22 [50301584, 50301584]      - |   coding       777       777        28       75253
##   6    chr22 [50302962, 50302962]      - |   coding       698       698        57       75253
##             CDSID      GENEID       PRECEDEID        FOLLOWID
##     <IntegerList> <character> <CharacterList> <CharacterList>
##   1        218562       79087                                
##   2        218562       79087                                
##   3        218562       79087                                
##   4        218562       79087                                
##   5        218562       79087                                
##   6        218563       79087                                
##   -------
##   seqinfo: 1 sequence from an unspecified genome; no seqlengths

3 Resources

Acknowledgements

3.1 sessionInfo()

sessionInfo()
## R version 3.2.2 (2015-08-14)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Debian GNU/Linux stretch/sid
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8       
##  [4] LC_COLLATE=en_US.UTF-8     LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                  LC_ADDRESS=C              
## [10] LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] Homo.sapiens_1.3.1                      GO.db_3.2.2                            
##  [3] OrganismDbi_1.11.43                     biomaRt_2.25.3                         
##  [5] AnnotationHub_2.1.45                    VariantAnnotation_1.15.34              
##  [7] RNAseqData.HNRNPC.bam.chr14_0.7.0       GenomicAlignments_1.5.18               
##  [9] Rsamtools_1.21.21                       ALL_1.11.0                             
## [11] org.Hs.eg.db_3.2.3                      RSQLite_1.0.0                          
## [13] DBI_0.3.1                               ggplot2_1.0.1                          
## [15] airway_0.103.1                          limma_3.25.18                          
## [17] DESeq2_1.9.51                           RcppArmadillo_0.6.100.0.0              
## [19] Rcpp_0.12.1                             BSgenome.Hsapiens.UCSC.hg19_1.4.0      
## [21] BSgenome_1.37.6                         rtracklayer_1.29.28                    
## [23] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2 GenomicFeatures_1.21.33                
## [25] AnnotationDbi_1.31.19                   SummarizedExperiment_0.3.11            
## [27] Biobase_2.29.1                          GenomicRanges_1.21.32                  
## [29] GenomeInfoDb_1.5.16                     microbenchmark_1.4-2                   
## [31] Biostrings_2.37.8                       XVector_0.9.4                          
## [33] IRanges_2.3.26                          S4Vectors_0.7.23                       
## [35] BiocGenerics_0.15.11                    BiocStyle_1.7.9                        
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-6                 RColorBrewer_1.1-2           httr_1.0.0                  
##  [4] tools_3.2.2                  R6_2.1.1                     rpart_4.1-10                
##  [7] Hmisc_3.17-0                 colorspace_1.2-6             nnet_7.3-11                 
## [10] gridExtra_2.0.0              graph_1.47.2                 formatR_1.2.1               
## [13] sandwich_2.3-4               labeling_0.3                 scales_0.3.0                
## [16] mvtnorm_1.0-3                genefilter_1.51.1            RBGL_1.45.1                 
## [19] stringr_1.0.0                digest_0.6.8                 foreign_0.8-66              
## [22] rmarkdown_0.8.1              htmltools_0.2.6              BiocInstaller_1.19.14       
## [25] shiny_0.12.2                 zoo_1.7-12                   BiocParallel_1.3.54         
## [28] acepack_1.3-3.3              RCurl_1.95-4.7               magrittr_1.5                
## [31] Formula_1.2-1                futile.logger_1.4.1          munsell_0.4.2               
## [34] proto_0.3-10                 stringi_0.5-5                multcomp_1.4-1              
## [37] yaml_2.1.13                  MASS_7.3-44                  zlibbioc_1.15.0             
## [40] plyr_1.8.3                   grid_3.2.2                   lattice_0.20-33             
## [43] splines_3.2.2                annotate_1.47.4              locfit_1.5-9.1              
## [46] knitr_1.11                   geneplotter_1.47.0           reshape2_1.4.1              
## [49] codetools_0.2-14             futile.options_1.0.0         XML_3.98-1.3                
## [52] evaluate_0.8                 latticeExtra_0.6-26          lambda.r_1.1.7              
## [55] httpuv_1.3.3                 gtable_0.1.2                 mime_0.4                    
## [58] xtable_1.7-4                 survival_2.38-3              cluster_2.0.3               
## [61] TH.data_1.0-6                interactiveDisplayBase_1.7.3