Authors: Martin Morgan (mtmorgan@fhcrc.org), Sonali Arora (sarora@fredhutch.org)
Date: 16 June, 2015
library(org.Hs.eg.db)
org <- org.Hs.eg.db # convenient alias
keytypes(org) # map from keys...
## [1] "ENTREZID" "PFAM" "IPI" "PROSITE"
## [5] "ACCNUM" "ALIAS" "ENZYME" "MAP"
## [9] "PATH" "PMID" "REFSEQ" "SYMBOL"
## [13] "UNIGENE" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS"
## [17] "GENENAME" "UNIPROT" "GO" "EVIDENCE"
## [21] "ONTOLOGY" "GOALL" "EVIDENCEALL" "ONTOLOGYALL"
## [25] "OMIM" "UCSCKG"
columns(org) # ...to columns
## [1] "ENTREZID" "PFAM" "IPI" "PROSITE"
## [5] "ACCNUM" "ALIAS" "CHR" "CHRLOC"
## [9] "CHRLOCEND" "ENZYME" "MAP" "PATH"
## [13] "PMID" "REFSEQ" "SYMBOL" "UNIGENE"
## [17] "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS" "GENENAME"
## [21] "UNIPROT" "GO" "EVIDENCE" "ONTOLOGY"
## [25] "GOALL" "EVIDENCEALL" "ONTOLOGYALL" "OMIM"
## [29] "UCSCKG"
mapIds()
enforces 1:1 mapping by default
sym <- keys(org, "SYMBOL")
mapIds(org, c("BRCA1", "PTEN"), "GENENAME", "SYMBOL")
## BRCA1 PTEN
## "breast cancer 1, early onset" "phosphatase and tensin homolog"
map <- mapIds(org, sym, "GENENAME", "SYMBOL")
length(map)
## [1] 56332
head(map)
## A1BG
## "alpha-1-B glycoprotein"
## A2M
## "alpha-2-macroglobulin"
## A2MP1
## "alpha-2-macroglobulin pseudogene 1"
## NAT1
## "N-acetyltransferase 1 (arylamine N-acetyltransferase)"
## NAT2
## "N-acetyltransferase 2 (arylamine N-acetyltransferase)"
## NATP
## "N-acetyltransferase pseudogene"
mapIds()
supports 1:many mapping
head(select(org, keys(org), "ALIAS"))
## ENTREZID ALIAS
## 1 1 A1B
## 2 1 ABG
## 3 1 GAB
## 4 1 HYST2477
## 5 1 A1BG
## 6 2 A2MD
head(mapIds(org, keys(org), "ALIAS", "ENTREZID"))
## 1 2 3 9 10 11
## "A1B" "A2MD" "A2MP" "AAC1" "AAC2" "AACP"
head(mapIds(org, keys(org), "ALIAS", "ENTREZID", multiVals="CharacterList"))
## CharacterList of length 6
## [["1"]] A1B ABG GAB HYST2477 A1BG
## [["2"]] A2MD CPAMD5 FWP007 S863-7 A2M
## [["3"]] A2MP A2MP1
## [["9"]] AAC1 MNAT NAT-1 NATI NAT1
## [["10"]] AAC2 NAT-2 PNAT NAT2
## [["11"]] AACP NATP1 NATP
str(head(mapIds(org, keys(org), "ALIAS", "ENTREZID", multiVals="list")))
## List of 6
## $ 1 : chr [1:5] "A1B" "ABG" "GAB" "HYST2477" ...
## $ 2 : chr [1:5] "A2MD" "CPAMD5" "FWP007" "S863-7" ...
## $ 3 : chr [1:2] "A2MP" "A2MP1"
## $ 9 : chr [1:5] "AAC1" "MNAT" "NAT-1" "NATI" ...
## $ 10: chr [1:4] "AAC2" "NAT-2" "PNAT" "NAT2"
## $ 11: chr [1:3] "AACP" "NATP1" "NATP"
select()
can be used to select several different columns
OrgDb
(e.g., org.Hs.eg.db)
OrganismDb
(e.g., Homo.sapiens) packages.
select()
, exons()
etc., without needing to navigate between packageslibrary(org.Hs.eg.db)
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
sym <- "BRCA1"
eid <- mapIds(org.Hs.eg.db, sym, "ENTREZID", "SYMBOL")
txid <- mapIds(txdb, eid, "TXNAME", "GENEID", multiVals="list")[[eid]]
txid
## [1] "uc010whl.2" "uc002icp.4" "uc010whm.2" "uc002icu.3" "uc010cyx.3"
## [6] "uc002icq.3" "uc002ict.3" "uc010whn.2" "uc010who.3" "uc010whp.2"
## [11] "uc010whq.1" "uc002idc.1" "uc010whr.1" "uc002idd.3" "uc002ide.1"
## [16] "uc010cyy.1" "uc010whs.1" "uc010cyz.2" "uc010cza.2" "uc010wht.1"
cds <- cdsBy(txdb, by="tx", use.names=TRUE)
cds[names(cds) %in% txid]
## GRangesList object of length 20:
## $uc010whl.2
## GRanges object with 22 ranges and 3 metadata columns:
## seqnames ranges strand | cds_id cds_name
## <Rle> <IRanges> <Rle> | <integer> <character>
## [1] chr17 [41276034, 41276113] - | 186246 <NA>
## [2] chr17 [41267743, 41267796] - | 186245 <NA>
## [3] chr17 [41258473, 41258550] - | 186243 <NA>
## ... ... ... ... ... ... ...
## [21] chr17 [41199660, 41199720] - | 186214 <NA>
## [22] chr17 [41197695, 41197819] - | 186212 <NA>
## exon_rank
## <integer>
## [1] 1
## [2] 2
## [3] 3
## ... ...
## [21] 21
## [22] 22
##
## ...
## <19 more elements>
## -------
## seqinfo: 93 sequences (1 circular) from hg19 genome
library(Homo.sapiens)
txid <- mapIds(Homo.sapiens, sym, "TXNAME", "SYMBOL", multiVals="list")[[sym]]
cds <- cdsBy(Homo.sapiens, by="tx", use.names=TRUE)
Possible to create custom versions, e.g., OrganismDbi::makeOrganismDbFromBiomart()
.
Web resource: http://biomart.org
Package: biomaRt
library(biomaRt)
head(listMarts()) # available marts, 52!
## biomart version
## 1 ensembl ENSEMBL GENES 80 (SANGER UK)
## 2 snp ENSEMBL VARIATION 80 (SANGER UK)
## 3 regulation ENSEMBL REGULATION 80 (SANGER UK)
## 4 vega VEGA 60 (SANGER UK)
## 5 fungi_mart_26 ENSEMBL FUNGI 26 (EBI UK)
## 6 fungi_variations_26 ENSEMBL FUNGI VARIATION 26 (EBI UK)
head(listDatasets(useMart("ensembl"))) # datasets in mart, 69!
## dataset
## 1 oanatinus_gene_ensembl
## 2 cporcellus_gene_ensembl
## 3 gaculeatus_gene_ensembl
## 4 lafricana_gene_ensembl
## 5 itridecemlineatus_gene_ensembl
## 6 choffmanni_gene_ensembl
## description version
## 1 Ornithorhynchus anatinus genes (OANA5) OANA5
## 2 Cavia porcellus genes (cavPor3) cavPor3
## 3 Gasterosteus aculeatus genes (BROADS1) BROADS1
## 4 Loxodonta africana genes (loxAfr3) loxAfr3
## 5 Ictidomys tridecemlineatus genes (spetri2) spetri2
## 6 Choloepus hoffmanni genes (choHof1) choHof1
ensembl <- # fully specified mart
useMart("ensembl", dataset = "hsapiens_gene_ensembl")
head(listFilters(ensembl), 3) # filters, 296!
## name description
## 1 chromosome_name Chromosome name
## 2 start Gene Start (bp)
## 3 end Gene End (bp)
myFilter <- "chromosome_name"
substr(filterOptions(myFilter, ensembl), 1, 50) # return values
## [1] "[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,2"
myValues <- c("21", "22")
head(listAttributes(ensembl), 3) # attributes
## name description
## 1 ensembl_gene_id Ensembl Gene ID
## 2 ensembl_transcript_id Ensembl Transcript ID
## 3 ensembl_peptide_id Ensembl Protein ID
myAttributes <- c("ensembl_gene_id","chromosome_name")
## assemble and query the mart
res <- getBM(myAttributes, myFilter, myValues, ensembl)
Package: AnnotationHub
library(AnnotationHub)
hub = AnnotationHub()
hub
## AnnotationHub with 16754 records
## # snapshotDate(): 2015-05-26
## # $dataprovider: UCSC, Ensembl, NCBI, Haemcode, Inparanoid8, Pazar, dbS...
## # $species: Homo sapiens, Mus musculus, Bos taurus, Pan troglodytes, Da...
## # $rdataclass: FaFile, GRanges, OrgDb, ChainFile, BigWigFile, Inparanoi...
## # additional mcols(): taxonomyid, genome, description, tags,
## # sourceurl, sourcetype
## # retrieve records with, e.g., 'object[["AH2"]]'
##
## title
## AH2 | Ailuropoda_melanoleuca.ailMel1.69.dna.toplevel.fa
## AH3 | Ailuropoda_melanoleuca.ailMel1.69.dna_rm.toplevel.fa
## AH4 | Ailuropoda_melanoleuca.ailMel1.69.dna_sm.toplevel.fa
## ... ...
## AH47935 | Xiphophorus_maculatus.Xipmac4.4.2.ncrna.fa
## AH47936 | Xiphophorus_maculatus.Xipmac4.4.2.pep.all.fa
query(hub, c("Ensembl", "80", "gtf"))
## AnnotationHub with 102 records
## # snapshotDate(): 2015-05-26
## # $dataprovider: Ensembl
## # $species: Gadus morhua, Oryzias latipes, Xiphophorus maculatus, Ailur...
## # $rdataclass: GRanges
## # additional mcols(): taxonomyid, genome, description, tags,
## # sourceurl, sourcetype
## # retrieve records with, e.g., 'object[["AH7535"]]'
##
## title
## AH7535 | Xiphophorus_maculatus.Xipmac4.4.2.69.gtf
## AH7554 | Gadus_morhua.gadMor1.70.gtf
## AH7575 | Oryzias_latipes.MEDAKA1.70.gtf
## ... ...
## AH47107 | Xenopus_tropicalis.JGI_4.2.80.gtf
## AH47108 | Xiphophorus_maculatus.Xipmac4.4.2.80.gtf
## ensgtf = display(hub) # visual choice
hub["AH47107"]
## AnnotationHub with 1 record
## # snapshotDate(): 2015-05-26
## # names(): AH47107
## # $dataprovider: Ensembl
## # $species: Xenopus tropicalis
## # $rdataclass: GRanges
## # $title: Xenopus_tropicalis.JGI_4.2.80.gtf
## # $description: Gene Annotation for Xenopus tropicalis
## # $taxonomyid: 8364
## # $genome: JGI_4.2
## # $sourcetype: GTF
## # $sourceurl: ftp://ftp.ensembl.org/pub/release-80/gtf/xenopus_tropical...
## # $sourcelastmodifieddate: 2015-05-01
## # $sourcesize: 8492889
## # $tags: GTF, ensembl, Gene, Transcript, Annotation
## # retrieve record with 'object[["AH47107"]]'
gtf <- hub[["AH47107"]]
gtf
## GRanges object with 581787 ranges and 19 metadata columns:
## seqnames ranges strand | source type
## <Rle> <IRanges> <Rle> | <factor> <factor>
## [1] GL172637.1 [34, 148] - | ensembl gene
## [2] GL172637.1 [34, 148] - | ensembl transcript
## [3] GL172637.1 [34, 148] - | ensembl exon
## ... ... ... ... ... ... ...
## [581786] GL180121.1 [1817, 1835] + | ensembl exon
## [581787] GL180121.1 [1817, 1835] + | ensembl CDS
## score phase gene_id gene_version gene_name
## <numeric> <integer> <character> <numeric> <character>
## [1] <NA> <NA> ENSXETG00000030486 1 U5
## [2] <NA> <NA> ENSXETG00000030486 1 U5
## [3] <NA> <NA> ENSXETG00000030486 1 U5
## ... ... ... ... ... ...
## [581786] <NA> <NA> ENSXETG00000033193 1 <NA>
## [581787] <NA> 1 ENSXETG00000033193 1 <NA>
## gene_source gene_biotype transcript_id
## <character> <character> <character>
## [1] ensembl snRNA <NA>
## [2] ensembl snRNA ENSXETT00000065882
## [3] ensembl snRNA ENSXETT00000065882
## ... ... ... ...
## [581786] ensembl protein_coding ENSXETT00000053735
## [581787] ensembl protein_coding ENSXETT00000053735
## transcript_version transcript_name transcript_source
## <numeric> <character> <character>
## [1] <NA> <NA> <NA>
## [2] 1 U5-201 ensembl
## [3] 1 U5-201 ensembl
## ... ... ... ...
## [581786] 2 <NA> ensembl
## [581787] 2 <NA> ensembl
## transcript_biotype exon_number exon_id exon_version
## <character> <numeric> <character> <numeric>
## [1] <NA> <NA> <NA> <NA>
## [2] snRNA <NA> <NA> <NA>
## [3] snRNA 1 ENSXETE00000393193 1
## ... ... ... ... ...
## [581786] protein_coding 3 ENSXETE00000416553 1
## [581787] protein_coding 3 <NA> <NA>
## protein_id protein_version
## <character> <numeric>
## [1] <NA> <NA>
## [2] <NA> <NA>
## [3] <NA> <NA>
## ... ... ...
## [581786] <NA> <NA>
## [581787] ENSXETP00000053735 2
## -------
## seqinfo: 2375 sequences from an unspecified genome; no seqlengths
## org.* data bases available from AnnotationHub
query(hub, "OrgDb")
## AnnotationHub with 1164 records
## # snapshotDate(): 2015-05-26
## # $dataprovider: NCBI, ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/
## # $species: Escherichia coli, Anopheles gambiae, Macaca mulatta, Pan tr...
## # $rdataclass: OrgDb
## # additional mcols(): taxonomyid, genome, description, tags,
## # sourceurl, sourcetype
## # retrieve records with, e.g., 'object[["AH12818"]]'
##
## title
## AH12818 | org.Pseudomonas_mendocina_NK-01.eg.sqlite
## AH12819 | org.Streptomyces_coelicolor_A3(2).eg.sqlite
## AH12820 | org.Cricetulus_griseus.eg.sqlite
## ... ...
## AH47000 | org.Dr.eg.db.sqlite
## AH47001 | org.Pf.plasmo.db.sqlite
mcols(query(hub, "OrgDb"))[, "species", drop=FALSE]
## DataFrame with 1164 rows and 1 column
## species
## <character>
## AH12818 Pseudomonas mendocina_NK-01
## AH12819 Streptomyces coelicolor_A3(2)
## AH12820 Cricetulus griseus
## ... ...
## AH47000 Danio rerio
## AH47001 Plasmodium falciparum
In Bioc-devel, metadata(gtf)
returns information about the AnnotationHub record gtf
is derived from.
org.*
packageslibrary(AnnotationHub)
hub = AnnotationHub()
query(hub, "OrgDb")
## AnnotationHub with 1164 records
## # snapshotDate(): 2015-05-26
## # $dataprovider: NCBI, ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/
## # $species: Escherichia coli, Anopheles gambiae, Macaca mulatta, Pan tr...
## # $rdataclass: OrgDb
## # additional mcols(): taxonomyid, genome, description, tags,
## # sourceurl, sourcetype
## # retrieve records with, e.g., 'object[["AH12818"]]'
##
## title
## AH12818 | org.Pseudomonas_mendocina_NK-01.eg.sqlite
## AH12819 | org.Streptomyces_coelicolor_A3(2).eg.sqlite
## AH12820 | org.Cricetulus_griseus.eg.sqlite
## ... ...
## AH47000 | org.Dr.eg.db.sqlite
## AH47001 | org.Pf.plasmo.db.sqlite
hub[["AH12818"]]
## OrgDb object:
## | DBSCHEMAVERSION: 2.1
## | DBSCHEMA: NOSCHEMA_DB
## | ORGANISM: Pseudomonas mendocina_NK-01
## | SPECIES: Pseudomonas mendocina_NK-01
## | CENTRALID: GID
## | TAXID: 1001585
## | Db type: OrgDb
## | Supporting package: AnnotationDbi
##
## Please see: help('select') for usage information
Discover and retrieve available GTF and FASTA resources for pufferfish
library(AnnotationHub)
hub = AnnotationHub()
query(hub, c("ensembl","release-80", "Takifugu"))
## AnnotationHub with 7 records
## # snapshotDate(): 2015-05-26
## # $dataprovider: Ensembl
## # $species: Takifugu rubripes
## # $rdataclass: FaFile, GRanges
## # additional mcols(): taxonomyid, genome, description, tags,
## # sourceurl, sourcetype
## # retrieve records with, e.g., 'object[["AH47101"]]'
##
## title
## AH47101 | Takifugu_rubripes.FUGU4.80.gtf
## AH47475 | Takifugu_rubripes.FUGU4.cdna.all.fa
## AH47476 | Takifugu_rubripes.FUGU4.dna_rm.toplevel.fa
## ... ...
## AH47479 | Takifugu_rubripes.FUGU4.ncrna.fa
## AH47480 | Takifugu_rubripes.FUGU4.pep.all.fa
gtf <- hub[["AH47101"]]
dna <- hub[["AH47477"]]
gtf
## GRanges object with 1388717 ranges and 19 metadata columns:
## seqnames ranges strand | source type
## <Rle> <IRanges> <Rle> | <factor> <factor>
## [1] scaffold_1 [10422, 11354] - | ensembl gene
## [2] scaffold_1 [10422, 11354] - | ensembl transcript
## [3] scaffold_1 [10422, 11354] - | ensembl exon
## ... ... ... ... ... ... ...
## [1388716] scaffold_16598 [1807, 1936] + | ensembl exon
## [1388717] scaffold_16598 [1807, 1936] + | ensembl CDS
## score phase gene_id gene_version
## <numeric> <integer> <character> <numeric>
## [1] <NA> <NA> ENSTRUG00000003702 1
## [2] <NA> <NA> ENSTRUG00000003702 1
## [3] <NA> <NA> ENSTRUG00000003702 1
## ... ... ... ... ...
## [1388716] <NA> <NA> ENSTRUG00000004123 1
## [1388717] <NA> 1 ENSTRUG00000004123 1
## gene_source gene_biotype transcript_id
## <character> <character> <character>
## [1] ensembl protein_coding <NA>
## [2] ensembl protein_coding ENSTRUT00000008740
## [3] ensembl protein_coding ENSTRUT00000008740
## ... ... ... ...
## [1388716] ensembl protein_coding ENSTRUT00000009819
## [1388717] ensembl protein_coding ENSTRUT00000009819
## transcript_version transcript_source transcript_biotype
## <numeric> <character> <character>
## [1] <NA> <NA> <NA>
## [2] 1 ensembl protein_coding
## [3] 1 ensembl protein_coding
## ... ... ... ...
## [1388716] 1 ensembl protein_coding
## [1388717] 1 ensembl protein_coding
## exon_number exon_id exon_version protein_id
## <numeric> <character> <numeric> <character>
## [1] <NA> <NA> <NA> <NA>
## [2] <NA> <NA> <NA> <NA>
## [3] 1 ENSTRUE00000055472 1 <NA>
## ... ... ... ... ...
## [1388716] 4 ENSTRUE00000062911 1 <NA>
## [1388717] 4 <NA> <NA> ENSTRUP00000009764
## protein_version gene_name transcript_name
## <numeric> <character> <character>
## [1] <NA> <NA> <NA>
## [2] <NA> <NA> <NA>
## [3] <NA> <NA> <NA>
## ... ... ... ...
## [1388716] <NA> IWS1 (1 of 2) IWS1 (1 of 2)-201
## [1388717] 1 IWS1 (1 of 2) IWS1 (1 of 2)-201
## -------
## seqinfo: 2056 sequences from an unspecified genome; no seqlengths
dna
## class: FaFile
## path: /home/mtmorgan/.AnnotationHub/53323
## index: /home/mtmorgan/.AnnotationHub/53324
## isOpen: FALSE
## yieldSize: NA
head(seqlevels(dna))
## [1] "scaffold_1" "scaffold_2" "scaffold_3" "scaffold_4" "scaffold_5"
## [6] "scaffold_6"
Create a TxDb
instance
library(GenomicFeatures)
txdb <- makeTxDbFromGRanges(gtf)
## saveDb(txdb, "TxDb.Takifugu.Ensembl.80.sqlite")
## loadDb("TxDb.Takifugu.Ensembl.80.sqlite")
Use txdb
library(Rsamtools) # for getSeq,FaFile-method
## Loading required package: XVector
## Loading required package: Biostrings
exons <- exons(txdb)
getSeq(dna, exons)
## A DNAStringSet instance of length 322622
## width seq names
## [1] 68 GCTAGCGTAGCTTAACCA...TGAAAAGTCCCGCAGGCA MT
## [2] 947 CAAAAGCTTGGTCCTGAC...AGGTGCACTTGGAAAAAC MT
## [3] 74 CGGAGCATAGCTTAACAG...AAATCGAGCTGCCCCGAC MT
## ... ... ...
## [322621] 93 CCGTCAGAGCAGCAGGTG...GACGGCGACTACCATCAG scaffold_9956
## [322622] 198 AGGATTGAGCTGCGCTCC...GAGGTGGACGGGGTCAAG scaffold_9956