################################################### ### chunk number 1: orgDemo ################################################### ##load the package library("org.Hs.eg.db") ##look what we just loaded ls(2) ##Just like user accessible functions, all Mappings have a manual page which ##will show you what to expect as well as where the data came from # ?org.Hs.egCHRLOC ##Have a peak: as.list(org.Hs.egCHRLOC[1:4]) ##for the stop locations use: as.list(org.Hs.egCHRLOCEND[1:4]) ##or can use get, mget etc. with the entrez gene ID EGs = c("10","100","1000") mget(EGs, org.Hs.egCHRLOC, ifnotfound=NA) mget(EGs, org.Hs.egCHRLOCEND, ifnotfound=NA) ##You can also retrieve ENSEMBL IDs using this package mget(EGs, org.Hs.egENSEMBL, ifnotfound=NA) ##And GO IDs mget(EGs[1], org.Hs.egGO, ifnotfound=NA) ##And KEGG pathway IDs etc. mget(EGs, org.Hs.egPATH, ifnotfound=NA) ##Other convenient functions ##Lkeys, RKeys, mappedLkeys(),mappedRkeys() Lkeys(org.Hs.egENZYME)[110:112] Rkeys(org.Hs.egENZYME)[110:112] ##Left keys and right keys can be mapped or un-mapped. length(Lkeys(org.Hs.egPATH)) length(Rkeys(org.Hs.egPATH)) length(mappedLkeys(org.Hs.egPATH)) length(mappedRkeys(org.Hs.egPATH)) ##keys() and mappedkeys() both return the left keys length(keys(org.Hs.egPATH)) length(mappedkeys(org.Hs.egPATH)) ##revmap() can USUALLY be used to reverse the direction of a mapping. PATHIDs = unlist(mget(EGs[1], org.Hs.egPATH, ifnotfound=NA))[[1]] PATHIDs mget(as.character(PATHIDs), revmap(org.Hs.egPATH), ifnotfound=NA) ##toTable toTable(revmap(org.Hs.egPATH))[1:4,] ##special symbols: packagename(), _dbfile(), _dbinfo(), and _dbconn() ls(2) ##package info org.Hs.eg() ##location of the database file org.Hs.eg_dbfile() ##Data frame with Information org.Hs.eg_dbInfo() ##Connection object org.Hs.eg_dbconn() ################################################### ### chunk number 2: dbExamples ################################################### ##simple examples of using the DBI interface library(org.Hs.eg.db) dbconn <- org.Hs.eg_dbconn() sql <- "SELECT * FROM genes LIMIT 4;" result <- dbGetQuery(dbconn, sql) result ##Here is a simple join of the type that generates the mappings sql <- "SELECT * FROM genes,kegg WHERE genes._id=kegg._id;" result <- dbGetQuery(dbconn, sql) result[1:4,] ##simple example of a join across DBs ##get the entrez genes in humans and in mouse which share a kegg pathway ID. ## 1st we must get the pathway to the database file path = system.file("extdata", "org.Mm.eg.sqlite", package = "org.Mm.eg.db" ) ##then we have to attach the database sql <- paste("ATTACH '",path,"' AS Mm;",sep="") dbSendQuery(dbconn, sql) ##Then we can make our query sql <- "SELECT g.gene_id, k.path_id, mk.path_id, mg.gene_id FROM genes AS g, kegg AS k, Mm.kegg AS mk, Mm.genes AS mg WHERE g._id=k._id AND mg._id=mk._id AND k.path_id=mk.path_id limit 1000;" result <- dbGetQuery(dbconn, sql) result[1:4,] ##can even use SQLite specific stuff sql = "SELECT * FROM sqlite_master;" result = dbGetQuery(dbconn, sql) head(result) ################################################### ### chunk number 3: chipExamples ################################################### ##Things work very similarly to an org package library(hgu95av2.db) ls(2) ##Gene symbols and aliases probes = keys(hgu95av2SYMBOL)[1:4] probes ##You can use the probes in the same way you would have used Entrez Genes ##Here is a mapping that retrieves NCBIs official gene symbols mget(probes, hgu95av2SYMBOL, ifnotfound=NA) ##And here is a mapping that retrieves all known gene symbols mget(probes, revmap(hgu95av2ALIAS2PROBE), ifnotfound=NA) ##Be careful with Aliases as they are NOT unique ##This is easier to demonstrate with an org package. ##But its even more of a problem in a chip package. library(org.Hs.eg.db) EG2AliasList = as.list(org.Hs.egALIAS2EG) EG2AliasList["KAT"] ##We can calculate out how many things match each Alias like this: lengths = unlist(lapply(EG2AliasList, length)) lengths["KAT"] table(lengths) ##And just out of curiosity: lengths[lengths==36] EG2AliasList["VH"] ################################################### ### chunk number 4: customChipExample ################################################### ##1st you need the appropriate DBO package library(AnnotationDbi) available.db0pkgs() ##Then you need to get that package ##But Don't actually do this step (if you are copy/pasting along) # source("http://bioconductor.org/BiocLite.R") # biocLite("human.db0") ##Then you can get a tab-delimited file that has your probes paired with IDs hcg110_IDs = system.file("extdata", "hcg110_ID", package="AnnotationDbi") head(read.delim(hcg110_IDs,header=FALSE)) ##For this example lets not actually write anything to the file sys. tmpout = tempdir() ##Then you can make the package makeHUMANCHIP_DB(affy=FALSE, prefix="hcg110", fileName=hcg110_IDs, baseMapType="gb", outputDir = tmpout, version="1.0.0", manufacturer = "Affymetrix", chipName = "Human Cancer G110 Array", manufacturerUrl = "http://www.affymetrix.com") ################################################### ### chunk number 5: GOExamples ################################################### ##You may have already noticed that the organism packages have some GO ##information in them already. This mapping represents the relationship ##between these EG IDs and the GO IDs. For example: org.Hs.egGO, ##org.Hs.egGO2EG, and org.Hs.egGO2ALLEGS ls("package:org.Hs.eg.db")[22:24] ##There are two types of such mappings. org.Hs.egGO will map GO terms to ##entrez gene IDs, while org.Hs.egGO2ALLEGS maps GO terms and relevant child ##terms to specific entrez gene IDs The man pages will help you remember which ##is which. ##All the other GO information is found in GO.db library(GO.db) ls("package:GO.db") ##The mapping that you usually want is the one that describes all the terms keys = keys(GOTERM[1:500]) x = mget(as.character(keys), GOTERM, ifnotfound=NA) x[30] ##GO is a directed acyclic graph, so there are many parent and child ##relationships among terms. ##Therefore GO.db also has mappings to tell you about the parent and child ##terms as well as all the ancestor or offspring terms mget(as.character(names(x[30])),GOBPCHILDREN, ifnotfound=NA) ################################################### ### chunk number 6: biomaRtExamples ################################################### ##Getting the data from biomaRt: library("biomaRt") ##Choose a database listMarts()[1:5,] ##Get the current ensembl database. ensembl = useMart("ensembl") ##List the datasets therein listDatasets(ensembl)[1:10,] ##Then set up so that you use that for this session ##(we will choose the mouse one from NCBI build 37.1): ensembl = useDataset("mmusculus_gene_ensembl",mart=ensembl) ##List attributes attributes = listAttributes(ensembl) attributes[1:10,] ##And filters filters = listFilters(ensembl) filters[1:10,] ##Some entrez gene IDs EGs = c("18392","18414","56513") ##1st a Simple example to just get some gene names: getBM(attributes = "external_gene_id", filters = "entrezgene", values = EGs, mart=ensembl) ################################################### ### chunk number 7: biomartDemoContinued ################################################### ##Transcript starts and ends: getBM(attributes = c("entrezgene","transcript_start","transcript_end"), filters = "entrezgene", values = EGs, mart=ensembl) ################################################### ### chunk number 8: SessionInfo ################################################### sessionInfo()