###################################################
### chunk number 1: orgDemo
###################################################
##load the package
library("org.Hs.eg.db")

##look what we just loaded
ls(2)

##Just like user accessible functions, all Mappings have a manual page which
##will show you what to expect as well as where the data came from
# ?org.Hs.egCHRLOC

##Have a peak:
as.list(org.Hs.egCHRLOC[1:4])

##for the stop locations use:
as.list(org.Hs.egCHRLOCEND[1:4])

##or can use get, mget etc. with the entrez gene ID
EGs = c("10","100","1000")
mget(EGs, org.Hs.egCHRLOC, ifnotfound=NA)
mget(EGs, org.Hs.egCHRLOCEND, ifnotfound=NA)

##You can also retrieve ENSEMBL IDs using this package
mget(EGs, org.Hs.egENSEMBL, ifnotfound=NA)

##And GO IDs
mget(EGs[1], org.Hs.egGO, ifnotfound=NA)

##And KEGG pathway IDs etc.
mget(EGs, org.Hs.egPATH, ifnotfound=NA)


##Other convenient functions
##Lkeys, RKeys, mappedLkeys(),mappedRkeys()
Lkeys(org.Hs.egENZYME)[110:112]
Rkeys(org.Hs.egENZYME)[110:112]

##Left keys and right keys can be mapped or un-mapped.
length(Lkeys(org.Hs.egPATH))
length(Rkeys(org.Hs.egPATH))
length(mappedLkeys(org.Hs.egPATH))
length(mappedRkeys(org.Hs.egPATH))

##keys() and mappedkeys() both return the left keys 
length(keys(org.Hs.egPATH))
length(mappedkeys(org.Hs.egPATH))


##revmap() can USUALLY be used to reverse the direction of a mapping.
PATHIDs = unlist(mget(EGs[1], org.Hs.egPATH, ifnotfound=NA))[[1]]
PATHIDs
mget(as.character(PATHIDs), revmap(org.Hs.egPATH), ifnotfound=NA)


##toTable
toTable(revmap(org.Hs.egPATH))[1:4,]


##special symbols: packagename(), _dbfile(), _dbinfo(), and _dbconn()
ls(2)

##package info
org.Hs.eg()

##location of the database file
org.Hs.eg_dbfile()

##Data frame with Information 
org.Hs.eg_dbInfo()

##Connection object
org.Hs.eg_dbconn()


###################################################
### chunk number 2: dbExamples
###################################################

##simple examples of using the DBI interface
library(org.Hs.eg.db)
dbconn <- org.Hs.eg_dbconn()
sql <- "SELECT * FROM genes LIMIT 4;"
result <- dbGetQuery(dbconn, sql)
result


##Here is a simple join of the type that generates the mappings
sql <- "SELECT * FROM genes,kegg WHERE genes._id=kegg._id;"
result <- dbGetQuery(dbconn, sql)
result[1:4,]


##simple example of a join across DBs
##get the entrez genes in humans and in mouse which share a kegg pathway ID.

## 1st we must get the pathway to the database file 
path = system.file("extdata", "org.Mm.eg.sqlite", package = "org.Mm.eg.db" )

##then we have to attach the database
sql <- paste("ATTACH '",path,"' AS Mm;",sep="")
dbSendQuery(dbconn, sql)

##Then we can make our query
sql <- "SELECT g.gene_id, k.path_id, mk.path_id, mg.gene_id 
        FROM genes AS g, kegg AS k, Mm.kegg AS mk, Mm.genes AS mg 
        WHERE g._id=k._id AND mg._id=mk._id AND k.path_id=mk.path_id
        limit 1000;"
result <- dbGetQuery(dbconn, sql)
result[1:4,]


##can even use SQLite specific stuff
sql = "SELECT * FROM sqlite_master;"
result = dbGetQuery(dbconn, sql)
head(result)
  

###################################################
### chunk number 3: chipExamples
###################################################
##Things work very similarly to an org package
library(hgu95av2.db)
ls(2)

##Gene symbols and aliases
probes = keys(hgu95av2SYMBOL)[1:4]
probes

##You can use the probes in the same way you would have used Entrez Genes
##Here is a mapping that retrieves NCBIs official gene symbols
mget(probes, hgu95av2SYMBOL, ifnotfound=NA)

##And here is a mapping that retrieves all known gene symbols
mget(probes, revmap(hgu95av2ALIAS2PROBE), ifnotfound=NA)


##Be careful with Aliases as they are NOT unique
##This is easier to demonstrate with an org package.
##But its even more of a problem in a chip package.
library(org.Hs.eg.db)
EG2AliasList = as.list(org.Hs.egALIAS2EG)
EG2AliasList["KAT"]

##We can calculate out how many things match each Alias like this:
lengths = unlist(lapply(EG2AliasList, length))
lengths["KAT"]
table(lengths)

##And just out of curiosity:
lengths[lengths==36]
EG2AliasList["VH"]


###################################################
### chunk number 4: customChipExample
###################################################
##1st you need the appropriate DBO package
library(AnnotationDbi)
available.db0pkgs()

##Then you need to get that package
##But Don't actually do this step (if you are copy/pasting along)
# source("http://bioconductor.org/BiocLite.R")
# biocLite("human.db0")

##Then you can get a tab-delimited file that has your probes paired with IDs
hcg110_IDs = system.file("extdata", "hcg110_ID", package="AnnotationDbi")
head(read.delim(hcg110_IDs,header=FALSE))

##For this example lets not actually write anything to the file sys.
tmpout = tempdir()

##Then you can make the package
makeHUMANCHIP_DB(affy=FALSE,
                 prefix="hcg110",
                 fileName=hcg110_IDs,
                 baseMapType="gb",
                 outputDir = tmpout,
                 version="1.0.0",
                 manufacturer = "Affymetrix",
                 chipName = "Human Cancer G110 Array",
                 manufacturerUrl = "http://www.affymetrix.com")


###################################################
### chunk number 5: GOExamples
###################################################
##You may have already noticed that the organism packages have some GO
##information in them already.  This mapping represents the relationship
##between these EG IDs and the GO IDs.  For example: org.Hs.egGO,
##org.Hs.egGO2EG, and org.Hs.egGO2ALLEGS
ls("package:org.Hs.eg.db")[22:24]
##There are two types of such mappings. org.Hs.egGO will map GO terms to
##entrez gene IDs, while org.Hs.egGO2ALLEGS maps GO terms and relevant child
##terms to specific entrez gene IDs The man pages will help you remember which
##is which.


##All the other GO information is found in GO.db
library(GO.db)
ls("package:GO.db")

##The mapping that you usually want is the one that describes all the terms
keys = keys(GOTERM[1:500])
x = mget(as.character(keys), GOTERM, ifnotfound=NA)
x[30]

##GO is a directed acyclic graph, so there are many parent and child
##relationships among terms.
##Therefore GO.db also has mappings to tell you about the parent and child
##terms as well as all the ancestor or offspring terms
mget(as.character(names(x[30])),GOBPCHILDREN, ifnotfound=NA)


###################################################
### chunk number 6: biomaRtExamples
###################################################
##Getting the data from biomaRt:

library("biomaRt")
##Choose a database
listMarts()[1:5,]

##Get the current ensembl database.
ensembl = useMart("ensembl")

##List the datasets therein
listDatasets(ensembl)[1:10,]
##Then set up so that you use that for this session 
##(we will choose the mouse one from NCBI build 37.1):
ensembl = useDataset("mmusculus_gene_ensembl",mart=ensembl)

##List attributes
attributes = listAttributes(ensembl)
attributes[1:10,]

##And filters
filters = listFilters(ensembl)
filters[1:10,]

##Some entrez gene IDs
EGs = c("18392","18414","56513")

##1st a Simple example to just get some gene names:
getBM(attributes = "external_gene_id", 
      filters = "entrezgene", 
      values = EGs, 
      mart=ensembl)


###################################################
### chunk number 7: biomartDemoContinued
###################################################
##Transcript starts and ends:
getBM(attributes = c("entrezgene","transcript_start","transcript_end"), 
      filters = "entrezgene", 
      values = EGs, 
      mart=ensembl)


###################################################
### chunk number 8: SessionInfo
###################################################
sessionInfo()