eQTL tutorial, Bioc 2014
VJ Carey stvjc at channing dot harvard dot edu
```{r extracode,echo=FALSE,error=FALSE,message=FALSE}
suppressMessages({
exs2se = function (x, assayname = "exprs",
fngetter = function(z) rownames(exprs(z)),
annoDb,
probekeytype = "PROBEID", duphandler = function(z) {
if (any(isd <- duplicated(z[, "ENTREZID"])))
return(z[!isd, , drop = FALSE])
z
}, signIsStrand = TRUE, ucsdChrnames = TRUE)
{
annopk = annoDb
stopifnot(is(annopk, "ChipDb") | is(annopk, "OrgDb"))
fn = fngetter(x)
locd = duphandler(fulls <- select(annopk, keytype = probekeytype,
keys = fn, columns = c("CHR", "CHRLOC", "CHRLOCEND")))
nfulls = na.omit(fulls)
nmultiaddr = nrow(nfulls) - length(fn)
rownames(locd) = locd[, probekeytype]
locd = na.omit(locd)
dropped = setdiff(fn, rownames(locd))
if (length(dropped) > 0)
warning(paste("there were", length(dropped), "addresses dropped owing to missing address information in bioc annotation"))
locd = locd[intersect(rownames(locd), fn), ]
strand = rep("*", nrow(locd))
if (signIsStrand)
strand = ifelse(locd[, "CHRLOC"] > 0, "+", "-")
chpref = ""
if (ucsdChrnames)
chpref = "chr"
rowd = GRanges(paste0(chpref, locd[, "CHR"]), IRanges(abs(locd[,
"CHRLOC"]), abs(locd[, "CHRLOCEND"])), strand = strand)
names(rowd) = rownames(locd)
metadata(rowd)$dropped = dropped
metadata(rowd)$nmultiaddr = nmultiaddr
hasrowd = match(names(rowd), fn, nomatch = 0)
ex = x[hasrowd, ]
stopifnot(nrow(exprs(ex)) == length(rowd))
ed = SimpleList(initExptData = experimentData(ex))
SummarizedExperiment(assays = SimpleList(exprs = exprs(ex)),
rowData = rowd, colData = DataFrame(pData(ex)), exptData = ed)
}
genemodel = function (sym, genome = "hg19", force = FALSE)
{
stopifnot(genome == "hg19")
if (!exists("txdb") || force)
txlib()
require(org.Hs.eg.db)
num = get(sym, revmap(org.Hs.egSYMBOL))
exonsBy(txdb, by = "gene")[[num]]
}
txlib = function ()
{
library(TxDb.Hsapiens.UCSC.hg19.knownGene)
txdb <<- TxDb.Hsapiens.UCSC.hg19.knownGene
message("txdb assigned")
invisible(NULL)
}
rangeFromToksViaEntrez = function (toks, genome = "hg19", force = FALSE,
takeRange=TRUE, orgDb=Homo.sapiens, typesToTry=c("SYMBOL", "GOID", "PATH",
"TERM"), routeSink=TRUE)
{
#
# convert tokens to GRanges after translation to Entrez ids
# takeRange if true reduces answer to 'range' of exons
# open question whether we also want to cache specific models
#
require(stashR)
stopifnot(genome == "hg19")
#
# cache exonsBy of TxDb
#
require(deparse(substitute(orgDb)), character.only=TRUE)
modcon = new("localDB", dir="~/.SEresolvers", name="rowResolvers")
if (!dbExists(modcon, "hg19.exbygene")) {
message("one time cache creation for TxDb...")
txlib() # special function to create txdb in .GlobalEnv
dbInsert( modcon, "hg19.exbygene", exonsBy(txdb, by="gene") )
}
allex = dbFetch(modcon, "hg19.exbygene")
tf = file(tempfile(), open="wt")
if (routeSink) message("resolving row selection tokens... (messages diverted)")
if (routeSink) sink(tf, type="message")
for (curty in typesToTry) {
num = try(select(orgDb, keytype=curty,
keys=toks, columns="ENTREZID")[,"ENTREZID"])
if (!inherits(num, "try-error")) break
}
if (routeSink) sink(NULL, type="message")
if (routeSink) message("messages restored")
if (inherits(num, "try-error")) stop("can't resolve toks")
inds = match(num, names(allex))
filt = force
if (takeRange) filt = range
ans = do.call(c, lapply(1:length(inds), function(x) try({
tmp = filt(allex[[inds[x]]])
tmp$ENTREZID=num[x]
tmp})))
syms = select(Homo.sapiens, keys=ans$ENTREZID, keytype="ENTREZID",
columns="SYMBOL")
if (length(unique(syms[,1]))==nrow(syms)) ans$SYMBOL = syms[,2]
else {
ssyms = split(syms[,2], syms[,1])[ syms[,1] ] # retain order
s1 = sapply(ssyms, "[", 1)
ans$SYMBOL = s1
}
ans
}
hasRowResolver = function(x) !is.null(exptData(x)$rowResolverHook)
getRowResolverTypes = function(x) exptData(x)$rowResolverHook()
tryRowResolution = function(x,i) {
ress = getRowResolverTypes(x)
rngs = rangeFromToksViaEntrez(toks=i, typesToTry=ress)
ov = findOverlaps( rowData(x), rngs )
queryHits(ov)
}
# based on devel 1.17.26 GRanges 1.17.26
setMethod("[", c("SummarizedExperiment",
"ANY", "ANY", "ANY"), function (x, i, j, ..., drop = TRUE)
{
clone = GenomicRanges:::clone
.SummarizedExperiment.charbound = GenomicRanges:::.SummarizedExperiment.charbound
.SummarizedExperiment.assays.subset = GenomicRanges:::.SummarizedExperiment.assays.subset
if (1L != length(drop) || (!missing(drop) && drop))
warning("'drop' ignored '[,", class(x), ",ANY,ANY-method'")
if (missing(i) && missing(j))
return(x)
if (!missing(i) && is.character(i)) {
if (hasRowResolver(x)) i = tryRowResolution(x, i) # will be vector of integers
else {
fmt <- paste0("<", class(x), ">[i,] index out of bounds: %s")
i <- .SummarizedExperiment.charbound(i, rownames(x),
fmt)
}
}
if (!missing(j) && is.character(j)) {
fmt <- paste0("<", class(x), ">[,j] index out of bounds: %s")
j <- .SummarizedExperiment.charbound(j, colnames(x),
fmt)
}
if (!missing(i) && !missing(j)) {
ii <- as.vector(i)
jj <- as.vector(j)
x <- clone(x, ..., rowData = rowData(x)[i], colData = colData(x)[j,
, drop = FALSE], assays = .SummarizedExperiment.assays.subset(x,
ii, jj))
}
else if (missing(i)) {
jj <- as.vector(j)
x <- clone(x, ..., colData = colData(x)[j, , drop = FALSE],
assays = .SummarizedExperiment.assays.subset(x, j = jj))
}
else {
ii <- as.vector(i)
x <- clone(x, ..., rowData = rowData(x)[i], assays = .SummarizedExperiment.assays.subset(x,
ii))
}
x
})
library(knitcitations)
library(bibtex)
allbib = read.bibtex("/data/eQTL2014/eq.bib")
})
```
Basic data from RECOUNT
We will use transcript profiles obtained through RNA-seq applied
to HapMap cell lines (`r citet(allbib[["Montgomery2010"]])`). The reported
gene-level count data have been rendered as an ExpressionSet (montpick\_eset.RData) in the RECOUNT project
(`r citet(allbib[["Frazee2011"]])`) using the ENSEMBL nomenclature
for genes. These counts were filtered to
exclude genes with uniform zero value over all samples, subjected to
rlog transformation (DESeq2), and then filtered to include only
those with MAD at the 40th percentile of the overall distribution of genewise
MADs.
Characteristic code for the transformation to SummarizedExperiment is given here.
You do not need to execute this as the transformed data are available for loading
as noted below. (exs2se() is a bit of custom code available in the source
of this vignette.)
```{r trans1,eval=FALSE}
library(Biobase)
load("/data/eQTL2014/montpick_eset.RData")
mp = montpick.eset
annotation(mp) = "org.Hs.eg.db"
tmp = select(org.Hs.eg.db, keytype="ENSEMBL", columns="ENTREZID",
keys=featureNames(mp))
ezs = split(tmp[,2], tmp[,1])
ezs = sapply(ezs, "[", 1)
drop = which(duplicated(ezs) | is.na(ezs))
if (length(drop)>0) ezs = ezs[-drop]
mp = mp[names(ezs)]
featureNames(mp) = as.character(ezs[featureNames(mp)])
mpSE = exs2se(mp, annoDb = org.Hs.eg.db, probekeytype="ENTREZID")
```
Transformed expression data in SummarizedExperiment
We obtain the basic image of transformed count data as follows.
```{r getd}
load("/data/eQTL2014/rmpDEPF.rda") # rlogged Montgomery Pickrell DESeqDataSet, some positive, filtered
rmpDEPF
```
Genotype data in an Amazon S3 bucket
All the data generated in the 1000 genomes project is available
in a S3 (simple storage service) container. We have a
URL that can be used to access the genotypes from
chr22 in VCF format.
```{r lkgset, warning=FALSE, message=FALSE}
load("/data/eQTL2014/s3url.rda")
s3url
library(VariantAnnotation)
```{r lkg, warning=FALSE, cache=TRUE, message=FALSE}
head = scanVcfHeader(s3url)
str(head, max.level=4)
```
A naive analysis
The GGtools package includes a function, cisAssoc(), that
can test additive genetic associations in cis between SNP genotypes
housed in tabix-indexed VCF and expression data housed in a
SummarizedExperiment. The test is the score test for a generalized
linear model fit to independent
samples with a continuous response, with response variance approximately
independent of response mean. These assumptions are generally not met
for count data from RNA-seq experiments, but they may hold reasonably
approximately after
the rlog transformation. We will check on the severity
of the violations
after performing some tests.
Here is code that computes, for 200 genes on
chr17, association test statistics
and versions thereof under permutation of expression against
genotype. Do not perform these calculations, they will take too
long for the tutorial. The answer is provided in the
tutorial data, see below.
```{r getli}
library(GGtools)
```{r doit,eval=FALSE}
library(foreach)
library(doParallel)
registerDoSEQ()
r17 = rmpDEPF[ which(seqnames(rmpDEPF)=="chr17"),]
set.seed(1234)
s3urlb = sub("22", "17", s3url)
c17 = cisAssoc(r17, TabixFile(s3urlb), cisradius=100000, lbmaf=.025)
c17d = addmindist(c17)
```
```{r dod,echo=FALSE}
load("/data/eQTL2014/c17d.rda")
```
Here is an extract from the output. We have manually
added a column 'mindist' with the number of nucleotides
from SNP to nearest end of gene body. 'mindist' is zero
for SNP lying within the gene body.
```{r lkres}
c17d[1:3,1:5]
names(mcols(c17d))
```
Computing the plug-in FDR
The following algorithm is from The
Elements of Statistical Learning by
Hastie, Tibshirani and Friedman (Springer 2001).
We can obtain estimated FDR for the gene-SNP pairs
tested in the example above, as follows, using the
pifdr() function in GGtools.
```{r dofd}
fdr = pifdr( c17d$chisq, c(c17d$permScore_1,
c17d$permScore_2, c17d$permScore_3) )
sum(fdr == 0)
sum(fdr <= .05)
```
Sensitivity analysis, and "what to count?"
The following computations generate a sensitivity
analysis display, confronting the enumeration of
significantly associated SNP-gene pairs.
```{r dosens1}
library(data.table)
library(foreach)
registerDoSEQ()
dt1 = data.table(as.data.frame(mcols(c17d)))
```{r runsen,cache=TRUE}
s1 = eqsens_dt( dt1 )
```
```{r dopl,fig=TRUE}
plotsens(s1, ylab="Count of significant SNP-gene pairs at given FDR")
```
If instead of counting SNP-gene pairs, we wish to
count genes that show evidence of association with
some genotype, the plug-in FDR procedure can be
used with maximal association scores per gene, for
observed and permuted tests.
```{r rnsen2,cache=TRUE}
s2 = eqsens_dt( dt1, by="probes" )
```
```{r pl2,fig=TRUE}
plotsens(s2, ylab="Count of genes with evidence of eQTL at given FDR")
```
There is an indication here that we get a larger yield if
we filter more sharply on MAF and distance than we did in the
initial run.
Exercise 1.
How can we compute the FDR for the gene-wise hypotheses
H0g: Mean expression of gene g is independent of SNP
content for all SNP cis to g, under the restrictions
cis radius less than 50kb and MAF greater than 5 percent?
Visualization
The following code creates a visualization of genotype-expression
association for a selected
SNP-gene pair.
```{r doonePlot, fig=TRUE}
plotOne = function (summex, vcf.tf, whicha, whichv, genome="hg19", ... ) {
sampidsInSumm = colnames(summex)
sampidsInVCF = sampsInVCF(vcf.tf)
oksamp = intersect(sampidsInSumm, sampidsInVCF)
stopifnot(length(oksamp) > 0)
summex = summex[, oksamp]
vp = ScanVcfParam(fixed = "ALT", info = NA, geno = "GT",
samples = oksamp, which = whichv)
vdata = readVcf(vcf.tf, genome = genome, param = vp)
rdd = rowData(vdata)
vdata = GGtools:::snvsOnly(vdata)
gtdata = factor(geno(vdata)[[1]])
plot( as.numeric(assays(summex)[[1]][whicha,]) ~ gtdata , ...)
}
r17 = rmpDEPF[ which(as.character(seqnames(rmpDEPF))=="chr17"),]
s3urlb = sub("22", "17", s3url)
plotOne(r17, TabixFile(s3urlb), 1, c17d[1] )
```
Exercise 2.
What are the names of the SNP
and gene depicted here? Replot, annotating with this
information (improve the x and y labels for maximal
clarity of interpretation).
Exercise 3.
What is the FDR for the association shown in the display?
We can show four relationships at once fairly simply.
```{r dofff,fig=TRUE}
opar = par(no.readonly=TRUE)
par(mfrow=c(2,2))
for (i in 2:5)
plotOne(r17, TabixFile(s3urlb), i, c17d[i] )
par(opar)
```
Exercise 4.
Depict the associations for four gene-SNP pairs
with very small FDR. The four genes should be different.
Hint:
```{r lktops,eval=FALSE}
order(fdr)[1:10]
c17d[7659]
args(plotOne)
plotOne(r17, TabixFile(s3urlb), "114881", c17d[7659] )
```
Exercise 5: Master regulators.
Find SNP whose FDR for association with multiple distinct genes is
found to be less than one percent. Visualize the relationships.
Connection with "GWAS hits"
We can acquire an image of the current NHGRI GWAS catalog
as follows.
```{r lkgw, message=FALSE, warning=FALSE, eval=FALSE}
library(gwascat)
curgw = makeCurrentGwascat()
curgw
```
HOWEVER -- as of 28 July 2014, the coordinates used for the
current catalog are GRCh38. We used rtracklayer's liftOver
with the appropriate chain file to obtain an hg19 addressing scheme,
losing a small number of nonmappable loci.
```{r getgw}
load("/data/eQTL2014/curgwHG19.rda")
curgwHG19[1:3,1:5]
```
Exercise 6
How frequent are exact coincidences between SNP exhibiting
eQTL association at FDR five percent or
less, and GWAS hits? Compare this frequency as computed
for SNP exhibiting association at FDR 95 percent or greater.
Further exercises
Exercise 7: Population of origin.
We have disregarded the population of origin in
this analysis. Use visualization to assess whether the
association between gene OSBPL7 and SNP rs17774008
has similar forms in subjects from CEU and YRI populations.
You can use s3url to get access to genotype data for chr22.
It takes 5 minutes or so to run cisAssoc on chr22 genes and SNP.
Obtain analyses for CEU and YRI groups separately. What
is the concordance on significance of SNP-gene pairs at FDR five percent or so?
Exercise 8: Efficient analysis with counted expression values.
/data/eQTL2014/montpick_eset.RData has the raw counts.
(Note, probe identifiers are in ENSEMBL vocabulary.) Use DESeq2 or edgeR to
obtain an FDR for a selected SNP-gene pair. Write a function
that does this for any given selection. In principle the mean-variance
relationship needs to be modeled separately for each gene-SNP pair,
so obtaining a statistically optimal solution for a
general search may be laborious. It would be nice to understand
the costs of using rlog with blind and fast set to TRUE
before doing a Gaussian-like analysis of eQTL, as opposed to
pair-specific optimally modeled negative binomial testing for eQTL.
Bibliography
```{r results='asis',echo=FALSE}
bibliography() #style="markdown")
```