## ----vignetteSetup, echo=FALSE, message=FALSE, warning = FALSE-------------
## Track time spent on making the vignette
startTime <- Sys.time()

## Bib setup
library('knitcitations')

## Load knitcitations with a clean bibliography
cleanbib()
cite_options(hyperlink = 'to.doc', citation_format = 'text', style = 'html')
# Note links won't show for now due to the following issue
# https://github.com/cboettig/knitcitations/issues/63

## Write bibliography information
bib <- c(
    R = citation(),
    AnnotationDbi = RefManageR::BibEntry(bibtype = 'manual',
        key = 'AnnotationDbi',
        author = 'Hervé Pagès and Marc Carlson and Seth Falcon and Nianhua Li',
        title = 'AnnotationDbi: Annotation Database Interface',
        year = 2017, doi = '10.18129/B9.bioc.AnnotationDbi'),
    BiocParallel = citation('BiocParallel'),
    BiocStyle = citation('BiocStyle'),
    derfinder = citation('derfinder')[1], 
    DESeq2 = citation('DESeq2'),
    devtools = citation('devtools'),
    downloader = citation('downloader'),
    EnsDb.Hsapiens.v79 = citation('EnsDb.Hsapiens.v79'),
    GEOquery = citation('GEOquery'),
    GenomeInfoDb = RefManageR::BibEntry(bibtype = 'manual',
        key = 'GenomeInfoDb',
        author = 'Sonali Arora and Martin Morgan and Marc Carlson and H. Pagès',
        title = "GenomeInfoDb: Utilities for manipulating chromosome and other 'seqname' identifiers",
        year = 2017, doi = '10.18129/B9.bioc.GenomeInfoDb'),
    GenomicFeatures = citation('GenomicFeatures'),
    GenomicRanges = citation('GenomicRanges'),
    IRanges = citation('IRanges'),
    knitcitations = citation('knitcitations'),
    knitr = citation('knitr')[3],
    org.Hs.eg.db = citation('org.Hs.eg.db'),
    RCurl = citation('RCurl'),
	recount = citation('recount')[1],
    recountworkflow = citation('recount')[2],
    phenopredict = citation('recount')[3],
    regionReport = citation('regionReport'),
    rentrez = citation('rentrez'),
    rmarkdown = citation('rmarkdown'),
    rtracklayer = citation('rtracklayer'),
    S4Vectors = RefManageR::BibEntry(bibtype = 'manual', key = 'S4Vectors',
        author = 'Hervé Pagès and Michael Lawrence and Patrick Aboyoun',
        title = "S4Vectors: S4 implementation of vector-like and list-like objects",
        year = 2017, doi = '10.18129/B9.bioc.S4Vectors'),
    SummarizedExperiment = RefManageR::BibEntry(bibtype = 'manual',
        key = 'SummarizedExperiment',
        author = 'Martin Morgan and Valerie Obenchain and Jim Hester and Hervé Pagès',
        title = 'SummarizedExperiment: SummarizedExperiment container',
        year = 2017, doi = '10.18129/B9.bioc.SummarizedExperiment'),
    testthat = citation('testthat')
)

write.bibtex(bib, file = 'quickstartRef.bib')

## ----'installDer', eval = FALSE--------------------------------------------
#  ## try http:// if https:// URLs are not supported
#  source("https://bioconductor.org/biocLite.R")
#  biocLite("recount")
#  
#  ## Check that you have a valid Bioconductor installation
#  biocValid()

## ----'citation'------------------------------------------------------------
## Citation info
citation('recount')

## ----'ultraQuick', eval = FALSE--------------------------------------------
#  ## Load library
#  library('recount')
#  
#  ## Find a project of interest
#  project_info <- abstract_search('GSE32465')
#  
#  ## Download the gene-level RangedSummarizedExperiment data
#  download_study(project_info$project)
#  
#  ## Load the data
#  load(file.path(project_info$project, 'rse_gene.Rdata'))
#  
#  ## Browse the project at SRA
#  browse_study(project_info$project)
#  
#  ## View GEO ids
#  colData(rse_gene)$geo_accession
#  
#  ## Extract the sample characteristics
#  geochar <- lapply(split(colData(rse_gene), seq_len(nrow(colData(rse_gene)))), geo_characteristics)
#  
#  ## Note that the information for this study is a little inconsistent, so we
#  ## have to fix it.
#  geochar <- do.call(rbind, lapply(geochar, function(x) {
#      if('cells' %in% colnames(x)) {
#          colnames(x)[colnames(x) == 'cells'] <- 'cell.line'
#          return(x)
#      } else {
#          return(x)
#      }
#  }))
#  
#  ## We can now define some sample information to use
#  sample_info <- data.frame(
#      run = colData(rse_gene)$run,
#      group = ifelse(grepl('uninduced', colData(rse_gene)$title), 'uninduced', 'induced'),
#      gene_target = sapply(colData(rse_gene)$title, function(x) { strsplit(strsplit(x,
#          'targeting ')[[1]][2], ' gene')[[1]][1] }),
#      cell.line = geochar$cell.line
#  )
#  
#  ## Scale counts by taking into account the total coverage per sample
#  rse <- scale_counts(rse_gene)
#  
#  ## Add sample information for DE analysis
#  colData(rse)$group <- sample_info$group
#  colData(rse)$gene_target <- sample_info$gene_target
#  
#  ## Perform differential expression analysis with DESeq2
#  library('DESeq2')
#  
#  ## Specify design and switch to DESeq2 format
#  dds <- DESeqDataSet(rse, ~ gene_target + group)
#  
#  ## Perform DE analysis
#  dds <- DESeq(dds, test = 'LRT', reduced = ~ gene_target, fitType = 'local')
#  res <- results(dds)
#  
#  ## Explore results
#  plotMA(res, main="DESeq2 results for SRP009615")
#  
#  ## Make a report with the results
#  library('regionReport')
#  DESeq2Report(dds, res = res, project = 'SRP009615',
#      intgroup = c('group', 'gene_target'), outdir = '.',
#      output = 'SRP009615-results')

## ----'er_analysis', eval = FALSE-------------------------------------------
#  ## Define expressed regions for study SRP009615, only for chromosome Y
#  regions <- expressed_regions('SRP009615', 'chrY', cutoff = 5L,
#      maxClusterGap = 3000L)
#  
#  ## Compute coverage matrix for study SRP009615, only for chromosome Y
#  system.time( rse_ER <- coverage_matrix('SRP009615', 'chrY', regions) )
#  
#  ## Round the coverage matrix to integers
#  covMat <- round(assays(rse_ER)$counts, 0)
#  
#  ## Get phenotype data for study SRP009615
#  pheno <- colData(rse_ER)
#  
#  ## Complete the phenotype table with the data we got from GEO
#  m <- match(pheno$run, sample_info$run)
#  pheno <- cbind(pheno, sample_info[m, 2:3])
#  
#  ## Build a DESeqDataSet
#  dds_ers <- DESeqDataSetFromMatrix(countData = covMat, colData = pheno,
#      design =  ~ gene_target + group)
#  
#  ## Perform differential expression analysis with DESeq2 at the ER-level
#  dds_ers <- DESeq(dds_ers, test = 'LRT', reduced = ~ gene_target,
#      fitType = 'local')
#  res_ers <- results(dds_ers)
#  
#  ## Explore results
#  plotMA(res_ers, main="DESeq2 results for SRP009615 (ER-level, chrY)")
#  
#  ## Create a more extensive exploratory report
#  DESeq2Report(dds_ers, res = res_ers,
#      project = 'SRP009615 (ER-level, chrY)',
#      intgroup = c('group', 'gene_target'), outdir = '.',
#      output = 'SRP009615-results-ER-level-chrY')

## ----'install', eval = FALSE-----------------------------------------------
#  ## try http:// if https:// URLs are not supported
#  source("https://bioconductor.org/biocLite.R")
#  biocLite("recount")

## ----'start', message=FALSE------------------------------------------------
## Load recount R package
library('recount')

## ----'search_abstract'-----------------------------------------------------
## Find a project of interest
project_info <- abstract_search('GSE32465')

## Explore info
project_info

## ----'download'------------------------------------------------------------
## Download the gene-level RangedSummarizedExperiment data
download_study(project_info$project)

## Load the data
load(file.path(project_info$project, 'rse_gene.Rdata'))

## ----'explore_rse'---------------------------------------------------------
rse_gene

## This is the sample phenotype data provided by the recount project
colData(rse_gene)

## At the gene level, the row data includes the gene Gencode ids, the gene
## symbols and the sum of the disjoint exons widths, which can be used for 
## taking into account the gene length.
rowData(rse_gene)

## At the exon level, you can get the gene Gencode ids from the names of:
# rowRanges(rse_exon)

## ----'browse'--------------------------------------------------------------
## Browse the project at SRA
browse_study(project_info$project)

## ----'sample_info', warning = FALSE----------------------------------------
## View GEO ids
colData(rse_gene)$geo_accession

## Extract the sample characteristics
geochar <- lapply(split(colData(rse_gene), seq_len(nrow(colData(rse_gene)))), geo_characteristics)

## Note that the information for this study is a little inconsistent, so we
## have to fix it.
geochar <- do.call(rbind, lapply(geochar, function(x) {
    if('cells' %in% colnames(x)) {
        colnames(x)[colnames(x) == 'cells'] <- 'cell.line'
        return(x)
    } else {
        return(x)
    }
}))

## We can now define some sample information to use
sample_info <- data.frame(
    run = colData(rse_gene)$run,
    group = ifelse(grepl('uninduced', colData(rse_gene)$title), 'uninduced', 'induced'),
    gene_target = sapply(colData(rse_gene)$title, function(x) { strsplit(strsplit(x,
        'targeting ')[[1]][2], ' gene')[[1]][1] }),
    cell.line = geochar$cell.line
)

## ----'scale_counts'--------------------------------------------------------
## Scale counts by taking into account the total coverage per sample
rse <- scale_counts(rse_gene)

##### Details about counts #####

## Scale counts to 40 million mapped reads. Not all mapped reads are in exonic
## sequence, so the total is not necessarily 40 million.
colSums(assays(rse)$counts) / 1e6

## Compute read counts
rse_read_counts <- read_counts(rse_gene)
## Difference between read counts and number of reads downloaded by Rail-RNA
colSums(assays(rse_read_counts)$counts) / 1e6 -
    colData(rse_read_counts)$reads_downloaded / 1e6
## Check the help page for read_counts() for more details

## ----'add_sample_info'-----------------------------------------------------
## Add sample information for DE analysis
colData(rse)$group <- sample_info$group
colData(rse)$gene_target <- sample_info$gene_target

## ----'de_analysis'---------------------------------------------------------
## Perform differential expression analysis with DESeq2
library('DESeq2')

## Specify design and switch to DESeq2 format
dds <- DESeqDataSet(rse, ~ gene_target + group)

## Perform DE analysis
dds <- DESeq(dds, test = 'LRT', reduced = ~ gene_target, fitType = 'local')
res <- results(dds)

## ----'maplot', fig.cap = "MA plot for group differences adjusted by gene target for SRP009615 using gene-level data."----
## Explore results
plotMA(res, main="DESeq2 results for SRP009615")

## ----'make_report', eval = FALSE-------------------------------------------
#  ## Make a report with the results
#  library('regionReport')
#  report <- DESeq2Report(dds, res = res, project = 'SRP009615',
#      intgroup = c('group', 'gene_target'), outdir = '.',
#      output = 'SRP009615-results', nBest = 10, nBestFeatures = 2)

## ----'make_report_real', echo = FALSE, results = 'hide'--------------------
library('regionReport')

## Make it so that the report will be available as a vignette
original <- readLines(system.file('DESeq2Exploration', 'DESeq2Exploration.Rmd',
    package = 'regionReport'))
vignetteInfo <- c(
    'vignette: >',
    '  %\\VignetteEngine{knitr::rmarkdown}',
    '  %\\VignetteIndexEntry{Basic DESeq2 results exploration}',
    '  %\\VignetteEncoding{UTF-8}'
)
new <- c(original[1:16], vignetteInfo, original[17:length(original)])
writeLines(new, 'SRP009615-results-template.Rmd')

## Now create the report
report <- DESeq2Report(dds, res = res, project = 'SRP009615',
    intgroup = c('group', 'gene_target'), outdir = '.',
    output = 'SRP009615-results', device = 'png', template = 'SRP009615-results-template.Rmd', nBest = 10, nBestFeatures = 2)
    
## Clean up
file.remove('SRP009615-results-template.Rmd')

## ----'geneSymbols'---------------------------------------------------------
## Load required library
library('org.Hs.eg.db')

## Extract Gencode gene ids
gencode <- gsub('\\..*', '', names(recount_genes))

## Find the gene information we are interested in
gene_info <- select(org.Hs.eg.db, gencode, c('ENTREZID', 'GENENAME', 'SYMBOL',
    'ENSEMBL'), 'ENSEMBL')

## Explore part of the results
dim(gene_info)
head(gene_info)

## ----'define_ers', eval = .Platform$OS.type != 'windows'-------------------
## Define expressed regions for study SRP009615, only for chromosome Y
regions <- expressed_regions('SRP009615', 'chrY', cutoff = 5L, 
    maxClusterGap = 3000L)

## Briefly explore the resulting regions
regions

## ----'compute_covMat', eval = .Platform$OS.type != 'windows'---------------
## Compute coverage matrix for study SRP009615, only for chromosome Y
system.time( rse_ER <- coverage_matrix('SRP009615', 'chrY', regions) )

## Explore the RSE a bit
dim(rse_ER)
rse_ER

## ----'to_integer', eval = .Platform$OS.type != 'windows'-------------------
## Round the coverage matrix to integers
covMat <- round(assays(rse_ER)$counts, 0)

## ----'phenoData', eval = .Platform$OS.type != 'windows'--------------------
## Get phenotype data for study SRP009615
pheno <- colData(rse_ER)

## Complete the phenotype table with the data we got from GEO
m <- match(pheno$run, sample_info$run)
pheno <- cbind(pheno, sample_info[m, 2:3])

## Explore the phenotype data a little bit
head(pheno)

## ----'ers_dds', eval = .Platform$OS.type != 'windows'----------------------
## Build a DESeqDataSet
dds_ers <- DESeqDataSetFromMatrix(countData = covMat, colData = pheno,
    design =  ~ gene_target + group)

## ----'de_analysis_ers', eval = .Platform$OS.type != 'windows'--------------
## Perform differential expression analysis with DESeq2 at the ER-level
dds_ers <- DESeq(dds_ers, test = 'LRT', reduced = ~ gene_target,
    fitType = 'local')
res_ers <- results(dds_ers)

## ----'maploters', eval = .Platform$OS.type != 'windows', fig.cap = "MA plot for group differences adjusted by gene target for SRP009615 using ER-level data (just chrY)."----
## Explore results
plotMA(res_ers, main="DESeq2 results for SRP009615 (ER-level, chrY)")

## ----'report2', eval = FALSE-----------------------------------------------
#  ## Create the report
#  report2 <- DESeq2Report(dds_ers, res = res_ers,
#      project = 'SRP009615 (ER-level, chrY)',
#      intgroup = c('group', 'gene_target'), outdir = '.',
#      output = 'SRP009615-results-ER-level-chrY')

## ---- 'gene_annotation'----------------------------------------------------
## Gene annotation information
rowRanges(rse_gene_SRP009615)

## Also accessible via
rowData(rse_gene_SRP009615)

## ---- 'exon_annotation'----------------------------------------------------
## Get the rse_exon object for study SRP009615
download_study('SRP009615', type = 'rse-exon')
load(file.path('SRP009615', 'rse_exon.Rdata'))

## Annotation information
rowRanges(rse_exon)

## Add a gene_id column
rowRanges(rse_exon)$gene_id <- rownames(rse_exon)

## Example subset
rse_exon_subset <- subset(rse_exon, subset = gene_id == 'ENSG00000000003.14')
rowRanges(rse_exon_subset)

## ---- eval = .Platform$OS.type != 'windows'--------------------------------
## Get the disjoint exons based on EnsDb.Hsapiens.v79 which matches hg38
exons <- reproduce_ranges('exon', db = 'EnsDb.Hsapiens.v79')

## Change the chromosome names to match those used in the BigWig files
library('GenomeInfoDb')
seqlevelsStyle(exons) <- 'UCSC'

## Get the count matrix at the exon level for chrY
exons_chrY <- keepSeqlevels(exons, 'chrY', pruning.mode = 'coarse')
exonRSE <- coverage_matrix('SRP009615', 'chrY', unlist(exons_chrY),
    chunksize = 3000)
exonMatrix <- assays(exonRSE)$counts
dim(exonMatrix)
head(exonMatrix)

## Summary the information at the gene level for chrY
exon_gene <- rep(names(exons_chrY), elementNROWS(exons_chrY))
geneMatrix <- do.call(rbind, lapply(split(as.data.frame(exonMatrix),
    exon_gene), colSums))
dim(geneMatrix)
head(geneMatrix)

## ----'gene_fusions'--------------------------------------------------------
library('recount')

## Download and load RSE object for SRP009615 study
download_study('SRP009615', type = 'rse-jx')
load(file.path('SRP009615', 'rse_jx.Rdata'))

## Exon-exon junctions by class
table(rowRanges(rse_jx)$class)

## Potential gene fusions by chromosome
fusions <- rowRanges(rse_jx)[rowRanges(rse_jx)$class == 'fusion']
fusions_by_chr <- sort(table(seqnames(fusions)), decreasing = TRUE)
fusions_by_chr[fusions_by_chr > 0]

## Genes with the most fusions
head(sort(table(unlist(fusions$symbol_proposed)), decreasing = TRUE))

## ----snaptron--------------------------------------------------------------
library('GenomicRanges')
junctions <- GRanges(seqnames = 'chr2', IRanges(
    start = c(28971711, 29555082, 29754983),
    end = c(29462418, 29923339, 29917715)))

snaptron_query(junctions)

## ----'downloadAll'---------------------------------------------------------
## Get data.frame with all the URLs
library('recount')
dim(recount_url)

## Explore URLs
head(recount_url$url)

## ----'actuallyDownload', eval = FALSE--------------------------------------
#  ## Download all files (will take a while! It's over 8 TB of data)
#  sapply(unique(recount_url$project), download_study, type = 'all')

## ----Figure1, out.width="100%", fig.align="center", fig.cap = "SIgn up to SciServer", echo = FALSE----
knitr::include_graphics("https://raw.githubusercontent.com/leekgroup/recount-website/master/sciserver_demo/register.png")

## ----Figure2, out.width="100%", fig.align="center", fig.cap = "Click on SciServer Compute,", echo = FALSE----
knitr::include_graphics("https://raw.githubusercontent.com/leekgroup/recount-website/master/sciserver_demo/sciserver-compute.png")

## ----Figure3, out.width="100%", fig.align="center", fig.cap = "Select the appropriate image and load the recount public volume.", echo = FALSE----
knitr::include_graphics("https://raw.githubusercontent.com/leekgroup/recount-website/master/sciserver_demo/new-container.png")

## ----Figure4, out.width="100%", fig.align="center", fig.cap = "Create a R notebook.", echo = FALSE----
knitr::include_graphics("https://raw.githubusercontent.com/leekgroup/recount-website/master/sciserver_demo/new-R.png")

## ----Figure5, out.width="100%", fig.align="center", fig.cap = "Install recount and DESeq2 on SciServer.", echo = FALSE----
knitr::include_graphics("https://raw.githubusercontent.com/leekgroup/recount-website/master/sciserver_demo/demo1.png")

## ----Figure6, out.width="100%", fig.align="center", fig.cap = "recount files are available locally so remmeber to use the local data when working on SciServer.", echo = FALSE----
knitr::include_graphics("https://raw.githubusercontent.com/leekgroup/recount-website/master/sciserver_demo/demo2.png")

## ----'ER-sciserver', eval = FALSE------------------------------------------
#  ## Expressed-regions SciServer example
#  regions <- expressed_regions('SRP009615', 'chrY', cutoff = 5L,
#      maxClusterGap = 3000L, outdir = file.path(scipath, 'SRP009615'))
#  regions

## ----createVignette, eval=FALSE--------------------------------------------
#  ## Create the vignette
#  library('rmarkdown')
#  system.time(render('recount-quickstart.Rmd', 'BiocStyle::html_document'))
#  
#  ## Extract the R code
#  library('knitr')
#  knit('recount-quickstart.Rmd', tangle = TRUE)

## ----createVignette2-------------------------------------------------------
## Clean up
file.remove('quickstartRef.bib')

## ----reproduce1, echo=FALSE------------------------------------------------
## Date the vignette was generated
Sys.time()

## ----reproduce2, echo=FALSE------------------------------------------------
## Processing time in seconds
totalTime <- diff(c(startTime, Sys.time()))
round(totalTime, digits=3)

## ----reproduce3, echo=FALSE-------------------------------------------------------------------------------------------
## Session info
library('devtools')
options(width = 120)
session_info()

## ----'datasetup', echo = FALSE, eval = FALSE--------------------------------------------------------------------------
#  ## Code for re-creating the data distributed in this package
#  
#  ## Genes/exons
#  library('recount')
#  recount_both <- reproduce_ranges('both', db = 'Gencode.v25')
#  recount_exons <- recount_both$exon
#  save(recount_exons, file = '../data/recount_exons.RData', compress = 'xz')
#  recount_genes <- recount_both$gene
#  save(recount_genes, file = '../data/recount_genes.RData', compress = 'xz')
#  
#  ## URL table
#  load('../../recount-website/fileinfo/upload_table.Rdata')
#  recount_url <- upload_table
#  ## Fake urls for now
#  is.bw <- grepl('[.]bw$', recount_url$file_name)
#  recount_url$url <- NA
#  recount_url$url[!is.bw] <- paste0('http://duffel.rail.bio/recount/',
#      recount_url$project[!is.bw], '/', recount_url$file_name[!is.bw])
#  recount_url$url[is.bw] <- paste0('http://duffel.rail.bio/recount/',
#      recount_url$project[is.bw], '/bw/', recount_url$file_name[is.bw])
#  save(recount_url, file = '../data/recount_url.RData', compress = 'xz')
#  
#  ## Abstract info
#  load('../../recount-website/website/meta_web.Rdata')
#  recount_abstract <- meta_web[, c('number_samples', 'species', 'abstract')]
#  recount_abstract$project <- gsub('.*">|</a>', '', meta_web$accession)
#  Encoding(recount_abstract$abstract) <- 'latin1'
#  recount_abstract$abstract <- iconv(recount_abstract$abstract, 'latin1', 'UTF-8')
#  save(recount_abstract, file = '../data/recount_abstract.RData',
#      compress = 'bzip2')
#  
#  ## Example rse_gene file
#  system('scp e:/dcl01/leek/data/recount-website/rse/rse_sra/SRP009615/rse_gene.Rdata .')
#  load('rse_gene.Rdata')
#  rse_gene_SRP009615 <- rse_gene
#  save(rse_gene_SRP009615, file = '../data/rse_gene_SRP009615.RData',
#      compress = 'xz')
#  unlink('rse_gene.Rdata')

## ----'cleanup', echo = FALSE, results = 'hide'------------------------------------------------------------------------
## Clean up
unlink('SRP009615', recursive = TRUE)

## ----vignetteBiblio, results = 'asis', echo = FALSE, warning = FALSE, message = FALSE---------------------------------
## Print bibliography
bibliography()