/",
urlfile = "./results/IGVurl.txt")
},
step_name = "bam_IGV",
dependency = "hisat_mapping",
run_step = "optional"
)
```
### Read counting for mRNA profiling experiments
Reads overlapping with annotation ranges of interest are counted for each
sample using the `summarizeOverlaps` function [@Lawrence2013-kt].
First, the gene annotation ranges from a GFF file are stored in a `TxDb` container for
efficient work with genomic features.
```{r create_txdb, message=FALSE, eval=FALSE, spr=TRUE}
appendStep(sal) <- LineWise(code = {
library(txdbmaker)
txdb <- makeTxDbFromGFF(file="data/tair10.gff", format="gff",
dataSource="TAIR", organism="Arabidopsis thaliana")
saveDb(txdb, file="./data/tair10.sqlite")
},
step_name = "create_txdb",
dependency = "hisat_mapping")
```
Next, The read counting is preformed for exonic gene regions in a
non-strand-specific manner while ignoring overlaps among different genes.
```{r read_counting, message=FALSE, eval=FALSE, spr=TRUE}
appendStep(sal) <- LineWise({
library(BiocParallel)
txdb <- loadDb("./data/tair10.sqlite")
eByg <- exonsBy(txdb, by="gene")
outpaths <- getColumn(sal, step = "hisat_mapping", 'outfiles', column = 2)
bfl <- BamFileList(outpaths, yieldSize=50000, index=character())
multicoreParam <- MulticoreParam(workers=4); register(multicoreParam); registered()
counteByg <- bplapply(bfl, function(x) summarizeOverlaps(eByg, x, mode="Union",
ignore.strand=TRUE,
inter.feature=TRUE,
singleEnd=TRUE))
# Note: for strand-specific RNA-Seq set 'ignore.strand=FALSE' and for PE data set 'singleEnd=FALSE'
countDFeByg <- sapply(seq(along=counteByg),
function(x) assays(counteByg[[x]])$counts)
rownames(countDFeByg) <- names(rowRanges(counteByg[[1]]))
colnames(countDFeByg) <- names(bfl)
rpkmDFeByg <- apply(countDFeByg, 2, function(x) returnRPKM(counts=x, ranges=eByg))
write.table(countDFeByg, "results/countDFeByg.xls",
col.names=NA, quote=FALSE, sep="\t")
write.table(rpkmDFeByg, "results/rpkmDFeByg.xls",
col.names=NA, quote=FALSE, sep="\t")
},
step_name = "read_counting",
dependency = "create_txdb")
```
Please note, in addition to read counts this step generates RPKM normalized
expression values. For most statistical differential expression or abundance
analysis methods, such as `edgeR` or `DESeq2`, the raw count values should
be used as input. The usage of RPKM values should be restricted to specialty
applications required by some users, _e.g._ manually comparing the expression
levels of different genes or features.
##### Read and alignment stats
The following provides an overview of the number of reads in each sample and
how many of them aligned to the reference.
```{r align_stats, message=FALSE, eval=FALSE, spr=TRUE}
appendStep(sal) <- LineWise({
read_statsDF <- alignStats(args)
write.table(read_statsDF, "results/alignStats.xls",
row.names = FALSE, quote = FALSE, sep = "\t")
},
step_name = "align_stats",
dependency = "hisat_mapping")
```
The following shows the first four lines of the sample alignment stats file
provided by the `systemPipeR` package. For simplicity the number of PE reads
is multiplied here by 2 to approximate proper alignment frequencies where each
read in a pair is counted.
```{r align_stats2, eval=TRUE}
read.table(system.file("extdata", "alignStats.xls", package = "systemPipeR"), header = TRUE)[1:4,]
```
### Read counting for miRNA profiling experiments
Example of downloading a GFF file for miRNA ranges from an organism of interest (here _A. thaliana_), and then
use them for read counting, here RNA-Seq reads from the above steps.
```{r read_counting_mirna, message=FALSE, eval=FALSE, spr=TRUE}
appendStep(sal) <- LineWise({
system("wget https://www.mirbase.org/download/ath.gff3 -P ./data/")
gff <- rtracklayer::import.gff("./data/ath.gff3")
gff <- split(gff, elementMetadata(gff)$ID)
bams <- getColumn(sal, step = "bowtie2_mapping", 'outfiles', column = 2)
bfl <- BamFileList(bams, yieldSize=50000, index=character())
countDFmiR <- summarizeOverlaps(gff, bfl, mode="Union",
ignore.strand = FALSE, inter.feature = FALSE)
countDFmiR <- assays(countDFmiR)$counts
# Note: inter.feature=FALSE important since pre and mature miRNA ranges overlap
rpkmDFmiR <- apply(countDFmiR, 2, function(x) returnRPKM(counts = x, ranges = gff))
write.table(assays(countDFmiR)$counts, "results/countDFmiR.xls",
col.names=NA, quote=FALSE, sep="\t")
write.table(rpkmDFmiR, "results/rpkmDFmiR.xls", col.names=NA, quote=FALSE, sep="\t")
},
step_name = "read_counting_mirna",
dependency = "bowtie2_mapping")
```
### Correlation analysis of samples
The following computes the sample-wise Spearman correlation coefficients from
the `rlog` (regularized-logarithm) transformed expression values generated
with the `DESeq2` package. After transformation to a distance matrix,
hierarchical clustering is performed with the `hclust` function and the
result is plotted as a dendrogram.
```{r sample_tree_rlog, message=FALSE, eval=FALSE, spr=TRUE}
appendStep(sal) <- LineWise({
library(DESeq2, warn.conflicts=FALSE, quietly=TRUE)
library(ape, warn.conflicts=FALSE)
countDFpath <- system.file("extdata", "countDFeByg.xls", package="systemPipeR")
countDF <- as.matrix(read.table(countDFpath))
colData <- data.frame(row.names = targetsWF(sal)[[2]]$SampleName,
condition=targetsWF(sal)[[2]]$Factor)
dds <- DESeqDataSetFromMatrix(countData = countDF, colData = colData,
design = ~ condition)
d <- cor(assay(rlog(dds)), method = "spearman")
hc <- hclust(dist(1-d))
plot.phylo(as.phylo(hc), type = "p", edge.col = 4, edge.width = 3,
show.node.label = TRUE, no.margin = TRUE)
},
step_name = "sample_tree_rlog",
dependency = "read_counting")
```
**Figure 7:** Correlation dendrogram of samples for _`rlog`_ values.
### DEG analysis with `edgeR`
The following _`run_edgeR`_ function is a convenience wrapper for
identifying differentially expressed genes (DEGs) in batch mode with
_`edgeR`_'s GML method [@Robinson2010-uk] for any number of
pairwise sample comparisons specified under the _`cmp`_ argument. Users
are strongly encouraged to consult the
[_`edgeR`_](\href{http://www.bioconductor.org/packages/devel/bioc/vignettes/edgeR/inst/doc/edgeRUsersGuide.pdf) vignette
for more detailed information on this topic and how to properly run _`edgeR`_
on data sets with more complex experimental designs.
```{r edger, message=FALSE, eval=FALSE, spr=TRUE}
appendStep(sal) <- LineWise({
targetspath <- system.file("extdata", "targets.txt", package = "systemPipeR")
targets <- read.delim(targetspath, comment = "#")
cmp <- readComp(file = targetspath, format = "matrix", delim = "-")
countDFeBygpath <- system.file("extdata", "countDFeByg.xls", package = "systemPipeR")
countDFeByg <- read.delim(countDFeBygpath, row.names = 1)
edgeDF <- run_edgeR(countDF = countDFeByg, targets = targets, cmp = cmp[[1]],
independent = FALSE, mdsplot = "")
DEG_list <- filterDEGs(degDF = edgeDF, filter = c(Fold = 2, FDR = 10))
},
step_name = "edger",
dependency = "read_counting")
```
Filter and plot DEG results for up and down-regulated genes. Because of the
small size of the toy data set used by this vignette, the _FDR_ cutoff value has been
set to a relatively high threshold (here 10%). More commonly used _FDR_ cutoffs
are 1% or 5%. The definition of '_up_' and '_down_' is given in the
corresponding help file. To open it, type `?filterDEGs` in the R console.
**Figure 8:** Up and down regulated DEGs identified by `edgeR`.
### DEG analysis with `DESeq2`
The following `run_DESeq2` function is a convenience wrapper for
identifying DEGs in batch mode with `DESeq2` [@Love2014-sh] for any number of
pairwise sample comparisons specified under the `cmp` argument. Users
are strongly encouraged to consult the
[_`DESeq2`_](http://www.bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.pdf) vignette
for more detailed information on this topic and how to properly run `DESeq2`
on data sets with more complex experimental designs.
```{r deseq2, message=FALSE, eval=FALSE, spr=TRUE}
appendStep(sal) <- LineWise({
degseqDF <- run_DESeq2(countDF=countDFeByg, targets=targets, cmp=cmp[[1]],
independent=FALSE)
DEG_list2 <- filterDEGs(degDF=degseqDF, filter=c(Fold=2, FDR=10))
},
step_name = "deseq2",
dependency = "read_counting")
```
### Venn Diagrams
The function `overLapper` can compute Venn intersects for large numbers of
sample sets (up to 20 or more) and `vennPlot` can plot 2-5 way Venn diagrams.
A useful feature is the possibility to combine the counts from several Venn
comparisons with the same number of sample sets in a single Venn diagram (here
for 4 up and down DEG sets).
```{r vennplot, message=FALSE, eval=FALSE, spr=TRUE}
appendStep(sal) <- LineWise({
vennsetup <- overLapper(DEG_list$Up[6:9], type="vennsets")
vennsetdown <- overLapper(DEG_list$Down[6:9], type="vennsets")
vennPlot(list(vennsetup, vennsetdown), mymain="", mysub="",
colmode=2, ccol=c("blue", "red"))
},
step_name = "vennplot",
dependency = "edger")
```
**Figure 9:** Venn Diagram for 4 Up and Down DEG Sets.
### GO term enrichment analysis of DEGs
#### Obtain gene-to-GO mappings
The following shows how to obtain gene-to-GO mappings from `biomaRt` (here
for _A. thaliana_) and how to organize them for the downstream GO term
enrichment analysis. Alternatively, the gene-to-GO mappings can be obtained for
many organisms from Bioconductor's `*.db` genome annotation packages or GO
annotation files provided by various genome databases. For each annotation,
this relatively slow preprocessing step needs to be performed only once.
Subsequently, the preprocessed data can be loaded with the `load` function as
shown in the next step.
```{r get_go_biomart, message=FALSE, eval=FALSE, spr=TRUE}
appendStep(sal) <- LineWise({
library("biomaRt")
listMarts() # To choose BioMart database
listMarts(host="plants.ensembl.org")
m <- useMart("plants_mart", host="https://plants.ensembl.org")
listDatasets(m)
m <- useMart("plants_mart", dataset="athaliana_eg_gene", host="https://plants.ensembl.org")
listAttributes(m) # Choose data types you want to download
go <- getBM(attributes=c("go_id", "tair_locus", "namespace_1003"), mart=m)
go <- go[go[,3]!="",]; go[,3] <- as.character(go[,3])
go[go[,3]=="molecular_function", 3] <- "F"
go[go[,3]=="biological_process", 3] <- "P"
go[go[,3]=="cellular_component", 3] <- "C"
go[1:4,]
dir.create("./data/GO")
write.table(go, "data/GO/GOannotationsBiomart_mod.txt",
quote=FALSE, row.names=FALSE, col.names=FALSE, sep="\t")
catdb <- makeCATdb(myfile="data/GO/GOannotationsBiomart_mod.txt",
lib=NULL, org="", colno=c(1,2,3), idconv=NULL)
save(catdb, file="data/GO/catdb.RData")
},
step_name = "get_go_biomart",
dependency = "edger")
```
#### Batch GO term enrichment analysis
Apply the enrichment analysis to the DEG sets obtained in the above
differential expression analysis. Note, in the following example the _FDR_
filter is set here to an unreasonably high value, simply because of the small
size of the toy data set used in this vignette. Batch enrichment analysis of
many gene sets is performed with the `GOCluster_Report` function. When
`method="all"`, it returns all GO terms passing the p-value cutoff specified
under the `cutoff` arguments. When `method="slim"`, it returns only the GO
terms specified under the `myslimv` argument. The given example shows how one
can obtain such a GO slim vector from BioMart for a specific organism.
```{r go_enrichment, message=FALSE, eval=FALSE, spr=TRUE}
appendStep(sal) <- LineWise({
load("data/GO/catdb.RData")
DEG_list <- filterDEGs(degDF=edgeDF, filter=c(Fold=2, FDR=50), plot=FALSE)
up_down <- DEG_list$UporDown; names(up_down) <- paste(names(up_down), "_up_down", sep="")
up <- DEG_list$Up; names(up) <- paste(names(up), "_up", sep="")
down <- DEG_list$Down; names(down) <- paste(names(down), "_down", sep="")
DEGlist <- c(up_down, up, down)
DEGlist <- DEGlist[sapply(DEGlist, length) > 0]
BatchResult <- GOCluster_Report(catdb=catdb, setlist=DEGlist, method="all",
id_type="gene", CLSZ=2, cutoff=0.9,
gocats=c("MF", "BP", "CC"), recordSpecGO=NULL)
library("biomaRt")
m <- useMart("plants_mart", dataset="athaliana_eg_gene", host="https://plants.ensembl.org")
goslimvec <- as.character(getBM(attributes=c("goslim_goa_accession"), mart=m)[,1])
BatchResultslim <- GOCluster_Report(catdb=catdb, setlist=DEGlist, method="slim",
id_type="gene", myslimv=goslimvec, CLSZ=10,
cutoff=0.01, gocats=c("MF", "BP", "CC"),
recordSpecGO=NULL)
gos <- BatchResultslim[grep("M6-V6_up_down", BatchResultslim$CLID), ]
gos <- BatchResultslim
pdf("GOslimbarplotMF.pdf", height=8, width=10); goBarplot(gos, gocat="MF"); dev.off()
goBarplot(gos, gocat="BP")
goBarplot(gos, gocat="CC")
},
step_name = "go_enrichment",
dependency = "get_go_biomart")
```
#### Plot batch GO term results
The `data.frame` generated by `GOCluster_Report` can be plotted with the
`goBarplot` function. Because of the variable size of the sample sets, it may
not always be desirable to show the results from different DEG sets in the same
bar plot.
**Figure 10:** GO Slim Barplot for MF Ontology.
### Clustering and heat maps
The following example performs hierarchical clustering on the `rlog`
transformed expression matrix subsetted by the DEGs identified in the above
differential expression analysis. It uses a Pearson correlation-based distance
measure and complete linkage for cluster joining.
```{r hierarchical_clustering, message=FALSE, eval=FALSE, spr=TRUE}
appendStep(sal) <- LineWise({
library(pheatmap)
geneids <- unique(as.character(unlist(DEG_list[[1]])))
y <- assay(rlog(dds))[geneids, ]
pdf("heatmap1.pdf")
pheatmap(y, scale="row", clustering_distance_rows="correlation",
clustering_distance_cols="correlation")
dev.off()
},
step_name = "hierarchical_clustering",
dependency = c("sample_tree_rlog", "edger"))
```
**Figure 11:** Heat map with hierarchical clustering dendrograms of DEGs.
# Version information
```{r sessionInfo}
sessionInfo()
```
# Funding
This project is funded by awards from the National Science Foundation ([ABI-1661152](https://www.nsf.gov/awardsearch/showAward?AWD_ID=1661152)],
and the National Institute on Aging of the National Institutes of Health ([U19AG023122](https://reporter.nih.gov/project-details/9632486)).
# References