---
title: "BLASE for excluding developmental genes from bulk RNA-seq"
output: BiocStyle::html_document
vignette: >
  %\VignetteIndexEntry{BLASE for excluding developmental genes from bulk RNA-seq}
  %\VignetteEncoding{UTF-8}
  %\VignetteEngine{knitr::rmarkdown}
---

```{r, include = FALSE}
knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>"
)
```

```{r setup, results='hide', message=FALSE, warning=FALSE}
library(scater)
library(ggplot2)
library(BiocParallel)
library(blase)
library(limma)
library(ggVennDiagram)
```

```{r, randomseed}
RNGversion("3.5.0")
SEED <- 7
set.seed(SEED)
```

```{r, concurrency}
N_CORES <- 2
bpparam <- MulticoreParam(N_CORES)
```

This article will show how BLASE can be used for excluding the effect of 
developmental differences when analysing bulk RNA-seq data. 
We make use of scRNA-seq 
[Dogga et al. 2024](https://www.science.org/doi/10.1126/science.adj4088)
and bulk RNA-seq data from 
[Zhang et al. 2021](https://www.nature.com/articles/s41467-021-24814-1). 
Code for generating the objects used here is available in the BLASE 
reproducibility repository.

## Load Data 

First we'll load in the data we're using, pre-prepared from the BLASE
reproducibility repository.

```{r, load_data}
data(zhang_2021_heat_shock_bulk, package = "blase")
data(MCA_PF_SCE, package = "blase")
```

We can examine the true lifecycle stages, and also the calculated pseudotime
trajectory (Slingshot).

```{r, sc_umaps}
#| fig.alt: >
#|   UMAP of Dogga et al. single-cell RNA-seq reference coloured by lifecycle
#|   stage, showing the cycle from Rings, Trophozoites, Schizonts, to Rings
#|   again.
plotUMAP(MCA_PF_SCE, colour_by = "STAGE_HR")
#| fig.alt: >
#|   UMAP of Dogga et al. single-cell RNA-seq reference coloured by pseudotime,
#|   starting with Rings, and ending with Schizonts.
plotUMAP(MCA_PF_SCE, colour_by = "slingPseudotime_1")
```
## Prepare BLASE 

Now we'll prepare BLASE for use.

### Create BLASE data object 

First, we create the object, giving it the 
number of bins we want to use, and how to calculate them.

```{r, create_BLASE_object}
blase_data <- as.BlaseData(
    MCA_PF_SCE,
    pseudotime_slot = "slingPseudotime_1",
    n_bins = 8,
    split_by = "pseudotime_range"
)

# Add these bins to the sc metadata
MCA_PF_SCE <- assign_pseudotime_bins(MCA_PF_SCE,
    pseudotime_slot = "slingPseudotime_1",
    n_bins = 8,
    split_by = "pseudotime_range"
)
```

### Select Genes

Now we will select the genes we want to use, using BLASE's gene peakedness 
metric.

```{r, calculate_gene_peakedness}
gene_peakedness_info <- calculate_gene_peakedness(
    MCA_PF_SCE,
    window_pct = 5,
    knots = 18,
    BPPARAM = bpparam
)

genes_to_use <- gene_peakedness_spread_selection(
    MCA_PF_SCE,
    gene_peakedness_info,
    genes_per_bin = 30,
    n_gene_bins = 30
)

head(gene_peakedness_info)
```

Here, we add them to the BLASE object for mapping with.
```{r, add_genes_to_blase_data}
genes(blase_data) <- genes_to_use
```

## Calculate Mappings

Now we can perform the actual mapping step, and review the results.

```{r, mapping}
mapping_results <- map_all_best_bins(
    blase_data,
    zhang_2021_heat_shock_bulk,
    BPPARAM = bpparam
)
```

```{r, mappingPlot}
#| fig.alt: >
#|   Heatmap of mapping correlations of the Zhang et al. data onto
#|   the Dogga et al. scRNA-seq.
plot_mapping_result_heatmap(mapping_results)
```

## Calculate DE genes

We calculate DE genes using Limma (ref) as we have access to only
normalised counts.

```{r, setUpDE}
metadata <- data.frame(row.names = seq_len(12))
metadata$strain <- c(
    rep("NF54", 4),
    rep("PB4", 4),
    rep("PB31", 4)
)
metadata$growth_conditions <- rep(c("Normal", "Normal", "HS", "HS"), 3)
metadata$group <- paste0(metadata$strain, "_", metadata$growth_conditions)
rownames(metadata) <- c(
    "NF54_37_1",
    "NF54_37_2",
    "NF54_41_1",
    "NF54_41_2",
    "PB4_37_1",
    "PB4_37_2",
    "PB4_41_1",
    "PB4_41_2",
    "PB31_37_1",
    "PB31_37_2",
    "PB31_41_1",
    "PB31_41_2"
)

design <- model.matrix(~ 0 + group, metadata)
colnames(design) <- gsub("group", "", colnames(design))

contr.matrix <- makeContrasts(
    WT_normal_v_HS = NF54_Normal - NF54_HS,
    PB31_normal_v_HS = PB31_Normal - PB31_HS,
    normal_NF54_v_PB31 = NF54_Normal - PB31_Normal,
    levels = colnames(design)
)
```

### Phenotype + Development (Bulk DE)
```{r, calculateBulkDE}
vfit <- lmFit(zhang_2021_heat_shock_bulk, design)
vfit <- contrasts.fit(vfit, contrasts = contr.matrix)
efit <- eBayes(vfit)

summary(decideTests(efit))

PB31_normal_v_HS_BULK_DE <- topTable(efit, n = Inf, coef = 2)
PB31_normal_v_HS_BULK_DE <- PB31_normal_v_HS_BULK_DE[PB31_normal_v_HS_BULK_DE$adj.P.Val < 0.05 & PB31_normal_v_HS_BULK_DE$logFC > 0, ]
PB31_normal_v_HS_BULK_DE <- PB31_normal_v_HS_BULK_DE[order(-PB31_normal_v_HS_BULK_DE$logFC), ]
```

### Development (SC DE)
First, let's pseudobulk based on the pseudotime bins
```{r, pseudobulk}
pseudobulked_MCA_PF_SCE <- data.frame(row.names = rownames(MCA_PF_SCE))
for (r in mapping_results) {
    pseudobulked_MCA_PF_SCE[bulk_name(r)] <- rowSums(counts(MCA_PF_SCE[, MCA_PF_SCE$pseudotime_bin == best_bin(r)]))
}
pseudobulked_MCA_PF_SCE <- as.matrix(pseudobulked_MCA_PF_SCE)
```

And now calculate DE genes for these
```{r}
## Normalise, not needed for true bulks
par(mfrow = c(1, 2))
v <- voom(pseudobulked_MCA_PF_SCE, design, plot = FALSE)

vfit_sc <- lmFit(v, design)
vfit_sc <- contrasts.fit(vfit_sc, contrasts = contr.matrix)
efit_sc <- eBayes(vfit_sc)

summary(decideTests(efit_sc))
```

```{r}
PB31_normal_v_HS_SC_DE <- topTable(efit_sc, n = Inf, coef = 2)
PB31_normal_v_HS_SC_DE <- PB31_normal_v_HS_SC_DE[PB31_normal_v_HS_SC_DE$adj.P.Val < 0.05 & PB31_normal_v_HS_SC_DE$logFC > 0, ]
PB31_normal_v_HS_SC_DE <- PB31_normal_v_HS_SC_DE[order(-PB31_normal_v_HS_SC_DE$logFC), ]
```

## Remove Developmental Genes
How many of these genes overlap? We can look at the intersection using a 
Venn Diagram:
```{r}
#| fig.alt: >
#|   Venn Diagram showing the intersection of Bulk and SC DE genes.
ggVennDiagram(list(
    Phenotype = rownames(PB31_normal_v_HS_BULK_DE),
    Development = rownames(PB31_normal_v_HS_SC_DE)
)) + ggtitle("PB31 Normal vs Heat Shock")
```

Great, there are genes which we can correct. We can now remove these from the 
original DE genes list.

NB: This is a primitive approach, and there are likely many other and better
ways to calculate which genes should be kept or removed.
```{r}
PB31_normal_v_HS_corrected_DE <- rownames(PB31_normal_v_HS_BULK_DE[!(rownames(PB31_normal_v_HS_BULK_DE) %in% rownames(PB31_normal_v_HS_SC_DE)), ])
head(PB31_normal_v_HS_corrected_DE)
```

Further downstream analysis can now be performed, as desired by the researcher, 
for example Gene Ontology term enrichment. 

## Session Info
```{r, sessionInfo}
sessionInfo()
```