## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(echo = TRUE)

## ----load libraries-----------------------------------------------------------
suppressPackageStartupMessages({
    library(anglemania)
    library(dplyr)
    library(Seurat)
    options(Seurat.object.assay.version = "v5")
    library(splatter)
    library(SingleCellExperiment)
    library(scater)
    library(scran)
    library(bluster)
    library(batchelor)
    library(UpSetR)
})

## ----create simulated data----------------------------------------------------
batch.facLoc <- 0.3
de.facLoc <- 0.1
nBatches <- 4
nGroups <- 3
nGenes <- 5000
groupCells <- 300

sce_raw <- splatSimulate(
    batchCells = rep(groupCells * nGroups, nBatches),
    batch.facLoc = batch.facLoc,
    group.prob = rep(1 / nGroups, nGroups),
    nGenes = nGenes,
    batch.facScale = 0.1,
    method = "groups",
    verbose = FALSE,
    out.prob = 0.001,
    de.prob = 0.1, # mild
    de.facLoc = de.facLoc,
    de.facScale = 0.1,
    bcv.common = 0.1,
    seed = 42
)
sce <- sce_raw
assays(sce)

## ----fig.cap = "UMAPs of unintegrated data, colored by Batch and Group. The clusters are driven by batch effects.", fig.wide = TRUE, fig.width = 10----
sce_unintegrated <- sce
# Normalization.
sce_unintegrated <- logNormCounts(sce_unintegrated)

# Feature selection.
dec <- modelGeneVar(sce_unintegrated)
hvg <- getTopHVGs(dec, prop = 0.1)

# PCA.
set.seed(1234)
sce_unintegrated <- scater::runPCA(
    sce_unintegrated,
    ncomponents = 50,
    subset_row = hvg
)

# Clustering.
colLabels(sce_unintegrated) <- clusterCells(sce_unintegrated,
    use.dimred = "PCA",
    BLUSPARAM = NNGraphParam(cluster.fun = "louvain")
)

# Visualization.
sce_unintegrated <- scater::runUMAP(sce_unintegrated, dimred = "PCA")
patchwork::wrap_plots(
    plotUMAP(sce_unintegrated, colour_by = "Batch") +
        ggtitle("Unintegrated data, colored by Batch"),
    plotUMAP(sce_unintegrated, colour_by = "Group") +
        ggtitle("Unintegrated data, colored by Group")
)

## ----run anglemania, message = FALSE------------------------------------------
head(colData(sce))
batch_key <- "Batch"
sce <- anglemania(
    sce,
    batch_key = batch_key,
    zscore_mean_threshold = 2,
    zscore_sn_threshold = 2
)

## -----------------------------------------------------------------------------
anglemania_genes <- get_anglemania_genes(sce)
head(anglemania_genes)
length(anglemania_genes)

## ----select genes, message = FALSE--------------------------------------------
# If you think the number of selected genes is
# too high or low you can adjust the thresholds:
sce <- select_genes(sce,
    zscore_mean_threshold = 2.5,
    zscore_sn_threshold = 2.5
)
# Inspect the anglemania genes
anglemania_genes <- get_anglemania_genes(sce)
head(anglemania_genes)
length(anglemania_genes) # 306 genes are selected with these thresholds

## ----MNN_300 HVGs-------------------------------------------------------------
hvg_300 <- sce %>%
    scater::logNormCounts() %>%
    modelGeneVar(block = colData(sce)[[batch_key]]) %>%
    getTopHVGs(n = 300)

barcodes_by_batch <- split(rownames(colData(sce)), colData(sce)[[batch_key]])
sce_list <- lapply(barcodes_by_batch, function(x) sce[, x])
sce_mnn <- sce %>%
    scater::logNormCounts()
sce_mnn <- batchelor::fastMNN(
    sce_mnn,
    subset.row = hvg_300,
    k = 20,
    batch = factor(colData(sce_mnn)[[batch_key]]),
    d = 50
)
reducedDim(sce, "MNN_hvg_300") <- reducedDim(sce_mnn, "corrected")
sce <- scater::runUMAP(sce, dimred = "MNN_hvg_300", name = "umap_MNN_hvg_300")
# k is the number of nearest neighbours to consider when identifying MNNs

## ----MNN_2000 HVGs------------------------------------------------------------
hvg_2000 <- sce %>%
    scater::logNormCounts() %>%
    modelGeneVar(block = colData(sce)[[batch_key]]) %>%
    getTopHVGs(n = 2000)

barcodes_by_batch <- split(rownames(colData(sce)), colData(sce)[[batch_key]])
sce_list <- lapply(barcodes_by_batch, function(x) sce[, x])
sce_mnn <- sce %>%
    scater::logNormCounts()
sce_mnn <- batchelor::fastMNN(
    sce_mnn,
    subset.row = hvg_2000,
    k = 20,
    batch = factor(colData(sce_mnn)[[batch_key]]),
    d = 50
)
reducedDim(sce, "MNN_hvg_2000") <- reducedDim(sce_mnn, "corrected")
sce <- scater::runUMAP(sce, dimred = "MNN_hvg_2000", name = "umap_MNN_hvg_2000")

## ----MNN anglemania, message = FALSE------------------------------------------
sce_mnn <- sce %>%
    scater::logNormCounts()
sce_mnn <- batchelor::fastMNN(
    sce_mnn,
    subset.row = anglemania_genes,
    k = 20,
    batch = factor(colData(sce_mnn)[[batch_key]]),
    d = 50
)
reducedDim(sce, "MNN_anglemania") <- reducedDim(sce_mnn, "corrected")
sce <- scater::runUMAP(
    sce,
    dimred = "MNN_anglemania",
    name = "umap_MNN_anglemania"
)

## ----fig.cap = "UMAPs of MNN integrated data. Comparison of UMAP embeddings of integrated data using anglemania genes, top 300 HVGs and top 2000 HVGs.", fig.width = 12, fig.height = 8, fig.wide = TRUE----
# Use wrap_plots
patchwork::wrap_plots(
    plotReducedDim(sce, colour_by = "Batch", dimred = "umap_MNN_anglemania") +
        ggtitle("MNN integration using anglemania genes, colored by Batch"),
    plotReducedDim(sce, colour_by = "Group", dimred = "umap_MNN_anglemania") +
        ggtitle("MNN integration using anglemania genes, colored by Group"),
    plotReducedDim(sce, colour_by = "Batch", dimred = "umap_MNN_hvg_300") +
        ggtitle("MNN integration using top 300 HVGs, colored by Batch"),
    plotReducedDim(sce, colour_by = "Group", dimred = "umap_MNN_hvg_300") +
        ggtitle("MNN integration using top 300 HVGs, colored by Group"),
    plotReducedDim(sce, colour_by = "Batch", dimred = "umap_MNN_hvg_2000") +
        ggtitle("MNN integration using top 2000 HVGs, colored by Batch"),
    plotReducedDim(sce, colour_by = "Group", dimred = "umap_MNN_hvg_2000") +
        ggtitle("MNN integration using top 2000 HVGs, colored by Group"),
    ncol = 2
)

## ----fig.cap = "Overlap of selected genes. Additionally, we check the overlap of the anglemania genes with the HVGs. About 33 of the 306 anglemania genes are also found in the top 300 HVGs, and about 179 of the 306 anglemania genes are also found in the top 2000 HVGs.", fig.width = 10, fig.height = 8, fig.wide = TRUE, message = FALSE, warning = FALSE----
upsetr_df <- fromList(
    list(
        anglemania = anglemania_genes,
        hvg_300 = hvg_300,
        hvg_2000 = hvg_2000
    )
)
upset(upsetr_df, text.scale = 2)

## ----message = FALSE, warning = FALSE-----------------------------------------
se <- CreateSeuratObject(
    counts = counts(sce_raw),
    meta.data = as.data.frame(colData(sce_raw))
)
se
anglemania_genes <- se |>
    as.SingleCellExperiment(assay = "RNA") |>
    anglemania(
        batch_key = "Batch",
        zscore_mean_threshold = 2,
        zscore_sn_threshold = 2
    ) |>
    get_anglemania_genes()

## ----Seurat Integration v5, message = FALSE, warning = FALSE------------------
# Split by batch
se[["RNA"]] <- split(se[["RNA"]], f = se$Batch)

# Standard preprocessing but use anglemania genes as "VariableFeatures"
se <- NormalizeData(se, verbose = FALSE)
VariableFeatures(se) <- anglemania_genes
se <- se |>
    ScaleData(verbose = FALSE) |>
    RunPCA(verbose = FALSE)

# Integrate
se <- IntegrateLayers(
    object = se,
    method = CCAIntegration,
    orig.reduction = "pca",
    new.reduction = "integrated.cca",
    features = anglemania_genes,
    verbose = FALSE
)
se <- RunUMAP(se, dims = 1:30, reduction = "integrated.cca", verbose = FALSE)
se

## ----fig.cap = "UMAPs of Seurat integrated data. Here we show that we can use the anglemania genes for integration of a SeuratObject.", fig.wide = TRUE, fig.width = 10----
patchwork::wrap_plots(
    DimPlot(se, reduction = "umap", group.by = "Batch") +
        ggtitle("Seurat integration using\nanglemania genes\ncolored by Batch"),
    DimPlot(se, reduction = "umap", group.by = "Group") +
        ggtitle("Seurat integration using\nanglemania genes\ncolored by Group"),
    ncol = 2
)

## ----message = FALSE----------------------------------------------------------
sce_raw <- sce_example()
sce <- sce_raw
batch_key <- "batch"
sce <- anglemania(sce, batch_key = batch_key, verbose = FALSE)

## ----message = FALSE----------------------------------------------------------
barcodes_by_batch <- split(rownames(colData(sce)), colData(sce)[[batch_key]])
counts_by_batch <- lapply(barcodes_by_batch, function(x) {
    counts(sce[, x]) %>% sparse_to_fbm()
})
counts_by_batch[[1]][1:10, 1:6]
# we are working on FBMs (file-backed matrices
# implemented in the bigstatsr package)
class(counts_by_batch[[1]])

# factorise produces the correlation matrices transformed to z-scores
factorised <- lapply(counts_by_batch, factorise)
factorised[[1]][1:10, 1:6]

## ----message = FALSE----------------------------------------------------------
matrix_list <- metadata(sce)$anglemania$matrix_list
weights <- setNames(
    metadata(sce)$anglemania$weights$weight,
    metadata(sce)$anglemania$weights$batch
)
list_stats <- get_list_stats(
    matrix_list = matrix_list,
    weights = weights,
    verbose = FALSE
)
names(list_stats)
class(list_stats)
list_stats$mean_zscore[1:10, 1:6]
list_stats$sn_zscore[1:10, 1:6]

# Or we can access them directly from the SCE object
# after running anglemania
metadata(sce)$anglemania$list_stats$mean_zscore[1:10, 1:6]
metadata(sce)$anglemania$list_stats$sn_zscore[1:10, 1:6]

## ----message = FALSE----------------------------------------------------------
previous_genes <- get_anglemania_genes(sce)
sce <- select_genes(
    sce,
    zscore_mean_threshold = 2,
    zscore_sn_threshold = 2,
    verbose = FALSE
)
# Inspect the anglemania genes
new_genes <- get_anglemania_genes(sce)

length(previous_genes)
length(new_genes)

## -----------------------------------------------------------------------------
sessionInfo()