## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set(echo = TRUE) ## ----load libraries----------------------------------------------------------- suppressPackageStartupMessages({ library(anglemania) library(dplyr) library(Seurat) options(Seurat.object.assay.version = "v5") library(splatter) library(SingleCellExperiment) library(scater) library(scran) library(bluster) library(batchelor) library(UpSetR) }) ## ----create simulated data---------------------------------------------------- batch.facLoc <- 0.3 de.facLoc <- 0.1 nBatches <- 4 nGroups <- 3 nGenes <- 5000 groupCells <- 300 sce_raw <- splatSimulate( batchCells = rep(groupCells * nGroups, nBatches), batch.facLoc = batch.facLoc, group.prob = rep(1 / nGroups, nGroups), nGenes = nGenes, batch.facScale = 0.1, method = "groups", verbose = FALSE, out.prob = 0.001, de.prob = 0.1, # mild de.facLoc = de.facLoc, de.facScale = 0.1, bcv.common = 0.1, seed = 42 ) sce <- sce_raw assays(sce) ## ----fig.cap = "UMAPs of unintegrated data, colored by Batch and Group. The clusters are driven by batch effects.", fig.wide = TRUE, fig.width = 10---- sce_unintegrated <- sce # Normalization. sce_unintegrated <- logNormCounts(sce_unintegrated) # Feature selection. dec <- modelGeneVar(sce_unintegrated) hvg <- getTopHVGs(dec, prop = 0.1) # PCA. set.seed(1234) sce_unintegrated <- scater::runPCA( sce_unintegrated, ncomponents = 50, subset_row = hvg ) # Clustering. colLabels(sce_unintegrated) <- clusterCells(sce_unintegrated, use.dimred = "PCA", BLUSPARAM = NNGraphParam(cluster.fun = "louvain") ) # Visualization. sce_unintegrated <- scater::runUMAP(sce_unintegrated, dimred = "PCA") patchwork::wrap_plots( plotUMAP(sce_unintegrated, colour_by = "Batch") + ggtitle("Unintegrated data, colored by Batch"), plotUMAP(sce_unintegrated, colour_by = "Group") + ggtitle("Unintegrated data, colored by Group") ) ## ----run anglemania, message = FALSE------------------------------------------ head(colData(sce)) batch_key <- "Batch" sce <- anglemania( sce, batch_key = batch_key, zscore_mean_threshold = 2, zscore_sn_threshold = 2 ) ## ----------------------------------------------------------------------------- anglemania_genes <- get_anglemania_genes(sce) head(anglemania_genes) length(anglemania_genes) ## ----select genes, message = FALSE-------------------------------------------- # If you think the number of selected genes is # too high or low you can adjust the thresholds: sce <- select_genes(sce, zscore_mean_threshold = 2.5, zscore_sn_threshold = 2.5 ) # Inspect the anglemania genes anglemania_genes <- get_anglemania_genes(sce) head(anglemania_genes) length(anglemania_genes) # 306 genes are selected with these thresholds ## ----MNN_300 HVGs------------------------------------------------------------- hvg_300 <- sce %>% scater::logNormCounts() %>% modelGeneVar(block = colData(sce)[[batch_key]]) %>% getTopHVGs(n = 300) barcodes_by_batch <- split(rownames(colData(sce)), colData(sce)[[batch_key]]) sce_list <- lapply(barcodes_by_batch, function(x) sce[, x]) sce_mnn <- sce %>% scater::logNormCounts() sce_mnn <- batchelor::fastMNN( sce_mnn, subset.row = hvg_300, k = 20, batch = factor(colData(sce_mnn)[[batch_key]]), d = 50 ) reducedDim(sce, "MNN_hvg_300") <- reducedDim(sce_mnn, "corrected") sce <- scater::runUMAP(sce, dimred = "MNN_hvg_300", name = "umap_MNN_hvg_300") # k is the number of nearest neighbours to consider when identifying MNNs ## ----MNN_2000 HVGs------------------------------------------------------------ hvg_2000 <- sce %>% scater::logNormCounts() %>% modelGeneVar(block = colData(sce)[[batch_key]]) %>% getTopHVGs(n = 2000) barcodes_by_batch <- split(rownames(colData(sce)), colData(sce)[[batch_key]]) sce_list <- lapply(barcodes_by_batch, function(x) sce[, x]) sce_mnn <- sce %>% scater::logNormCounts() sce_mnn <- batchelor::fastMNN( sce_mnn, subset.row = hvg_2000, k = 20, batch = factor(colData(sce_mnn)[[batch_key]]), d = 50 ) reducedDim(sce, "MNN_hvg_2000") <- reducedDim(sce_mnn, "corrected") sce <- scater::runUMAP(sce, dimred = "MNN_hvg_2000", name = "umap_MNN_hvg_2000") ## ----MNN anglemania, message = FALSE------------------------------------------ sce_mnn <- sce %>% scater::logNormCounts() sce_mnn <- batchelor::fastMNN( sce_mnn, subset.row = anglemania_genes, k = 20, batch = factor(colData(sce_mnn)[[batch_key]]), d = 50 ) reducedDim(sce, "MNN_anglemania") <- reducedDim(sce_mnn, "corrected") sce <- scater::runUMAP( sce, dimred = "MNN_anglemania", name = "umap_MNN_anglemania" ) ## ----fig.cap = "UMAPs of MNN integrated data. Comparison of UMAP embeddings of integrated data using anglemania genes, top 300 HVGs and top 2000 HVGs.", fig.width = 12, fig.height = 8, fig.wide = TRUE---- # Use wrap_plots patchwork::wrap_plots( plotReducedDim(sce, colour_by = "Batch", dimred = "umap_MNN_anglemania") + ggtitle("MNN integration using anglemania genes, colored by Batch"), plotReducedDim(sce, colour_by = "Group", dimred = "umap_MNN_anglemania") + ggtitle("MNN integration using anglemania genes, colored by Group"), plotReducedDim(sce, colour_by = "Batch", dimred = "umap_MNN_hvg_300") + ggtitle("MNN integration using top 300 HVGs, colored by Batch"), plotReducedDim(sce, colour_by = "Group", dimred = "umap_MNN_hvg_300") + ggtitle("MNN integration using top 300 HVGs, colored by Group"), plotReducedDim(sce, colour_by = "Batch", dimred = "umap_MNN_hvg_2000") + ggtitle("MNN integration using top 2000 HVGs, colored by Batch"), plotReducedDim(sce, colour_by = "Group", dimred = "umap_MNN_hvg_2000") + ggtitle("MNN integration using top 2000 HVGs, colored by Group"), ncol = 2 ) ## ----fig.cap = "Overlap of selected genes. Additionally, we check the overlap of the anglemania genes with the HVGs. About 33 of the 306 anglemania genes are also found in the top 300 HVGs, and about 179 of the 306 anglemania genes are also found in the top 2000 HVGs.", fig.width = 10, fig.height = 8, fig.wide = TRUE, message = FALSE, warning = FALSE---- upsetr_df <- fromList( list( anglemania = anglemania_genes, hvg_300 = hvg_300, hvg_2000 = hvg_2000 ) ) upset(upsetr_df, text.scale = 2) ## ----message = FALSE, warning = FALSE----------------------------------------- se <- CreateSeuratObject( counts = counts(sce_raw), meta.data = as.data.frame(colData(sce_raw)) ) se anglemania_genes <- se |> as.SingleCellExperiment(assay = "RNA") |> anglemania( batch_key = "Batch", zscore_mean_threshold = 2, zscore_sn_threshold = 2 ) |> get_anglemania_genes() ## ----Seurat Integration v5, message = FALSE, warning = FALSE------------------ # Split by batch se[["RNA"]] <- split(se[["RNA"]], f = se$Batch) # Standard preprocessing but use anglemania genes as "VariableFeatures" se <- NormalizeData(se, verbose = FALSE) VariableFeatures(se) <- anglemania_genes se <- se |> ScaleData(verbose = FALSE) |> RunPCA(verbose = FALSE) # Integrate se <- IntegrateLayers( object = se, method = CCAIntegration, orig.reduction = "pca", new.reduction = "integrated.cca", features = anglemania_genes, verbose = FALSE ) se <- RunUMAP(se, dims = 1:30, reduction = "integrated.cca", verbose = FALSE) se ## ----fig.cap = "UMAPs of Seurat integrated data. Here we show that we can use the anglemania genes for integration of a SeuratObject.", fig.wide = TRUE, fig.width = 10---- patchwork::wrap_plots( DimPlot(se, reduction = "umap", group.by = "Batch") + ggtitle("Seurat integration using\nanglemania genes\ncolored by Batch"), DimPlot(se, reduction = "umap", group.by = "Group") + ggtitle("Seurat integration using\nanglemania genes\ncolored by Group"), ncol = 2 ) ## ----message = FALSE---------------------------------------------------------- sce_raw <- sce_example() sce <- sce_raw batch_key <- "batch" sce <- anglemania(sce, batch_key = batch_key, verbose = FALSE) ## ----message = FALSE---------------------------------------------------------- barcodes_by_batch <- split(rownames(colData(sce)), colData(sce)[[batch_key]]) counts_by_batch <- lapply(barcodes_by_batch, function(x) { counts(sce[, x]) %>% sparse_to_fbm() }) counts_by_batch[[1]][1:10, 1:6] # we are working on FBMs (file-backed matrices # implemented in the bigstatsr package) class(counts_by_batch[[1]]) # factorise produces the correlation matrices transformed to z-scores factorised <- lapply(counts_by_batch, factorise) factorised[[1]][1:10, 1:6] ## ----message = FALSE---------------------------------------------------------- matrix_list <- metadata(sce)$anglemania$matrix_list weights <- setNames( metadata(sce)$anglemania$weights$weight, metadata(sce)$anglemania$weights$batch ) list_stats <- get_list_stats( matrix_list = matrix_list, weights = weights, verbose = FALSE ) names(list_stats) class(list_stats) list_stats$mean_zscore[1:10, 1:6] list_stats$sn_zscore[1:10, 1:6] # Or we can access them directly from the SCE object # after running anglemania metadata(sce)$anglemania$list_stats$mean_zscore[1:10, 1:6] metadata(sce)$anglemania$list_stats$sn_zscore[1:10, 1:6] ## ----message = FALSE---------------------------------------------------------- previous_genes <- get_anglemania_genes(sce) sce <- select_genes( sce, zscore_mean_threshold = 2, zscore_sn_threshold = 2, verbose = FALSE ) # Inspect the anglemania genes new_genes <- get_anglemania_genes(sce) length(previous_genes) length(new_genes) ## ----------------------------------------------------------------------------- sessionInfo()