--- title: "BLASE for excluding developmental genes from bulk RNA-seq" output: BiocStyle::html_document vignette: > %\VignetteIndexEntry{BLASE for excluding developmental genes from bulk RNA-seq} %\VignetteEncoding{UTF-8} %\VignetteEngine{knitr::rmarkdown} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` ```{r setup, results='hide', message=FALSE, warning=FALSE} library(scater) library(ggplot2) library(BiocParallel) library(blase) library(limma) library(ggVennDiagram) ``` ```{r, randomseed} RNGversion("3.5.0") SEED <- 7 set.seed(SEED) ``` ```{r, concurrency} N_CORES <- 2 bpparam <- MulticoreParam(N_CORES) ``` This article will show how BLASE can be used for excluding the effect of developmental differences when analysing bulk RNA-seq data. We make use of scRNA-seq [Dogga et al. 2024](https://www.science.org/doi/10.1126/science.adj4088) and bulk RNA-seq data from [Zhang et al. 2021](https://www.nature.com/articles/s41467-021-24814-1). Code for generating the objects used here is available in the BLASE reproducibility repository. ## Load Data First we'll load in the data we're using, pre-prepared from the BLASE reproducibility repository. ```{r, load_data} data(zhang_2021_heat_shock_bulk, package = "blase") data(MCA_PF_SCE, package = "blase") ``` We can examine the true lifecycle stages, and also the calculated pseudotime trajectory (Slingshot). ```{r, sc_umaps} #| fig.alt: > #| UMAP of Dogga et al. single-cell RNA-seq reference coloured by lifecycle #| stage, showing the cycle from Rings, Trophozoites, Schizonts, to Rings #| again. plotUMAP(MCA_PF_SCE, colour_by = "STAGE_HR") #| fig.alt: > #| UMAP of Dogga et al. single-cell RNA-seq reference coloured by pseudotime, #| starting with Rings, and ending with Schizonts. plotUMAP(MCA_PF_SCE, colour_by = "slingPseudotime_1") ``` ## Prepare BLASE Now we'll prepare BLASE for use. ### Create BLASE data object First, we create the object, giving it the number of bins we want to use, and how to calculate them. ```{r, create_BLASE_object} blase_data <- as.BlaseData( MCA_PF_SCE, pseudotime_slot = "slingPseudotime_1", n_bins = 8, split_by = "pseudotime_range" ) # Add these bins to the sc metadata MCA_PF_SCE <- assign_pseudotime_bins(MCA_PF_SCE, pseudotime_slot = "slingPseudotime_1", n_bins = 8, split_by = "pseudotime_range" ) ``` ### Select Genes Now we will select the genes we want to use, using BLASE's gene peakedness metric. ```{r, calculate_gene_peakedness} gene_peakedness_info <- calculate_gene_peakedness( MCA_PF_SCE, window_pct = 5, knots = 18, BPPARAM = bpparam ) genes_to_use <- gene_peakedness_spread_selection( MCA_PF_SCE, gene_peakedness_info, genes_per_bin = 30, n_gene_bins = 30 ) head(gene_peakedness_info) ``` Here, we add them to the BLASE object for mapping with. ```{r, add_genes_to_blase_data} genes(blase_data) <- genes_to_use ``` ## Calculate Mappings Now we can perform the actual mapping step, and review the results. ```{r, mapping} mapping_results <- map_all_best_bins( blase_data, zhang_2021_heat_shock_bulk, BPPARAM = bpparam ) ``` ```{r, mappingPlot} #| fig.alt: > #| Heatmap of mapping correlations of the Zhang et al. data onto #| the Dogga et al. scRNA-seq. plot_mapping_result_heatmap(mapping_results) ``` ## Calculate DE genes We calculate DE genes using Limma (ref) as we have access to only normalised counts. ```{r, setUpDE} metadata <- data.frame(row.names = seq_len(12)) metadata$strain <- c( rep("NF54", 4), rep("PB4", 4), rep("PB31", 4) ) metadata$growth_conditions <- rep(c("Normal", "Normal", "HS", "HS"), 3) metadata$group <- paste0(metadata$strain, "_", metadata$growth_conditions) rownames(metadata) <- c( "NF54_37_1", "NF54_37_2", "NF54_41_1", "NF54_41_2", "PB4_37_1", "PB4_37_2", "PB4_41_1", "PB4_41_2", "PB31_37_1", "PB31_37_2", "PB31_41_1", "PB31_41_2" ) design <- model.matrix(~ 0 + group, metadata) colnames(design) <- gsub("group", "", colnames(design)) contr.matrix <- makeContrasts( WT_normal_v_HS = NF54_Normal - NF54_HS, PB31_normal_v_HS = PB31_Normal - PB31_HS, normal_NF54_v_PB31 = NF54_Normal - PB31_Normal, levels = colnames(design) ) ``` ### Phenotype + Development (Bulk DE) ```{r, calculateBulkDE} vfit <- lmFit(zhang_2021_heat_shock_bulk, design) vfit <- contrasts.fit(vfit, contrasts = contr.matrix) efit <- eBayes(vfit) summary(decideTests(efit)) PB31_normal_v_HS_BULK_DE <- topTable(efit, n = Inf, coef = 2) PB31_normal_v_HS_BULK_DE <- PB31_normal_v_HS_BULK_DE[PB31_normal_v_HS_BULK_DE$adj.P.Val < 0.05 & PB31_normal_v_HS_BULK_DE$logFC > 0, ] PB31_normal_v_HS_BULK_DE <- PB31_normal_v_HS_BULK_DE[order(-PB31_normal_v_HS_BULK_DE$logFC), ] ``` ### Development (SC DE) First, let's pseudobulk based on the pseudotime bins ```{r, pseudobulk} pseudobulked_MCA_PF_SCE <- data.frame(row.names = rownames(MCA_PF_SCE)) for (r in mapping_results) { pseudobulked_MCA_PF_SCE[bulk_name(r)] <- rowSums(counts(MCA_PF_SCE[, MCA_PF_SCE$pseudotime_bin == best_bin(r)])) } pseudobulked_MCA_PF_SCE <- as.matrix(pseudobulked_MCA_PF_SCE) ``` And now calculate DE genes for these ```{r} ## Normalise, not needed for true bulks par(mfrow = c(1, 2)) v <- voom(pseudobulked_MCA_PF_SCE, design, plot = FALSE) vfit_sc <- lmFit(v, design) vfit_sc <- contrasts.fit(vfit_sc, contrasts = contr.matrix) efit_sc <- eBayes(vfit_sc) summary(decideTests(efit_sc)) ``` ```{r} PB31_normal_v_HS_SC_DE <- topTable(efit_sc, n = Inf, coef = 2) PB31_normal_v_HS_SC_DE <- PB31_normal_v_HS_SC_DE[PB31_normal_v_HS_SC_DE$adj.P.Val < 0.05 & PB31_normal_v_HS_SC_DE$logFC > 0, ] PB31_normal_v_HS_SC_DE <- PB31_normal_v_HS_SC_DE[order(-PB31_normal_v_HS_SC_DE$logFC), ] ``` ## Remove Developmental Genes How many of these genes overlap? We can look at the intersection using a Venn Diagram: ```{r} #| fig.alt: > #| Venn Diagram showing the intersection of Bulk and SC DE genes. ggVennDiagram(list( Phenotype = rownames(PB31_normal_v_HS_BULK_DE), Development = rownames(PB31_normal_v_HS_SC_DE) )) + ggtitle("PB31 Normal vs Heat Shock") ``` Great, there are genes which we can correct. We can now remove these from the original DE genes list. NB: This is a primitive approach, and there are likely many other and better ways to calculate which genes should be kept or removed. ```{r} PB31_normal_v_HS_corrected_DE <- rownames(PB31_normal_v_HS_BULK_DE[!(rownames(PB31_normal_v_HS_BULK_DE) %in% rownames(PB31_normal_v_HS_SC_DE)), ]) head(PB31_normal_v_HS_corrected_DE) ``` Further downstream analysis can now be performed, as desired by the researcher, for example Gene Ontology term enrichment. ## Session Info ```{r, sessionInfo} sessionInfo() ```