--- title: "Multi-Resolution Compositional Analysis in scRNA-seq: Reference Frames with SETA" date: "`r BiocStyle::doc_date()`" author: - name: Kyle Kimler affiliation: - &CDN Cell Discovery Network - &BCH Boston Children's Hospital - &UVIC-UCC University of Vic - Central University of Catalonia email: kkimler@broadinstitute.org - name: Marc Elosua-Bayes affiliation: - &CDN Cell Discovery Network - &BCH Boston Children's Hospital email: marc.elosuabayes@childrens.harvard.edu package: "`r BiocStyle::pkg_ver('SETA')`" vignette: > %\VignetteEncoding{UTF-8} %\VignetteIndexEntry{Multi-Resolution Compositional Analysis in scRNA-seq: Reference Frames with SETA} %\VignetteEngine{knitr::rmarkdown} output: BiocStyle::html_document editor_options: markdown: wrap: 80 chunk_output_type: console --- ```{=html} ``` ## Introduction `SETA` provides a set of functions for compositional analysis of single-cell RNA-seq data. In earlier vignettes, we introduced the basics of compositional transforms (`setaCLR`, `setaALR`, etc.) and distance/metadata handling. In this vignette, we demonstrate how to create and utilize _multi-resolution_ annotations, here referred to as **reference frames**, for hierarchical or subcompositional analysis. Our outline: - **Extract a taxonomic data frame** of differing cell-type resolutions - **Visualize the taxonomy as a tree** using `ggraph` - **Transform counts data with a chosen taxonomic reference frame** - **Compare PCA latent spaces** based on different reference frames This example works with the Tabula Muris Senis lung dataset which can be downloaded directly with bioconductor It contains lung single-cell profiles from 14 donor mice (mouse_id). Each mouse is assigned to one of five age groups, from young adult to old, recorded in the age column. Standard cell-type labels (free_annotation) and additional metadata (e.g. sex) are also included. ## Installation ```{r installation, eval=FALSE} if (!require("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install("SETA") ``` ```{r load packages, message=FALSE, warning=FALSE} library(SingleCellExperiment) library(SETA) library(ggplot2) library(ggraph) library(tidygraph) library(dplyr) library(ggplot2) library(patchwork) library(TabulaMurisSenisData) ``` ```{r palettes} # Set up a color palette for plots # 9-group distinct categoricals d_palette <- c("#3B9AB2", "#6BAFAF", "#9BBDAC", "#C9C171", "#EBCC2A", "#E6A80F", "#E98903", "#F84C00", "#F21A00", "#B10026", "#67000D") ``` ## Load and prepare data ```{r load data, echo = FALSE} sce <- TabulaMurisSenisDroplet(tissues = "Lung")$Lung sce <- sce[, colData(sce)$subtissue != "immune-endo-depleted"] sce$mouse.id <- gsub("/", "", sce$mouse.id) ``` Quick data exploration: ```{r tabular data exploration} table(sce$free_annotation, sce$age) ``` ## Creating a Taxonomic Data Frame Below, we define a custom lineage column that lumps certain cell types together: All Monocyte variants → Monocytes All CD4/CD8 variants → T cells All B cell variants → B cells Everything else remains as is. ```{r add lineage information} # We'll make a small mapping from original free_annotation -> broader 'Lineage' lineage_map <- c( "Adventitial Fibroblast" = "Fibroblast", "Airway Smooth Muscle" = "SMC", "Alveolar Epithelial Type 2" = "Epithelial", "Alveolar Fibroblast" = "Fibroblast", "Alveolar Macrophage" = "Myeloid", "Artery" = "Endothelial", "B" = "B cell", "Basophil" = "Granulocyte", "Ccr7+ Dendritic" = "Myeloid", "CD4+ T" = "T cell", "CD8+ T" = "T cell", "Capillary" = "Endothelial", "Capillary Aerocyte" = "Endothelial", "Ciliated" = "Epithelial", "Classical Monocyte" = "Myeloid", "Club" = "Epithelial", "Intermediate Monocyte" = "Myeloid", "Interstitial Macrophage" = "Myeloid", "Lympatic" = "Endothelial", "Ly6g5b+ T" = "T cell", "Myeloid Dendritic Type 1" = "Myeloid", "Myeloid Dendritic Type 2" = "Myeloid", "Myofibroblast" = "Fibroblast", "Natural Killer" = "NK", "Natural Killer T" = "NK", "Neuroendocrine" = "Neuroendocrine", "Neutrophil" = "Granulocyte", "Nonclassical Monocyte" = "Myeloid", "Pericyte" = "Endothelial", "Plasma" = "Plasma Cell", "Plasmacytoid Dendritic" = "Myeloid", "Proliferating Alveolar Macrophage" = "Proliferating", "Proliferating Classical Monocyte" = "Proliferating", "Proliferating Dendritic" = "Proliferating", "Proliferating NK" = "Proliferating", "Proliferating T" = "Proliferating", "Regulatory T" = "T cell", "Vein" = "Endothelial", "Zbtb32+ B" = "B cell" ) # Add a new column in named 'Lineage' free_annotations <- as.character(sce$free_annotation) sce$Lineage <- lineage_map[free_annotations] ``` Now we can build a taxonomic data frame that has two levels: the fine label (free_annotation) and the broad label (Lineage). ```{r create taxonomy DF} # In a Seurat Object, barcodes are in the rownames of the metadata. # Similar to setaCounts, taxonomy DF creation can use rownames as bc. # Then we ensure we have columns for 'free_annotation' and 'Lineage'. print(head(sce$free_annotation)) print(head(sce$Lineage)) # We want a data frame that includes bc + 'free_annotation' + 'Lineage' # Then we'll pass it to setaTaxonomyDF tax_df <- setaTaxonomyDF( obj = data.frame(colData(sce)), resolution_cols = c("Lineage", "free_annotation"), bc_col = "cell" ) # Each row is a unique 'free_annotation'. # The last column 'free_annotation' is the finer label. # cols must be entered in order of increasing resolution (broadest to finest) tax_df ``` ## Visualize the Taxonomy as a Tree via ggraph We now have a 2-level taxonomy: free_annotation (fine) -> Lineage (coarse). For demonstration, let's build a small graph that connects each free_annotation node to its Lineage node. We'll color edges by the lineage label: ```{r ggraph, message=FALSE, warning=FALSE, fig.width = 8, fig.height = 6} # Create a tidygraph object using SETA built-in utils tbl_g <- taxonomy_to_tbl_graph( tax_df, columns = c("Lineage", "free_annotation")) # Plot with ggraph ggraph(tbl_g, layout = "dendrogram") + geom_edge_diagonal2(aes(color = node.Lineage)) + geom_node_text(aes(filter = leaf, label = free_annotation, color = Lineage), nudge_y = 0.1, vjust = 0.5, hjust = 0, size = 5) + geom_node_text(aes(filter = !leaf, label = Lineage), color = 'black', size = 5, repel = TRUE) + guides(edge_colour = guide_legend(title = "Lineage"), color = guide_legend(title = "Lineage")) + theme_linedraw(base_size = 16) + scale_edge_color_manual(values = d_palette) + scale_color_manual(values = d_palette) + scale_y_reverse(breaks = seq(0, 2, by = 1), labels = c("Type", "Lineage", "Root")) + theme(axis.text.y = element_blank()) + expand_limits(y = -5) + coord_flip() + ggtitle("SETA: Taxonomy") ``` Here, each Lineage is a parent node, and each free_annotation is a leaf node. The edges are colored by the lineage name. ## Taxonomic Balances with SETA Balances can be calculated between clades and species among the taxonomic tree. Currently, SETA supports user-specified balances with the transform "balance", where balances are calculated as log-ratios between these clades or species. The choice of numerator and denominator groups determine which direction the resulting balance points. If the composition is weighted towards the numerator, the log-ratio will be positive, and negative if towards the denominator. ```{r counts and metadata} # Create sample x free_annotation counts: taxa_counts <- setaCounts( data.frame(colData(sce)), cell_type_col = "free_annotation", sample_col = "mouse.id", bc_col = "rownames" ) # Create SETA sample-level metadata meta_df <- setaMetadata(data.frame(colData(sce)), sample_col = "mouse.id", meta_cols = c("age", "sex")) bal_out <- setaTransform( counts = taxa_counts, method = "balance", balances = list( epi_vs_myeloid = list( num = "Epithelial", denom = "Myeloid" ) ), taxonomyDF = tax_df, taxonomy_col = "Lineage" ) # merge the balances with sample-level metadata bal_df <- data.frame( sample_id = rownames(bal_out$counts), epi_vs_myeloid = as.numeric(bal_out$counts[, "epi_vs_myeloid"]), stringsAsFactors = FALSE ) |> dplyr::left_join(meta_df, by = "sample_id") # Compare age groups by epithelial to myeloid ratios library(ggplot2) ggplot(bal_df, aes(x = age, y = epi_vs_myeloid, fill = age)) + geom_boxplot(outlier.shape = NA, alpha = 0.4) + geom_jitter(width = 0.15, size = 2) + scale_fill_manual(values = d_palette) + labs(title = "Epithelial / Myeloid balance across age groups", y = "log GM(Epithelial) - log GM(Myeloid)", x = "Age (months)") + theme_minimal(base_size = 14) + theme(legend.position = "none") ``` The y-axis shows the log-ratio of the geometric means of Epithelial versus Myeloid counts for each mouse. Positive values indicate epithelial enrichment relative to myeloid cells, while negative values indicate the opposite. Because this is a standard log-ratio, the statistic is fully compositional and can be compared directly across groups. ## Transform Counts with a Taxonomic Reference Frame We can now use setaCounts to get the sample × free_annotation matrix, then apply a within-lineage transform. This lumps certain columns (cell types) into subcompositions. For example, a CLR transform within each lineage: ```{r reference frame calculation} # We'll transform them with a 'Lineage' grouping. # The taxonomyDF uses rownames = free_annotation # so that colnames(taxa_counts) align with rownames(tax_df). refframe_out <- setaTransform( counts = taxa_counts, method = "CLR", taxonomyDF = tax_df, taxonomy_col = "Lineage", within_resolution = TRUE ) refframe_out$within_resolution refframe_out$grouping_var # Compare to a standard global CLR transform: global_out <- setaTransform(taxa_counts, method = "CLR") ``` ## Visualize Latent Spaces With Different Reference Frames Below we show how to do PCA on the sample-level coordinates from either the global CLR or the lineage-based CLR. We color points by age to see if the subcompositional approach changes clustering. ```{r latent reference frames, fig.width = 10, fig.height = 6} # A) Global CLR latent_global <- setaLatent(global_out, method = "PCA", dims = 2) pca_global <- latent_global$latentSpace pca_global$sample_id <- rownames(pca_global) # B) Within-Lineage CLR latent_lineage <- setaLatent(refframe_out, method = "PCA", dims = 2) pca_lineage <- latent_lineage$latentSpace pca_lineage$sample_id <- rownames(pca_lineage) # Merge with metadata pca_global <- left_join(pca_global, meta_df) pca_lineage <- left_join(pca_lineage, meta_df) # Plot side by side p1 <- ggplot(pca_global, aes(x = PC1, y = PC2, color = age)) + geom_text(aes(label = sample_id)) + scale_color_manual( values = d_palette ) + labs(title = "Global CLR PCA", x = "PC1", y = "PC2", color = "Age (Months)") + theme_minimal() + xlab(sprintf("PC1 (%s%%)", signif(latent_global$varExplained[1], 4) * 100)) + ylab(sprintf("PC2 (%s%%)", signif(latent_global$varExplained[2], 4) * 100)) + theme(legend.position = 'none') p2 <- ggplot(pca_lineage, aes(x = PC1, y = PC2, color = age)) + geom_text(aes(label = sample_id)) + scale_color_manual( values = d_palette ) + labs(title = "Within-Lineage CLR PCA", x = "PC1", y = "PC2", color = "Age (Months)") + theme_minimal() + xlab(sprintf("PC1 (%s%%)", signif(latent_lineage$varExplained[1], 4) * 100)) + ylab(sprintf("PC2 (%s%%)", signif(latent_lineage$varExplained[2], 4) * 100)) p1 + p2 ``` ```{r libraries used} sessionInfo() ```