CSOA 0.99.2
CSOA is a tool for scoring gene set signatures in scRNA-seq data. It constructs, for each input gene, a set of cells highly expressing the gene, then assesses all pairs of cell sets for statistical significance. The significant overlaps are ranked, and the top-ranked overlaps receive scores used to calculate per-cell CSOA scores.
To install CSOA, run the following commands in an R session:
if (!require("BiocManager", quietly=TRUE))
install.packages("BiocManager")
BiocManager::install("CSOA")
In addition to CSOA, you need to install patchwork, scRNAseq and scuttle for this tutorial.
This tutorial uses an scRNA-seq human pancreas dataset with provided cell type annotations.
After loading the required packages, download the dataset using the
BaronPancreasData function from scRNAseq. The dataset will be stored as a SingleCellExperiment object.
CSOA requires the input scRNA-seq data to be normalized and log-transformed.
We will employ the logNormCounts function from scuttle for this step.
library(CSOA)
library(ggplot2)
library(patchwork)
library(scRNAseq)
library(scuttle)
library(Seurat)
sceObj <- BaronPancreasData('human')
sceObj <- logNormCounts(sceObj)
We can take a look at the cell types found in this dataset:
table(colData(sceObj)$label)
#>
#> acinar activated_stellate alpha beta
#> 958 284 2326 2525
#> delta ductal endothelial epsilon
#> 601 1077 252 18
#> gamma macrophage mast quiescent_stellate
#> 255 55 25 173
#> schwann t_cell
#> 13 7
Next, we will define gene sets of acinar, alpha, ductal and gamma markers based on PanglaoDB. The gene sets must be provided as a named list of character vectors. The names of the list will be used as column names when storing the results.
acinarMarkers <- c('PRSS1', 'KLK1', 'CTRC', 'PNLIP', 'AKR1C3', 'CTRB1',
'DUOXA2', 'ALDOB', 'REG3A', 'SERPINA3', 'PRSS3', 'REG1B',
'CFB', 'GDF15', 'MUC1','ANPEP', 'ANGPTL4', 'OLFM4',
'GSTA1', 'LGALS2', 'PDZK1IP1', 'RARRES2', 'CXCL17',
'UBD', 'GSTA2', 'LYZ', 'RBPJL', 'PTF1A', 'CELA3A',
'SPINK1', 'ZG16', 'CEL', 'CELA2A', 'CPB1', 'CELA1',
'PNLIPRP1', 'RNASE1', 'AMY2B', 'CPA2','CPA1', 'CELA3B',
'CTRB2', 'PLA2G1B', 'PRSS2', 'CLPS', 'REG1A', 'SYCN')
alphaMarkers <- c('GCG', 'TTR', 'PCSK2', 'FXYD5', 'LDB2', 'MAFB',
'CHGA', 'SCGB2A1', 'GLS', 'FAP', 'DPP4', 'GPR119',
'PAX6', 'NEUROD1', 'LOXL4', 'PLCE1', 'GC', 'KLHL41',
'FEV', 'PTGER3', 'RFX6', 'SMARCA1', 'PGR', 'IRX1',
'UCP2', 'RGS4', 'KCNK16', 'GLP1R', 'ARX', 'POU3F4',
'RESP18', 'PYY', 'SLC38A5', 'TM4SF4', 'CRYBA2', 'SH3GL2',
'PCSK1', 'PRRG2', 'IRX2', 'ALDH1A1','PEMT', 'SMIM24',
'F10', 'SCGN', 'SLC30A8')
ductalMarkers <- c('CFTR', 'SERPINA5', 'SLPI', 'TFF1', 'CFB', 'LGALS4',
'CTSH', 'PERP', 'PDLIM3', 'WFDC2', 'SLC3A1', 'AQP1',
'ALDH1A3', 'VTCN1', 'KRT19', 'TFF2', 'KRT7', 'CLDN4',
'LAMB3', 'TACSTD2', 'CCL2', 'DCDC2','CXCL2', 'CLDN10',
'HNF1B', 'KRT20', 'MUC1', 'ONECUT1', 'AMBP', 'HHEX',
'ANXA4', 'SPP1', 'PDX1', 'SERPINA3', 'GDF15', 'AKR1C3',
'MMP7', 'DEFB1', 'SERPING1', 'TSPAN8', 'CLDN1', 'S100A10',
'PIGR')
gammaMarkers <- c('PPY', 'ABCC9', 'FGB', 'ZNF503', 'MEIS1', 'LMO3', 'EGR3',
'CHN2', 'PTGFR', 'ENTPD2', 'AQP3', 'THSD7A', 'CARTPT',
'ISL1', 'PAX6', 'NEUROD1', 'APOBEC2', 'SEMA3E', 'SLITRK6',
'SERTM1', 'PXK', 'PPY2P', 'ETV1', 'ARX', 'CMTM8', 'SCGB2A1',
'FXYD2', 'SCGN')
geneSets <- list(acinarMarkers, alphaMarkers, ductalMarkers, gammaMarkers)
names(geneSets) <- c('CSOA_acinar', 'CSOA_alpha', 'CSOA_ductal', 'CSOA_gamma')
Before running CSOA, we will convert the SingleCellExperiment object to a
Seurat object in order to employ the VlnPlot function for visualization.
seuratObj <- as.Seurat(sceObj)
#> Found more than one class "package_version" in cache; using the first, from namespace 'SeuratObject'
#> Also defined by 'alabaster.base'
#> Found more than one class "package_version" in cache; using the first, from namespace 'SeuratObject'
#> Also defined by 'alabaster.base'
#> Found more than one class "package_version" in cache; using the first, from namespace 'SeuratObject'
#> Also defined by 'alabaster.base'
#> Found more than one class "package_version" in cache; using the first, from namespace 'SeuratObject'
#> Also defined by 'alabaster.base'
seuratObj <- runCSOA(seuratObj, geneSets)
#> Computing percentile sets...
#> Assessing gene overlaps...
#> 194 overlaps will be used in the calculation of CSOA scores.
#> Normalizing expression matrix by rows...
#> Computing per-cell scores for gene pairs...
#> Computing per-cell gene signature scores...
#> 105 overlaps will be used in the calculation of CSOA scores.
#> Normalizing expression matrix by rows...
#> Computing per-cell scores for gene pairs...
#> Computing per-cell gene signature scores...
#> 248 overlaps will be used in the calculation of CSOA scores.
#> Normalizing expression matrix by rows...
#> Computing per-cell scores for gene pairs...
#> Computing per-cell gene signature scores...
#> 18 overlaps will be used in the calculation of CSOA scores.
#> Normalizing expression matrix by rows...
#> Computing per-cell scores for gene pairs...
#> Computing per-cell gene signature scores...
We can now display the results:
plots <- lapply(names(geneSets), function(x) {
VlnPlot(seuratObj, x, group.by = 'label') + NoLegend() +
theme(axis.text = element_text(size=10),
axis.title.x = element_blank(),
plot.title = element_text (size=11))
})
(plots[[1]] | plots[[2]]) / (plots[[3]] | plots[[4]])
Alternatively, CSOA can be run on a SingleCellExperiment object.
The results will be stored as a column in the object’s colData matrix:
sceObj <- runCSOA(sceObj, geneSets)
#> Computing percentile sets...
#> Warning in percentileSets(geneSetExp, percentile): 1 gene(s) had no top cells
#> at the indicated percentile. These are now excluded from the gene signature.
#> Assessing gene overlaps...
#> 194 overlaps will be used in the calculation of CSOA scores.
#> Normalizing expression matrix by rows...
#> Computing per-cell scores for gene pairs...
#> Computing per-cell gene signature scores...
#> 105 overlaps will be used in the calculation of CSOA scores.
#> Normalizing expression matrix by rows...
#> Computing per-cell scores for gene pairs...
#> Computing per-cell gene signature scores...
#> 248 overlaps will be used in the calculation of CSOA scores.
#> Normalizing expression matrix by rows...
#> Computing per-cell scores for gene pairs...
#> Computing per-cell gene signature scores...
#> 18 overlaps will be used in the calculation of CSOA scores.
#> Normalizing expression matrix by rows...
#> Computing per-cell scores for gene pairs...
#> Computing per-cell gene signature scores...
We verify that the results are identical:
sceObj <- runCSOA(sceObj, geneSets)
#> Warning in percentileSets(geneSetExp, percentile): 1 gene(s) had no top cells
#> at the indicated percentile. These are now excluded from the gene signature.
vapply(names(geneSets),
function(x) identical(seuratObj[[]][[x]], colData(sceObj)[[x]]),
logical(1))
#> CSOA_acinar CSOA_alpha CSOA_ductal CSOA_gamma
#> TRUE TRUE TRUE TRUE
Additionally, CSOA can be run directly on an expression matrix. The expMat
function extracts the log-transformed and normalized expression matrix—data
forSeurat, logcounts for SingleCellExperiment—from the input object,
and converts it to a dense matrix.
CSOA requires
a dense expression matrix, as opposed to the sparse matrix class (dgCMatrix)
used by Seurat and SingleCellExperiment. To satisfy this requirement,
the expMat function will be employed. This function extracts the
geneSetExp <- expMat(seuratObj)
#> Warning in asMethod(object): sparse->dense coercion: allocating vector of size
#> 1.3 GiB
Note: runCSOA also accepts the sparse matrix class used by Seurat and
SingleCellExperiment (dgCMatrix) as input. Internally, runCSOA filters the
dgCMatrix as to contain only the genes present in the input gene sets, then
densifies the matrix using expMat.
When running directly on an expression matrix, runCSOA will return a list in
which the first element is the expression matrix and the second is the data
frame of CSOA scores.
res <- runCSOA(geneSetExp, geneSets)
#> Warning in percentileSets(geneSetExp, percentile): 1 gene(s) had no top cells
#> at the indicated percentile. These are now excluded from the gene signature.
head(res[[2]])
#> CSOA_acinar CSOA_alpha CSOA_ductal CSOA_gamma
#> human1_lib1.final_cell_0001 0.7534719 0.0001117549 0.004928853 0.0000000000
#> human1_lib1.final_cell_0002 0.5009556 0.0008294738 0.023359437 0.0005709145
#> human1_lib1.final_cell_0003 0.5308047 0.0000000000 0.000000000 0.0000000000
#> human1_lib1.final_cell_0004 0.5052516 0.0010344557 0.011338792 0.0036947608
#> human1_lib1.final_cell_0005 0.5371408 0.0011376886 0.008792743 0.0000000000
#> human1_lib1.final_cell_0006 0.3647260 0.0001892905 0.006612631 0.0000000000
The results are identical with those obtained earlier for the Seurat object:
vapply(names(geneSets),
function(x) identical(seuratObj[[]][[x]], res[[2]][[x]]), logical(1))
#> CSOA_acinar CSOA_alpha CSOA_ductal CSOA_gamma
#> TRUE TRUE TRUE TRUE
sessionInfo()
#> R version 4.5.1 Patched (2025-08-23 r88802)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.3 LTS
#>
#> Matrix products: default
#> BLAS: /home/biocbuild/bbs-3.22-bioc/R/lib/libRblas.so
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0 LAPACK version 3.12.0
#>
#> locale:
#> [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
#> [3] LC_TIME=en_GB LC_COLLATE=C
#> [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
#> [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
#> [9] LC_ADDRESS=C LC_TELEPHONE=C
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
#>
#> time zone: America/New_York
#> tzcode source: system (glibc)
#>
#> attached base packages:
#> [1] stats4 stats graphics grDevices utils datasets methods
#> [8] base
#>
#> other attached packages:
#> [1] Seurat_5.3.0 SeuratObject_5.2.0
#> [3] sp_2.2-0 stringr_1.5.2
#> [5] scuttle_1.19.0 scRNAseq_2.23.0
#> [7] SingleCellExperiment_1.31.1 SummarizedExperiment_1.39.2
#> [9] Biobase_2.69.1 GenomicRanges_1.61.5
#> [11] Seqinfo_0.99.2 IRanges_2.43.5
#> [13] S4Vectors_0.47.4 BiocGenerics_0.55.1
#> [15] generics_0.1.4 MatrixGenerics_1.21.0
#> [17] matrixStats_1.5.0 patchwork_1.3.2
#> [19] ggplot2_4.0.0 CSOA_0.99.2
#> [21] BiocStyle_2.37.1
#>
#> loaded via a namespace (and not attached):
#> [1] ProtGenerics_1.41.0 spatstat.sparse_3.1-0 bitops_1.0-9
#> [4] httr_1.4.7 RColorBrewer_1.1-3 tools_4.5.1
#> [7] sctransform_0.4.2 alabaster.base_1.9.5 R6_2.6.1
#> [10] HDF5Array_1.37.0 lazyeval_0.2.2 uwot_0.2.3
#> [13] rhdf5filters_1.21.0 withr_3.0.2 gridExtra_2.3
#> [16] progressr_0.16.0 cli_3.6.5 spatstat.explore_3.5-3
#> [19] fastDummies_1.7.5 labeling_0.4.3 alabaster.se_1.9.0
#> [22] sass_0.4.10 S7_0.2.0 spatstat.data_3.1-8
#> [25] ggridges_0.5.7 pbapply_1.7-4 Rsamtools_2.25.3
#> [28] dichromat_2.0-0.1 parallelly_1.45.1 RSQLite_2.4.3
#> [31] RApiSerialize_0.1.4 BiocIO_1.19.0 ica_1.0-3
#> [34] spatstat.random_3.4-2 dplyr_1.1.4 wesanderson_0.3.7
#> [37] Matrix_1.7-4 ggbeeswarm_0.7.2 abind_1.4-8
#> [40] lifecycle_1.0.4 yaml_2.3.10 rhdf5_2.53.5
#> [43] SparseArray_1.9.1 BiocFileCache_2.99.6 Rtsne_0.17
#> [46] grid_4.5.1 blob_1.2.4 promises_1.3.3
#> [49] ExperimentHub_2.99.5 crayon_1.5.3 miniUI_0.1.2
#> [52] lattice_0.22-7 beachmat_2.25.5 cowplot_1.2.0
#> [55] GenomicFeatures_1.61.6 KEGGREST_1.49.1 poibin_1.6
#> [58] magick_2.9.0 pillar_1.11.1 knitr_1.50
#> [61] rjson_0.2.23 bayesbio_1.0.0 future.apply_1.20.0
#> [64] codetools_0.2-20 glue_1.8.0 spatstat.univar_3.1-4
#> [67] data.table_1.17.8 vctrs_0.6.5 png_0.1-8
#> [70] gypsum_1.5.0 spam_2.11-1 gtable_0.3.6
#> [73] kernlab_0.9-33 cachem_1.1.0 xfun_0.53
#> [76] S4Arrays_1.9.1 mime_0.13 tidygraph_1.3.1
#> [79] survival_3.8-3 tinytex_0.57 fitdistrplus_1.2-4
#> [82] ROCR_1.0-11 nlme_3.1-168 bit64_4.6.0-1
#> [85] alabaster.ranges_1.9.1 filelock_1.0.3 RcppAnnoy_0.0.22
#> [88] GenomeInfoDb_1.45.12 bslib_0.9.0 irlba_2.3.5.1
#> [91] vipor_0.4.7 KernSmooth_2.23-26 DBI_1.2.3
#> [94] ggrastr_1.0.2 tidyselect_1.2.1 bit_4.6.0
#> [97] compiler_4.5.1 curl_7.0.0 httr2_1.2.1
#> [100] h5mread_1.1.1 textshape_1.7.5 DelayedArray_0.35.3
#> [103] plotly_4.11.0 stringfish_0.17.0 bookdown_0.45
#> [106] rtracklayer_1.69.1 scales_1.4.0 lmtest_0.9-40
#> [109] ggeasy_0.1.6 sgof_2.3.5 rappdirs_0.3.3
#> [112] digest_0.6.37 goftest_1.2-3 spatstat.utils_3.2-0
#> [115] alabaster.matrix_1.9.0 rmarkdown_2.30 XVector_0.49.1
#> [118] htmltools_0.5.8.1 pkgconfig_2.0.3 ensembldb_2.33.2
#> [121] dbplyr_2.5.1 fastmap_1.2.0 UCSC.utils_1.5.0
#> [124] rlang_1.1.6 htmlwidgets_1.6.4 shiny_1.11.1
#> [127] farver_2.1.2 jquerylib_0.1.4 zoo_1.8-14
#> [130] jsonlite_2.0.0 BiocParallel_1.43.4 RCurl_1.98-1.17
#> [133] magrittr_2.0.4 dotCall64_1.2 Rhdf5lib_1.31.0
#> [136] Rcpp_1.1.0 kerntools_1.2.0 ggnewscale_0.5.2
#> [139] viridis_0.6.5 reticulate_1.43.0 stringi_1.8.7
#> [142] alabaster.schemas_1.9.0 ggraph_2.2.2 MASS_7.3-65
#> [145] AnnotationHub_3.99.6 plyr_1.8.9 parallel_4.5.1
#> [148] listenv_0.9.1 ggrepel_0.9.6 deldir_2.0-4
#> [151] Biostrings_2.77.2 graphlayouts_1.2.2 splines_4.5.1
#> [154] tensor_1.5.1 igraph_2.1.4 spatstat.geom_3.6-0
#> [157] RcppHNSW_0.6.0 reshape2_1.4.4 BiocVersion_3.22.0
#> [160] XML_3.99-0.19 evaluate_1.0.5 RcppParallel_5.1.11-1
#> [163] BiocManager_1.30.26 tweenr_2.0.3 httpuv_1.6.16
#> [166] RANN_2.6.2 tidyr_1.3.1 purrr_1.1.0
#> [169] polyclip_1.10-7 alabaster.sce_1.9.0 qs_0.27.3
#> [172] future_1.67.0 scattermore_1.2 ggforce_0.5.0
#> [175] xtable_1.8-4 AnnotationFilter_1.33.0 restfulr_0.0.16
#> [178] RSpectra_0.16-2 later_1.4.4 viridisLite_0.4.2
#> [181] tibble_3.3.0 beeswarm_0.4.0 memoise_2.0.1
#> [184] AnnotationDbi_1.71.1 GenomicAlignments_1.45.4 cluster_2.1.8.1
#> [187] globals_0.18.0