scrapper 1.5.1
scrapper implements bindings to C++ code for analyzing single-cell data, mostly from the libscran libraries. Each function performs an individual analysis step ranging from quality control to clustering and marker detection. scrapper is mostly intended for other Bioconductor package developers; users should check out the scrapple package instead for a more ergonomic experience.
Let’s fetch a small single-cell RNA-seq dataset for demonstration purposes:
library(scRNAseq)
sce.z <- ZeiselBrainData()
sce.z
## class: SingleCellExperiment
## dim: 20006 3005
## metadata(0):
## assays(1): counts
## rownames(20006): Tspan12 Tshz1 ... mt-Rnr1 mt-Nd4l
## rowData names(1): featureType
## colnames(3005): 1772071015_C02 1772071017_G12 ... 1772066098_A12
## 1772058148_F03
## colData names(9): tissue group # ... level1class level2class
## reducedDimNames(0):
## mainExpName: gene
## altExpNames(2): repeat ERCC
We run it through the scrapper analysis pipeline. First, some quality control:
counts <- assay(sce.z)
is.mito <- grepl("^mt-", rownames(sce.z))
nthreads <- 2 # using a smaller value to avoid stressing out the build machines.
library(scrapper)
rna.qc.metrics <- computeRnaQcMetrics(counts, subsets=list(mt=is.mito), num.threads=nthreads)
rna.qc.thresholds <- suggestRnaQcThresholds(rna.qc.metrics)
rna.qc.filter <- filterRnaQcMetrics(rna.qc.thresholds, rna.qc.metrics)
filtered <- counts[,rna.qc.filter,drop=FALSE]
Then normalization:
size.factors <- centerSizeFactors(rna.qc.metrics$sum[rna.qc.filter])
normalized <- normalizeCounts(filtered, size.factors)
Feature selection and PCA:
gene.var <- modelGeneVariances(normalized, num.threads=nthreads)
top.hvgs <- chooseHighlyVariableGenes(gene.var$statistics$residuals)
pca <- runPca(normalized[top.hvgs,], num.threads=nthreads)
Some clustering:
snn.graph <- buildSnnGraph(pca$components, num.threads=nthreads)
clust.out <- clusterGraph(snn.graph)
table(clust.out$membership)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 304 201 510 192 212 229 202 169 193 177 235 34 79 137
And visualization:
umap.out <- runUmap(pca$components, num.threads=nthreads)
tsne.out <- runTsne(pca$components, num.threads=nthreads)
plot(tsne.out[,1], tsne.out[,2], col=factor(clust.out$membership))
And finally marker detection:
markers <- scoreMarkers(normalized, groups=clust.out$membership, num.threads=nthreads)
# Top markers for each cluster, say, based on the median AUC:
top.markers <- lapply(markers$auc, function(x) {
head(rownames(x)[order(x$median, decreasing=TRUE)], 10)
})
head(top.markers)
## $`1`
## [1] "Gad1" "Gad2" "Ndrg4" "Stmn3" "Tspyl4" "Scg5" "Snap25" "Syp"
## [9] "Slc6a1" "Atp1a3"
##
## $`2`
## [1] "Stmn3" "Calm1" "Scg5" "Chn1" "Pcsk2" "Snap25" "Calm2" "Mdh1"
## [9] "Mef2c" "Nme1"
##
## $`3`
## [1] "Stmn3" "Chn1" "Crym" "3110035E14Rik"
## [5] "Calm1" "Neurod6" "Calm2" "Hpca"
## [9] "Ppp3r1" "Ppp3ca"
##
## $`4`
## [1] "Ywhah" "Syp" "Ndrg4" "Snap25" "Rab3a" "Chn1" "Vsnl1" "Stmn3"
## [9] "Rtn1" "Pcsk2"
##
## $`5`
## [1] "Ppp3ca" "Atp1b1" "Tspan13" "Rtn1" "Hpca" "Gria1" "Nsg2"
## [8] "Chn1" "Syp" "Wasf1"
##
## $`6`
## [1] "Plp1" "Mbp" "Cnp" "Mog" "Mobp" "Ugt8a" "Cldn11" "Mal"
## [9] "Sept4" "Ermn"
Readers are referred to the OSCA book for more details on the theory behind each step.
Let’s fetch another brain dataset and combine it with the previous one.
sce.t <- TasicBrainData()
common <- intersect(rownames(sce.z), rownames(sce.t))
combined <- cbind(assay(sce.t)[common,], assay(sce.z)[common,])
block <- rep(c("tasic", "zeisel"), c(ncol(sce.t), ncol(sce.z)))
We set block= in each step to account for the batch structure.
This ensures that the various calculations are not affected by inter-block differences.
It also uses MNN correction to batch effects in the low-dimensional space prior to further analyses.
library(scrapper)
rna.qc.metrics <- computeRnaQcMetrics(combined, subsets=list(), num.threads=nthreads)
rna.qc.thresholds <- suggestRnaQcThresholds(rna.qc.metrics, block=block)
rna.qc.filter <- filterRnaQcMetrics(rna.qc.thresholds, rna.qc.metrics, block=block)
filtered <- combined[,rna.qc.filter,drop=FALSE]
filtered.block <- block[rna.qc.filter]
size.factors <- centerSizeFactors(rna.qc.metrics$sum[rna.qc.filter], block=filtered.block)
normalized <- normalizeCounts(filtered, size.factors)
gene.var <- modelGeneVariances(normalized, num.threads=nthreads, block=filtered.block)
top.hvgs <- chooseHighlyVariableGenes(gene.var$statistics$residuals)
pca <- runPca(normalized[top.hvgs,], num.threads=nthreads, block=filtered.block)
# Now we do a MNN correction to get rid of the batch effect.
corrected <- correctMnn(pca$components, block=filtered.block, num.threads=nthreads)
umap.out <- runUmap(corrected$corrected, num.threads=nthreads)
tsne.out <- runTsne(corrected$corrected, num.threads=nthreads)
plot(tsne.out[,1], tsne.out[,2], pch=16, col=factor(filtered.block))
snn.graph <- buildSnnGraph(corrected$corrected, num.threads=nthreads)
clust.out <- clusterGraph(snn.graph)
markers <- scoreMarkers(normalized, groups=clust.out$membership, block=filtered.block, num.threads=nthreads)
We can also compute pseudo-bulk profiles for each cluster-dataset combination, e.g., for differential expression analyses.
aggregates <- aggregateAcrossCells(filtered, list(cluster=clust.out$membership, dataset=filtered.block))
Let’s fetch a single-cell dataset with both RNA-seq and CITE-seq data. To keep the run-time short, we’ll only consider the first 5000 cells.
sce.s <- StoeckiusHashingData(mode=c("human", "adt1", "adt2"))
sce.s <- sce.s[,1:5000]
sce.s
## class: SingleCellExperiment
## dim: 27679 5000
## metadata(0):
## assays(1): counts
## rownames(27679): A1BG A1BG-AS1 ... hsa-mir-8072 snoU2-30
## rowData names(0):
## colnames(5000): TGCCAAATCTCTAAGG ACTGCTCAGGTGTTAA ... CGCTATCGTCCGTCAG
## GGCGACTGTAAGGGAA
## colData names(0):
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(2): adt1 adt2
We extract the counts for both the RNA and the ADTs.
rna.counts <- assay(sce.s)
adt.counts <- rbind(assay(altExp(sce.s, "adt1")), assay(altExp(sce.s, "adt2")))
We use both matrices in our analysis pipeline. Each modality is processed separately before being combined for steps like clustering and visualization.
# QC in both modalities, only keeping the cells that pass in both.
is.mito <- grepl("^MT-", rownames(rna.counts))
rna.qc.metrics <- computeRnaQcMetrics(rna.counts, subsets=list(MT=is.mito), num.threads=nthreads)
rna.qc.thresholds <- suggestRnaQcThresholds(rna.qc.metrics)
rna.qc.filter <- filterRnaQcMetrics(rna.qc.thresholds, rna.qc.metrics)
is.igg <- grepl("^IgG", rownames(adt.counts))
adt.qc.metrics <- computeAdtQcMetrics(adt.counts, subsets=list(IgG=is.igg), num.threads=nthreads)
adt.qc.thresholds <- suggestAdtQcThresholds(adt.qc.metrics)
adt.qc.filter <- filterAdtQcMetrics(adt.qc.thresholds, adt.qc.metrics)
keep <- rna.qc.filter & adt.qc.filter
rna.filtered <- rna.counts[,keep,drop=FALSE]
adt.filtered <- adt.counts[,keep,drop=FALSE]
rna.size.factors <- centerSizeFactors(rna.qc.metrics$sum[keep])
rna.normalized <- normalizeCounts(rna.filtered, rna.size.factors)
adt.size.factors <- computeClrm1Factors(adt.filtered, num.threads=nthreads)
adt.size.factors <- centerSizeFactors(adt.size.factors)
adt.normalized <- normalizeCounts(adt.filtered, adt.size.factors)
gene.var <- modelGeneVariances(rna.normalized, num.threads=nthreads)
top.hvgs <- chooseHighlyVariableGenes(gene.var$statistics$residuals)
rna.pca <- runPca(rna.normalized[top.hvgs,], num.threads=nthreads)
# Combining the RNA-derived PCs with ADT expression. Here, there's very few ADT
# tags so there's no need for further dimensionality reduction.
combined <- scaleByNeighbors(list(rna.pca$components, as.matrix(adt.normalized)), num.threads=nthreads)
snn.graph <- buildSnnGraph(combined$combined, num.threads=nthreads)
clust.out <- clusterGraph(snn.graph)
table(clust.out$membership)
##
## 1 2 3 4 5 6 7 8 9 10 11 12
## 308 374 710 386 166 113 372 623 701 304 109 85
umap.out <- runUmap(combined$combined, num.threads=nthreads)
tsne.out <- runTsne(combined$combined, num.threads=nthreads)
plot(umap.out[,1], umap.out[,2], pch=16, col=factor(clust.out$membership))
rna.markers <- scoreMarkers(rna.normalized, groups=clust.out$membership, num.threads=nthreads)
adt.markers <- scoreMarkers(adt.normalized, groups=clust.out$membership, num.threads=nthreads)
The runAllNeighborSteps() will run runUmap(), runTsne(), buildSnnGraph() and clusterGraph() in a single call.
This runs the UMAP/t-SNE iterations and the clustering in parallel to maximize use of multiple threads.
The scoreGeneSet() function will compute a gene set score based on the input expression matrix.
This can be used to summarize the activity of pathways into a single per-cell score for visualization.
The subsampleByNeighbors() function will deterministically select a representative subset of cells based on their local neighborhood.
This can be used to reduce the compute time of the various steps downstream of the PCA.
For CRISPR data, quality control can be performed using computeCrisprQcMetrics(), suggestCrisprQcThresholds() and filterCrisprQcMetrics().
To normalize, we use size factors defined by centering the total sum of guide counts for each cell.
sessionInfo()
## R Under development (unstable) (2025-10-20 r88955)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.3 LTS
##
## Matrix products: default
## BLAS: /home/biocbuild/bbs-3.23-bioc/R/lib/libRblas.so
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0 LAPACK version 3.12.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB LC_COLLATE=C
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: America/New_York
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] scrapper_1.5.1 scRNAseq_2.25.0
## [3] SingleCellExperiment_1.33.0 SummarizedExperiment_1.41.0
## [5] Biobase_2.71.0 GenomicRanges_1.63.0
## [7] Seqinfo_1.1.0 IRanges_2.45.0
## [9] S4Vectors_0.49.0 BiocGenerics_0.57.0
## [11] generics_0.1.4 MatrixGenerics_1.23.0
## [13] matrixStats_1.5.0 knitr_1.50
## [15] BiocStyle_2.39.0
##
## loaded via a namespace (and not attached):
## [1] DBI_1.2.3 bitops_1.0-9 httr2_1.2.1
## [4] rlang_1.1.6 magrittr_2.0.4 gypsum_1.7.0
## [7] compiler_4.6.0 RSQLite_2.4.3 GenomicFeatures_1.63.1
## [10] png_0.1-8 vctrs_0.6.5 ProtGenerics_1.43.0
## [13] pkgconfig_2.0.3 crayon_1.5.3 fastmap_1.2.0
## [16] magick_2.9.0 dbplyr_2.5.1 XVector_0.51.0
## [19] Rsamtools_2.27.0 rmarkdown_2.30 UCSC.utils_1.7.0
## [22] purrr_1.1.0 tinytex_0.57 bit_4.6.0
## [25] xfun_0.54 cachem_1.1.0 beachmat_2.27.0
## [28] cigarillo_1.1.0 GenomeInfoDb_1.47.0 jsonlite_2.0.0
## [31] blob_1.2.4 rhdf5filters_1.23.0 DelayedArray_0.37.0
## [34] Rhdf5lib_1.33.0 BiocParallel_1.45.0 parallel_4.6.0
## [37] R6_2.6.1 bslib_0.9.0 rtracklayer_1.71.0
## [40] jquerylib_0.1.4 Rcpp_1.1.0 bookdown_0.45
## [43] Matrix_1.7-4 tidyselect_1.2.1 abind_1.4-8
## [46] yaml_2.3.10 codetools_0.2-20 curl_7.0.0
## [49] lattice_0.22-7 alabaster.sce_1.11.0 tibble_3.3.0
## [52] withr_3.0.2 KEGGREST_1.51.0 evaluate_1.0.5
## [55] BiocFileCache_3.1.0 alabaster.schemas_1.11.0 ExperimentHub_3.1.0
## [58] Biostrings_2.79.1 pillar_1.11.1 BiocManager_1.30.26
## [61] filelock_1.0.3 RCurl_1.98-1.17 BiocVersion_3.23.1
## [64] ensembldb_2.35.0 alabaster.base_1.11.1 glue_1.8.0
## [67] alabaster.ranges_1.11.0 alabaster.matrix_1.11.0 lazyeval_0.2.2
## [70] tools_4.6.0 AnnotationHub_4.1.0 BiocIO_1.21.0
## [73] BiocNeighbors_2.5.0 GenomicAlignments_1.47.0 XML_3.99-0.19
## [76] rhdf5_2.55.4 grid_4.6.0 AnnotationDbi_1.73.0
## [79] HDF5Array_1.39.0 restfulr_0.0.16 cli_3.6.5
## [82] rappdirs_0.3.3 S4Arrays_1.11.0 dplyr_1.1.4
## [85] AnnotationFilter_1.35.0 alabaster.se_1.11.0 sass_0.4.10
## [88] digest_0.6.37 SparseArray_1.11.1 rjson_0.2.23
## [91] memoise_2.0.1 htmltools_0.5.8.1 lifecycle_1.0.4
## [94] h5mread_1.3.0 httr_1.4.7 bit64_4.6.0-1