Chapter 10 Nestorowa mouse HSC (Smart-seq2)
10.1 Introduction
This performs an analysis of the mouse haematopoietic stem cell (HSC) dataset generated with Smart-seq2 (Nestorowa et al. 2016).
10.2 Data loading
library(AnnotationHub)
ens.mm.v97 <- AnnotationHub()[["AH73905"]]
anno <- select(ens.mm.v97, keys=rownames(sce.nest),
keytype="GENEID", columns=c("SYMBOL", "SEQNAME"))
rowData(sce.nest) <- anno[match(rownames(sce.nest), anno$GENEID),]
After loading and annotation, we inspect the resulting SingleCellExperiment
object:
## class: SingleCellExperiment
## dim: 46078 1920
## metadata(0):
## assays(1): counts
## rownames(46078): ENSMUSG00000000001 ENSMUSG00000000003 ...
## ENSMUSG00000107391 ENSMUSG00000107392
## rowData names(3): GENEID SYMBOL SEQNAME
## colnames(1920): HSPC_007 HSPC_013 ... Prog_852 Prog_810
## colData names(9): gate broad ... projected metrics
## reducedDimNames(1): diffusion
## mainExpName: endogenous
## altExpNames(2): ERCC FACS
10.3 Quality control
For some reason, no mitochondrial transcripts are available, so we will perform quality control using the spike-in proportions only.
library(scater)
stats <- perCellQCMetrics(sce.nest)
qc <- quickPerCellQC(stats, percent_subsets="altexps_ERCC_percent")
sce.nest <- sce.nest[,!qc$discard]
We examine the number of cells discarded for each reason.
## low_lib_size low_n_features high_altexps_ERCC_percent
## 146 28 241
## discard
## 264
We create some diagnostic plots for each metric (Figure 10.1).
colData(unfiltered) <- cbind(colData(unfiltered), stats)
unfiltered$discard <- qc$discard
gridExtra::grid.arrange(
plotColData(unfiltered, y="sum", colour_by="discard") +
scale_y_log10() + ggtitle("Total count"),
plotColData(unfiltered, y="detected", colour_by="discard") +
scale_y_log10() + ggtitle("Detected features"),
plotColData(unfiltered, y="altexps_ERCC_percent",
colour_by="discard") + ggtitle("ERCC percent"),
ncol=2
)
10.4 Normalization
library(scran)
set.seed(101000110)
clusters <- quickCluster(sce.nest)
sce.nest <- computeSumFactors(sce.nest, clusters=clusters)
sce.nest <- logNormCounts(sce.nest)
We examine some key metrics for the distribution of size factors, and compare it to the library sizes as a sanity check (Figure 10.2).
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.039 0.420 0.743 1.000 1.249 16.789
plot(librarySizeFactors(sce.nest), sizeFactors(sce.nest), pch=16,
xlab="Library size factors", ylab="Deconvolution factors", log="xy")
10.5 Variance modelling
We use the spike-in transcripts to model the technical noise as a function of the mean (Figure 10.3).
set.seed(00010101)
dec.nest <- modelGeneVarWithSpikes(sce.nest, "ERCC")
top.nest <- getTopHVGs(dec.nest, prop=0.1)
plot(dec.nest$mean, dec.nest$total, pch=16, cex=0.5,
xlab="Mean of log-expression", ylab="Variance of log-expression")
curfit <- metadata(dec.nest)
curve(curfit$trend(x), col='dodgerblue', add=TRUE, lwd=2)
points(curfit$mean, curfit$var, col="red")
10.6 Dimensionality reduction
set.seed(101010011)
sce.nest <- denoisePCA(sce.nest, technical=dec.nest, subset.row=top.nest)
sce.nest <- runTSNE(sce.nest, dimred="PCA")
We check that the number of retained PCs is sensible.
## [1] 9
10.7 Clustering
snn.gr <- buildSNNGraph(sce.nest, use.dimred="PCA")
colLabels(sce.nest) <- factor(igraph::cluster_walktrap(snn.gr)$membership)
##
## 1 2 3 4 5 6 7 8 9 10
## 198 319 208 147 221 182 21 209 74 77
10.8 Marker gene detection
markers <- findMarkers(sce.nest, colLabels(sce.nest),
test.type="wilcox", direction="up", lfc=0.5,
row.data=rowData(sce.nest)[,"SYMBOL",drop=FALSE])
To illustrate the manual annotation process, we examine the marker genes for one of the clusters. Upregulation of Car2, Hebp1 amd hemoglobins indicates that cluster 10 contains erythroid precursors.
chosen <- markers[['10']]
best <- chosen[chosen$Top <= 10,]
aucs <- getMarkerEffects(best, prefix="AUC")
rownames(aucs) <- best$SYMBOL
library(pheatmap)
pheatmap(aucs, color=viridis::plasma(100))
10.9 Cell type annotation
library(SingleR)
mm.ref <- MouseRNAseqData()
# Renaming to symbols to match with reference row names.
renamed <- sce.nest
rownames(renamed) <- uniquifyFeatureNames(rownames(renamed),
rowData(sce.nest)$SYMBOL)
labels <- SingleR(renamed, mm.ref, labels=mm.ref$label.fine)
Most clusters are not assigned to any single lineage (Figure 10.6), which is perhaps unsurprising given that HSCs are quite different from their terminal fates. Cluster 10 is considered to contain erythrocytes, which is roughly consistent with our conclusions from the marker gene analysis above.
tab <- table(labels$labels, colLabels(sce.nest))
pheatmap(log10(tab+10), color=viridis::viridis(100))
10.10 Miscellaneous analyses
This dataset also contains information about the protein abundances in each cell from FACS. There is barely any heterogeneity in the chosen markers across the clusters (Figure 10.7); this is perhaps unsurprising given that all cells should be HSCs of some sort.
Y <- assay(altExp(sce.nest, "FACS"))
keep <- colSums(is.na(Y))==0 # Removing NA intensities.
se.averaged <- sumCountsAcrossCells(Y[,keep],
colLabels(sce.nest)[keep], average=TRUE)
averaged <- assay(se.averaged)
log.intensities <- log2(averaged+1)
centered <- log.intensities - rowMeans(log.intensities)
pheatmap(centered, breaks=seq(-1, 1, length.out=101))
Session Info
R Under development (unstable) (2024-10-21 r87258)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 24.04.1 LTS
Matrix products: default
BLAS: /home/biocbuild/bbs-3.21-bioc/R/lib/libRblas.so
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_GB LC_COLLATE=C
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
time zone: America/New_York
tzcode source: system (glibc)
attached base packages:
[1] stats4 stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] SingleR_2.9.0 pheatmap_1.0.12
[3] scran_1.35.0 scater_1.35.0
[5] ggplot2_3.5.1 scuttle_1.17.0
[7] AnnotationHub_3.15.0 BiocFileCache_2.15.0
[9] dbplyr_2.5.0 ensembldb_2.31.0
[11] AnnotationFilter_1.31.0 GenomicFeatures_1.59.1
[13] AnnotationDbi_1.69.0 scRNAseq_2.21.0
[15] SingleCellExperiment_1.29.1 SummarizedExperiment_1.37.0
[17] Biobase_2.67.0 GenomicRanges_1.59.0
[19] GenomeInfoDb_1.43.0 IRanges_2.41.0
[21] S4Vectors_0.45.1 BiocGenerics_0.53.1
[23] generics_0.1.3 MatrixGenerics_1.19.0
[25] matrixStats_1.4.1 BiocStyle_2.35.0
[27] rebook_1.17.0
loaded via a namespace (and not attached):
[1] RColorBrewer_1.1-3 jsonlite_1.8.9
[3] CodeDepends_0.6.6 magrittr_2.0.3
[5] ggbeeswarm_0.7.2 gypsum_1.3.0
[7] farver_2.1.2 rmarkdown_2.29
[9] BiocIO_1.17.0 zlibbioc_1.53.0
[11] vctrs_0.6.5 DelayedMatrixStats_1.29.0
[13] memoise_2.0.1 Rsamtools_2.23.0
[15] RCurl_1.98-1.16 htmltools_0.5.8.1
[17] S4Arrays_1.7.1 curl_6.0.0
[19] BiocNeighbors_2.1.0 Rhdf5lib_1.29.0
[21] SparseArray_1.7.1 rhdf5_2.51.0
[23] sass_0.4.9 alabaster.base_1.7.1
[25] bslib_0.8.0 alabaster.sce_1.7.0
[27] httr2_1.0.6 cachem_1.1.0
[29] GenomicAlignments_1.43.0 igraph_2.1.1
[31] mime_0.12 lifecycle_1.0.4
[33] pkgconfig_2.0.3 rsvd_1.0.5
[35] Matrix_1.7-1 R6_2.5.1
[37] fastmap_1.2.0 GenomeInfoDbData_1.2.13
[39] digest_0.6.37 colorspace_2.1-1
[41] dqrng_0.4.1 irlba_2.3.5.1
[43] ExperimentHub_2.15.0 RSQLite_2.3.7
[45] beachmat_2.23.0 labeling_0.4.3
[47] filelock_1.0.3 fansi_1.0.6
[49] httr_1.4.7 abind_1.4-8
[51] compiler_4.5.0 bit64_4.5.2
[53] withr_3.0.2 BiocParallel_1.41.0
[55] viridis_0.6.5 DBI_1.2.3
[57] HDF5Array_1.35.1 alabaster.ranges_1.7.0
[59] alabaster.schemas_1.7.0 rappdirs_0.3.3
[61] DelayedArray_0.33.1 bluster_1.17.0
[63] rjson_0.2.23 tools_4.5.0
[65] vipor_0.4.7 beeswarm_0.4.0
[67] glue_1.8.0 restfulr_0.0.15
[69] rhdf5filters_1.19.0 grid_4.5.0
[71] Rtsne_0.17 cluster_2.1.6
[73] gtable_0.3.6 metapod_1.15.0
[75] BiocSingular_1.23.0 ScaledMatrix_1.15.0
[77] utf8_1.2.4 XVector_0.47.0
[79] ggrepel_0.9.6 BiocVersion_3.21.1
[81] pillar_1.9.0 limma_3.63.2
[83] dplyr_1.1.4 lattice_0.22-6
[85] rtracklayer_1.67.0 bit_4.5.0
[87] tidyselect_1.2.1 locfit_1.5-9.10
[89] Biostrings_2.75.1 knitr_1.49
[91] gridExtra_2.3 bookdown_0.41
[93] ProtGenerics_1.39.0 edgeR_4.5.0
[95] xfun_0.49 statmod_1.5.0
[97] UCSC.utils_1.3.0 lazyeval_0.2.2
[99] yaml_2.3.10 evaluate_1.0.1
[101] codetools_0.2-20 tibble_3.2.1
[103] alabaster.matrix_1.7.0 BiocManager_1.30.25
[105] graph_1.85.0 cli_3.6.3
[107] munsell_0.5.1 jquerylib_0.1.4
[109] Rcpp_1.0.13-1 dir.expiry_1.15.0
[111] png_0.1-8 XML_3.99-0.17
[113] parallel_4.5.0 blob_1.2.4
[115] sparseMatrixStats_1.19.0 bitops_1.0-9
[117] viridisLite_0.4.2 alabaster.se_1.7.0
[119] scales_1.3.0 purrr_1.0.2
[121] crayon_1.5.3 rlang_1.1.4
[123] celldex_1.17.0 cowplot_1.1.3
[125] KEGGREST_1.47.0