Chapter 7 Human PBMCs (10X Genomics)
7.1 Introduction
This performs an analysis of three public PBMC datasets (pbmc3k, pbmc4k and pbmc8k) generated by 10X Genomics (Zheng et al. 2017), starting from the filtered count matrices.
7.3 Quality control
Cell calling implicitly serves as a QC step to remove libraries with low total counts and few detected genes. As we start from the filtered count matrices, to which cell calling has already been applied, we will only filter on the mitochondrial proportion.
library(scater)
# Per-dataset QC: flag and remove cells with an unusually high percentage of
# mitochondrial counts.
# 'stats' keeps the per-cell QC metrics and 'high.mito' the logical discard
# vector for every dataset, so both remain available after filtering.
stats <- high.mito <- list()
for (n in names(all.sce)) {
    current <- all.sce[[n]]
    # Anchor the pattern on the "MT-" prefix: an unanchored "MT" also matches
    # nuclear genes whose symbol merely contains those letters (e.g. MTOR,
    # DNMT1) and would inflate the mitochondrial percentage.
    # NOTE(review): assumes Symbol_TENx holds HGNC-style symbols where
    # mitochondrial genes are named "MT-..." -- confirm against rowData.
    is.mito <- grep("^MT-", rowData(current)$Symbol_TENx)
    stats[[n]] <- perCellQCMetrics(current, subsets=list(Mito=is.mito))
    # Only unusually large values are flagged (type="higher").
    high.mito[[n]] <- isOutlier(stats[[n]]$subsets_Mito_percent, type="higher")
    # Drop the flagged cells (columns) from the dataset.
    all.sce[[n]] <- current[,!high.mito[[n]]]
}
# Build one diagnostic scatter plot per dataset from the unfiltered objects:
# total count (log-scaled x-axis) against mitochondrial percentage, with
# points coloured by whether the cell was discarded.
qcplots <- lapply(setNames(nm=names(all.sce)), function(dataset) {
    sce <- unfiltered[[dataset]]
    # Attach the QC metrics and the discard flag so they can be plotted.
    colData(sce) <- cbind(colData(sce), stats[[dataset]])
    sce$discard <- high.mito[[dataset]]
    plotColData(sce, x="sum", y="subsets_Mito_percent",
        colour_by="discard") + scale_x_log10()
})
# Arrange all panels in a single row of three columns.
do.call(gridExtra::grid.arrange, c(qcplots, ncol=3))
![Percentage of mitochondrial reads in each cell in each of the 10X PBMC datasets, compared to the total count. Each point represents a cell and is colored according to whether that cell was discarded.](tenx-filtered-pbmc3k-4k-8k_files/figure-html/unref-pbmc-filtered-var-1.png)
Figure 7.1: Percentage of mitochondrial reads in each cell in each of the 10X PBMC datasets, compared to the total count. Each point represents a cell and is colored according to whether that cell was discarded.
## $pbmc3k
## Mode FALSE TRUE
## logical 2609 91
##
## $pbmc4k
## Mode FALSE TRUE
## logical 4182 158
##
## $pbmc8k
## Mode FALSE TRUE
## logical 8157 224
7.4 Normalization
We perform library size normalization, simply for convenience when dealing with file-backed matrices.
## $pbmc3k
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.234 0.748 0.926 1.000 1.157 6.604
##
## $pbmc4k
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.315 0.711 0.890 1.000 1.127 11.027
##
## $pbmc8k
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.296 0.704 0.877 1.000 1.118 6.794
7.5 Variance modelling
library(scran)
# Model the per-gene variance separately in every dataset, then select each
# dataset's highly variable genes with prop=0.1.
all.dec <- lapply(all.sce, function(sce) modelGeneVar(sce))
all.hvgs <- lapply(all.dec, function(dec) getTopHVGs(dec, prop=0.1))
# One panel per dataset, laid out side by side in a 1 x 3 grid.
par(mfrow=c(1,3))
for (dataset in names(all.dec)) {
    dec <- all.dec[[dataset]]
    # The fitted trend function is stored in the metadata of the results.
    trend.fit <- metadata(dec)
    plot(dec$mean, dec$total, pch=16, cex=0.5, main=dataset,
        xlab="Mean of log-expression", ylab="Variance of log-expression")
    # Overlay the fitted mean-variance trend on the per-gene points.
    curve(trend.fit$trend(x), col='dodgerblue', add=TRUE, lwd=2)
}
![Per-gene variance as a function of the mean for the log-expression values in each PBMC dataset. Each point represents a gene (black) with the mean-variance trend (blue) fitted to the variances.](tenx-filtered-pbmc3k-4k-8k_files/figure-html/unref-filtered-pbmc-variance-1.png)
Figure 7.2: Per-gene variance as a function of the mean for the log-expression values in each PBMC dataset. Each point represents a gene (black) with the mean-variance trend (blue) fitted to the variances.
7.6 Dimensionality reduction
For various reasons, we will first analyze each PBMC dataset separately rather than merging them together. We use randomized SVD, which is more efficient for file-backed matrices.
library(BiocSingular)
# PCA on each dataset, restricted to that dataset's own HVGs; 25 components
# are computed with the RandomParam() SVD backend.
set.seed(10000)
pca.args <- list(ncomponents=25, BSPARAM=RandomParam())
all.sce <- mapply(FUN=runPCA, x=all.sce, subset_row=all.hvgs,
    MoreArgs=pca.args, SIMPLIFY=FALSE)

# t-SNE computed from the PCA coordinates; seeded for reproducibility.
set.seed(100000)
all.sce <- lapply(all.sce, function(sce) runTSNE(sce, dimred="PCA"))

# UMAP computed from the same PCA coordinates, with its own seed.
set.seed(1000000)
all.sce <- lapply(all.sce, function(sce) runUMAP(sce, dimred="PCA"))
7.7 Clustering
# Cluster every dataset independently: build a shared nearest-neighbour graph
# (k=10) on the PCA coordinates, run walktrap community detection on it and
# store the memberships as the column labels.
all.sce <- lapply(all.sce, function(sce) {
    snn.graph <- buildSNNGraph(sce, k=10, use.dimred='PCA')
    membership <- igraph::cluster_walktrap(snn.graph)$membership
    colLabels(sce) <- factor(membership)
    sce
})
## $pbmc3k
##
## 1 2 3 4 5 6 7 8 9 10
## 475 636 153 476 164 31 159 164 340 11
##
## $pbmc4k
##
## 1 2 3 4 5 6 7 8 9 10 11 12
## 127 594 518 775 211 394 187 993 55 201 91 36
##
## $pbmc8k
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 292 1603 388 94 738 1035 1049 156 203 153 2098 261 64 14 9
# One t-SNE plot per dataset, coloured by cluster label and titled with the
# dataset name.
all.tsne <- lapply(setNames(nm=names(all.sce)), function(dataset) {
    plotTSNE(all.sce[[dataset]], colour_by="label") + ggtitle(dataset)
})
# Lay the panels out on a two-column grid.
do.call(gridExtra::grid.arrange, c(all.tsne, list(ncol=2)))
![Obligatory $t$-SNE plots of each PBMC dataset, where each point represents a cell in the corresponding dataset and is colored according to the assigned cluster.](tenx-filtered-pbmc3k-4k-8k_files/figure-html/unref-filtered-pbmc-tsne-1.png)
Figure 7.3: Obligatory \(t\)-SNE plots of each PBMC dataset, where each point represents a cell in the corresponding dataset and is colored according to the assigned cluster.
7.8 Data integration
With the per-dataset analyses out of the way, we will now repeat the analysis after merging together the three batches.
# Restrict every dataset, and its variance-modelling results, to the genes
# shared by all batches.
universe <- Reduce(intersect, lapply(all.sce, rownames))
keep.shared <- function(x) x[universe,]
all.sce2 <- lapply(all.sce, keep.shared)
all.dec2 <- lapply(all.dec, keep.shared)

# Renormalize across batches to adjust for differences in depth.
library(batchelor)
normed.sce <- do.call(multiBatchNorm, all.sce2)

# Combine the per-batch variance statistics and take the top 5000 genes as
# the HVG set for the merge.
combined.dec <- do.call(combineVar, all.dec2)
combined.hvg <- getTopHVGs(combined.dec, n=5000)

# Merge the batches with fastMNN on the selected HVGs; seeded because the
# RandomParam() SVD backend is used.
set.seed(1000101)
mnn.args <- c(normed.sce,
    list(subset.row=combined.hvg, BSPARAM=RandomParam()))
merged.pbmc <- do.call(fastMNN, mnn.args)
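We use the proportion of variance lost from each batch (columns) at each merge step (rows) as a diagnostic measure; large values would suggest that the correction is removing genuine biological heterogeneity.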
## pbmc3k pbmc4k pbmc8k
## [1,] 7.044e-03 3.129e-03 0.000000
## [2,] 6.876e-05 4.912e-05 0.003008
We proceed to clustering:
# Graph-based clustering of the merged data, using the batch-corrected
# low-dimensional coordinates stored under "corrected".
g <- buildSNNGraph(merged.pbmc, use.dimred="corrected")
# Fix the RNG seed before community detection so the cluster labels are
# reproducible across runs, in line with the other stochastic steps here.
# NOTE(review): recent igraph releases are understood to process vertices in
# random order in cluster_louvain() -- confirm for the installed version.
set.seed(101010)
colLabels(merged.pbmc) <- factor(igraph::cluster_louvain(g)$membership)
# Cross-tabulate cluster labels against batch of origin to check that each
# cluster receives cells from all batches.
table(colLabels(merged.pbmc), merged.pbmc$batch)
##
## pbmc3k pbmc4k pbmc8k
## 1 535 426 830
## 2 331 588 1126
## 3 182 122 217
## 4 150 179 292
## 5 170 345 573
## 6 292 538 1020
## 7 342 630 1236
## 8 437 749 1538
## 9 9 18 95
## 10 97 365 782
## 11 34 120 201
## 12 11 54 159
## 13 11 3 9
## 14 4 36 64
## 15 4 9 15
And visualization:
# t-SNE on the batch-corrected coordinates; seeded for reproducibility.
set.seed(10101010)
merged.pbmc <- runTSNE(merged.pbmc, dimred="corrected")

# Top panel: cells coloured by cluster, with red cluster labels overlaid.
by.cluster <- plotTSNE(merged.pbmc, colour_by="label", text_by="label",
    text_colour="red")
# Bottom panel: the same embedding coloured by batch of origin.
by.batch <- plotTSNE(merged.pbmc, colour_by="batch")
gridExtra::grid.arrange(by.cluster, by.batch)
![Obligatory $t$-SNE plots for the merged PBMC datasets, where each point represents a cell and is colored by cluster (top) or batch (bottom).](tenx-filtered-pbmc3k-4k-8k_files/figure-html/unref-filtered-pbmc-merged-tsne-1.png)
Figure 7.4: Obligatory \(t\)-SNE plots for the merged PBMC datasets, where each point represents a cell and is colored by cluster (top) or batch (bottom).
Session Info
R Under development (unstable) (2024-10-21 r87258)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 24.04.1 LTS
Matrix products: default
BLAS: /home/biocbuild/bbs-3.21-bioc/R/lib/libRblas.so
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_GB LC_COLLATE=C
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
time zone: America/New_York
tzcode source: system (glibc)
attached base packages:
[1] stats4 stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] batchelor_1.23.0 BiocSingular_1.23.0
[3] scran_1.35.0 scater_1.35.0
[5] ggplot2_3.5.1 scuttle_1.17.0
[7] TENxPBMCData_1.25.0 HDF5Array_1.35.7
[9] rhdf5_2.51.2 DelayedArray_0.33.4
[11] SparseArray_1.7.4 S4Arrays_1.7.1
[13] abind_1.4-8 Matrix_1.7-1
[15] SingleCellExperiment_1.29.1 SummarizedExperiment_1.37.0
[17] Biobase_2.67.0 GenomicRanges_1.59.1
[19] GenomeInfoDb_1.43.2 IRanges_2.41.2
[21] S4Vectors_0.45.2 BiocGenerics_0.53.3
[23] generics_0.1.3 MatrixGenerics_1.19.1
[25] matrixStats_1.5.0 BiocStyle_2.35.0
[27] rebook_1.17.0
loaded via a namespace (and not attached):
[1] DBI_1.2.3 gridExtra_2.3
[3] CodeDepends_0.6.6 rlang_1.1.5
[5] magrittr_2.0.3 RcppAnnoy_0.0.22
[7] compiler_4.5.0 RSQLite_2.3.9
[9] DelayedMatrixStats_1.29.1 dir.expiry_1.15.0
[11] png_0.1-8 vctrs_0.6.5
[13] pkgconfig_2.0.3 crayon_1.5.3
[15] fastmap_1.2.0 dbplyr_2.5.0
[17] XVector_0.47.2 labeling_0.4.3
[19] rmarkdown_2.29 graph_1.85.1
[21] UCSC.utils_1.3.1 ggbeeswarm_0.7.2
[23] purrr_1.0.2 bit_4.5.0.1
[25] bluster_1.17.0 xfun_0.50
[27] cachem_1.1.0 beachmat_2.23.6
[29] jsonlite_1.8.9 blob_1.2.4
[31] rhdf5filters_1.19.0 Rhdf5lib_1.29.0
[33] BiocParallel_1.41.0 cluster_2.1.8
[35] irlba_2.3.5.1 parallel_4.5.0
[37] R6_2.5.1 bslib_0.8.0
[39] limma_3.63.3 jquerylib_0.1.4
[41] Rcpp_1.0.14 bookdown_0.42
[43] knitr_1.49 FNN_1.1.4.1
[45] igraph_2.1.3 tidyselect_1.2.1
[47] viridis_0.6.5 yaml_2.3.10
[49] codetools_0.2-20 curl_6.1.0
[51] lattice_0.22-6 tibble_3.2.1
[53] withr_3.0.2 KEGGREST_1.47.0
[55] Rtsne_0.17 evaluate_1.0.3
[57] BiocFileCache_2.15.1 ExperimentHub_2.15.0
[59] Biostrings_2.75.3 pillar_1.10.1
[61] BiocManager_1.30.25 filelock_1.0.3
[63] BiocVersion_3.21.1 sparseMatrixStats_1.19.0
[65] munsell_0.5.1 scales_1.3.0
[67] glue_1.8.0 metapod_1.15.0
[69] tools_4.5.0 AnnotationHub_3.15.0
[71] BiocNeighbors_2.1.2 ScaledMatrix_1.15.0
[73] locfit_1.5-9.10 XML_3.99-0.18
[75] cowplot_1.1.3 grid_4.5.0
[77] edgeR_4.5.1 AnnotationDbi_1.69.0
[79] colorspace_2.1-1 GenomeInfoDbData_1.2.13
[81] beeswarm_0.4.0 vipor_0.4.7
[83] cli_3.6.3 rsvd_1.0.5
[85] rappdirs_0.3.3 viridisLite_0.4.2
[87] dplyr_1.1.4 ResidualMatrix_1.17.0
[89] uwot_0.2.2 gtable_0.3.6
[91] sass_0.4.9 digest_0.6.37
[93] dqrng_0.4.1 ggrepel_0.9.6
[95] farver_2.1.2 memoise_2.0.1
[97] htmltools_0.5.8.1 lifecycle_1.0.4
[99] httr_1.4.7 statmod_1.5.0
[101] mime_0.12 bit64_4.6.0-1