Chapter 7 Human PBMCs (10X Genomics)

7.1 Introduction

This performs an analysis of three public PBMC datasets (pbmc3k, pbmc4k and pbmc8k) generated by 10X Genomics (Zheng et al. 2017), starting from the filtered count matrices.

7.2 Data loading

library(TENxPBMCData)

# Fetch each PBMC dataset by its identifier; the resulting list is named
# after the identifiers so that later steps can loop over names(all.sce).
all.sce <- lapply(
    setNames(nm=c("pbmc3k", "pbmc4k", "pbmc8k")),
    TENxPBMCData
)

7.3 Quality control

# Keep a copy of the objects before any cells are removed; the QC diagnostic
# plots further down are drawn from these unfiltered objects.
unfiltered <- all.sce

Cell calling implicitly serves as a QC step to remove libraries with low total counts and low numbers of detected genes. Thus, we will only filter on the mitochondrial proportion.

library(scater)

# For every dataset: compute per-cell QC metrics with a mitochondrial gene
# subset, flag cells whose mitochondrial percentage is an outlier on the
# high side, and drop the flagged cells from 'all.sce'.
# 'stats' and 'high.mito' keep the per-dataset metrics and discard flags so
# that they can be reused for the diagnostic plots and summaries.
stats <- high.mito <- list()
for (n in names(all.sce)) {
    current <- all.sce[[n]]

    # Human mitochondrial gene symbols carry the "MT-" prefix. The pattern is
    # anchored to the start of the symbol so that nuclear genes that merely
    # contain "MT" somewhere in their name (e.g., MTOR, DNMT1) are not
    # counted towards the mitochondrial subset.
    is.mito <- grep("^MT-", rowData(current)$Symbol_TENx)

    stats[[n]] <- perCellQCMetrics(current, subsets=list(Mito=is.mito))
    high.mito[[n]] <- isOutlier(stats[[n]]$subsets_Mito_percent, type="higher")
    all.sce[[n]] <- current[,!high.mito[[n]]]
}

# One diagnostic plot per dataset, drawn from the unfiltered object: total
# count (log-scale x-axis) against mitochondrial percentage, with each cell
# coloured by whether it was discarded.
qcplots <- lapply(setNames(nm=names(all.sce)), function(id) {
    sce <- unfiltered[[id]]
    colData(sce) <- cbind(colData(sce), stats[[id]])
    sce$discard <- high.mito[[id]]
    plotColData(sce, x="sum", y="subsets_Mito_percent",
        colour_by="discard") + scale_x_log10()
})

# Lay the per-dataset plots out side by side.
do.call(gridExtra::grid.arrange, c(qcplots, ncol=3))

Percentage of mitochondrial reads in each cell in each of the 10X PBMC datasets, compared to the total count. Each point represents a cell and is colored according to whether that cell was discarded.

Figure 7.1: Percentage of mitochondrial reads in each cell in each of the 10X PBMC datasets, compared to the total count. Each point represents a cell and is colored according to whether that cell was discarded.

# Count, per dataset, how many cells were kept (FALSE) or flagged for
# removal (TRUE) by the mitochondrial filter.
lapply(high.mito, summary)

## $pbmc3k
##    Mode   FALSE    TRUE 
## logical    2609      91 
## 
## $pbmc4k
##    Mode   FALSE    TRUE 
## logical    4182     158 
## 
## $pbmc8k
##    Mode   FALSE    TRUE 
## logical    8157     224

7.4 Normalization

We perform library size normalization, simply for convenience when dealing with file-backed matrices.

# Apply logNormCounts() to every dataset in turn.
for (i in seq_along(all.sce)) {
    all.sce[[i]] <- logNormCounts(all.sce[[i]])
}

# Summarize the distribution of size factors within each dataset.
lapply(lapply(all.sce, sizeFactors), summary)

## $pbmc3k
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.234   0.748   0.926   1.000   1.157   6.604 
## 
## $pbmc4k
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.315   0.711   0.890   1.000   1.127  11.027 
## 
## $pbmc8k
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.296   0.704   0.877   1.000   1.118   6.794

7.5 Variance modelling

library(scran)

# Model the per-gene variance separately in each dataset ...
all.dec <- lapply(all.sce, function(sce) modelGeneVar(sce))

# ... and select highly variable genes from each model with prop=0.1.
all.hvgs <- lapply(all.dec, function(dec) getTopHVGs(dec, prop=0.1))

# One panel per dataset: per-gene total variance against mean log-expression,
# with the fitted mean-variance trend overlaid.
par(mfrow=c(1,3))
for (id in names(all.dec)) {
    dec <- all.dec[[id]]
    plot(x=dec$mean, y=dec$total, main=id, pch=16, cex=0.5,
        xlab="Mean of log-expression", ylab="Variance of log-expression")

    # The fitted trend function is stored in the metadata of the
    # variance-modelling result.
    trend <- metadata(dec)$trend
    curve(trend(x), add=TRUE, col='dodgerblue', lwd=2)
}

Per-gene variance as a function of the mean for the log-expression values in each PBMC dataset. Each point represents a gene (black) with the mean-variance trend (blue) fitted to the variances.

Figure 7.2: Per-gene variance as a function of the mean for the log-expression values in each PBMC dataset. Each point represents a gene (black) with the mean-variance trend (blue) fitted to the variances.

7.6 Dimensionality reduction

For various reasons, we will first analyze each PBMC dataset separately rather than merging them together. We use randomized SVD, which is more efficient for file-backed matrices.

library(BiocSingular)

# PCA on each dataset, restricted to that dataset's own HVGs, keeping 25
# components and using the randomized SVD backend.
set.seed(10000)
all.sce <- Map(
    function(sce, hvgs) {
        runPCA(sce, subset_row=hvgs, ncomponents=25, BSPARAM=RandomParam())
    },
    all.sce, all.hvgs
)

# t-SNE computed from the PCA coordinates, one dataset after another.
set.seed(100000)
for (i in seq_along(all.sce)) {
    all.sce[[i]] <- runTSNE(all.sce[[i]], dimred="PCA")
}

# UMAP computed from the PCA coordinates, one dataset after another.
set.seed(1000000)
for (i in seq_along(all.sce)) {
    all.sce[[i]] <- runUMAP(all.sce[[i]], dimred="PCA")
}

7.7 Clustering

# Cluster the cells of each dataset: build a shared nearest-neighbour graph
# (k=10) on the PCA coordinates, run walktrap community detection on it and
# store the memberships as the column labels.
all.sce <- lapply(all.sce, function(sce) {
    snn <- buildSNNGraph(sce, k=10, use.dimred='PCA')
    colLabels(sce) <- factor(igraph::cluster_walktrap(snn)$membership)
    sce
})

# Number of cells per cluster in each dataset.
lapply(all.sce, function(sce) table(colLabels(sce)))

## $pbmc3k
## 
##   1   2   3   4   5   6   7   8   9  10 
## 475 636 153 476 164  31 159 164 340  11 
## 
## $pbmc4k
## 
##   1   2   3   4   5   6   7   8   9  10  11  12 
## 127 594 518 775 211 394 187 993  55 201  91  36 
## 
## $pbmc8k
## 
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
##  292 1603  388   94  738 1035 1049  156  203  153 2098  261   64   14    9

# One t-SNE plot per dataset, coloured by cluster label and titled with the
# dataset name.
all.tsne <- lapply(setNames(nm=names(all.sce)), function(id) {
    plotTSNE(all.sce[[id]], colour_by="label") + ggtitle(id)
})

# Arrange the plots on a two-column grid.
do.call(gridExtra::grid.arrange, c(all.tsne, list(ncol=2)))

Obligatory $t$-SNE plots of each PBMC dataset, where each point represents a cell in the corresponding dataset and is colored according to the assigned cluster.

Figure 7.3: Obligatory $t$-SNE plots of each PBMC dataset, where each point represents a cell in the corresponding dataset and is colored according to the assigned cluster.

7.8 Data integration

With the per-dataset analyses out of the way, we will now repeat the analysis after merging together the three batches.

# Restrict every batch (and its variance-modelling results) to the genes
# that are present in all batches.
universe <- Reduce(intersect, lapply(all.sce, rownames))
all.sce2 <- lapply(all.sce, function(sce) sce[universe,])
all.dec2 <- lapply(all.dec, function(dec) dec[universe,])

# Renormalize across batches to adjust for differences in depth.
library(batchelor)
normed.sce <- do.call(multiBatchNorm, all.sce2)

# Combine the per-batch variance statistics and pick the HVGs (n=5000) from
# the combined result.
combined.dec <- do.call(combineVar, all.dec2)
combined.hvg <- getTopHVGs(combined.dec, n=5000)

# Merge the batches with fastMNN, using the combined HVGs and the randomized
# SVD backend. The seed is set immediately beforehand for reproducibility.
set.seed(1000101)
merged.pbmc <- do.call(
    fastMNN,
    c(normed.sce, list(subset.row=combined.hvg, BSPARAM=RandomParam()))
)

We use the proportion of variance lost from each batch (columns) at each merge step (rows) as a diagnostic measure.

# Retrieve the lost-variance matrix that is stored in the merge information
# of the merged object's metadata.
metadata(merged.pbmc)$merge.info$lost.var

##         pbmc3k    pbmc4k   pbmc8k
## [1,] 7.044e-03 3.129e-03 0.000000
## [2,] 6.876e-05 4.912e-05 0.003008

We proceed to clustering:

# Build a shared nearest-neighbour graph on the batch-corrected coordinates
# and label the cells by Louvain community.
g <- buildSNNGraph(merged.pbmc, use.dimred="corrected")
louvain <- igraph::cluster_louvain(g)
colLabels(merged.pbmc) <- factor(louvain$membership)

# Cross-tabulate cluster membership against batch of origin.
table(colLabels(merged.pbmc), merged.pbmc$batch)

##     
##      pbmc3k pbmc4k pbmc8k
##   1     535    426    830
##   2     331    588   1126
##   3     182    122    217
##   4     150    179    292
##   5     170    345    573
##   6     292    538   1020
##   7     342    630   1236
##   8     437    749   1538
##   9       9     18     95
##   10     97    365    782
##   11     34    120    201
##   12     11     54    159
##   13     11      3      9
##   14      4     36     64
##   15      4      9     15

And visualization:

# t-SNE on the batch-corrected coordinates of the merged dataset.
set.seed(10101010)
merged.pbmc <- runTSNE(merged.pbmc, dimred="corrected")

# Top panel: cells coloured (and annotated) by cluster.
# Bottom panel: cells coloured by batch.
by.cluster <- plotTSNE(merged.pbmc, colour_by="label", text_by="label",
    text_colour="red")
by.batch <- plotTSNE(merged.pbmc, colour_by="batch")
gridExtra::grid.arrange(by.cluster, by.batch)

Figure 7.4: Obligatory $t$-SNE plots for the merged PBMC datasets, where each point represents a cell and is colored by cluster (top) or batch (bottom).

Session Info

R version 4.5.0 RC (2025-04-04 r88126)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 24.04.2 LTS

Matrix products: default
BLAS:   /home/biocbuild/bbs-3.21-bioc/R/lib/libRblas.so 
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0  LAPACK version 3.12.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_GB              LC_COLLATE=C              
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

time zone: America/New_York
tzcode source: system (glibc)

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] batchelor_1.24.0            BiocSingular_1.24.0        
 [3] scran_1.36.0                scater_1.36.0              
 [5] ggplot2_3.5.2               scuttle_1.18.0             
 [7] TENxPBMCData_1.25.0         HDF5Array_1.36.0           
 [9] h5mread_1.0.0               rhdf5_2.52.0               
[11] DelayedArray_0.34.0         SparseArray_1.8.0          
[13] S4Arrays_1.8.0              abind_1.4-8                
[15] Matrix_1.7-3                SingleCellExperiment_1.30.0
[17] SummarizedExperiment_1.38.0 Biobase_2.68.0             
[19] GenomicRanges_1.60.0        GenomeInfoDb_1.44.0        
[21] IRanges_2.42.0              S4Vectors_0.46.0           
[23] BiocGenerics_0.54.0         generics_0.1.3             
[25] MatrixGenerics_1.20.0       matrixStats_1.5.0          
[27] BiocStyle_2.36.0            rebook_1.18.0              

loaded via a namespace (and not attached):
  [1] DBI_1.2.3                 gridExtra_2.3            
  [3] CodeDepends_0.6.6         rlang_1.1.6              
  [5] magrittr_2.0.3            RcppAnnoy_0.0.22         
  [7] compiler_4.5.0            RSQLite_2.3.9            
  [9] DelayedMatrixStats_1.30.0 dir.expiry_1.16.0        
 [11] png_0.1-8                 vctrs_0.6.5              
 [13] pkgconfig_2.0.3           crayon_1.5.3             
 [15] fastmap_1.2.0             dbplyr_2.5.0             
 [17] XVector_0.48.0            labeling_0.4.3           
 [19] rmarkdown_2.29            graph_1.86.0             
 [21] UCSC.utils_1.4.0          ggbeeswarm_0.7.2         
 [23] purrr_1.0.4               bit_4.6.0                
 [25] bluster_1.18.0            xfun_0.52                
 [27] cachem_1.1.0              beachmat_2.24.0          
 [29] jsonlite_2.0.0            blob_1.2.4               
 [31] rhdf5filters_1.20.0       Rhdf5lib_1.30.0          
 [33] BiocParallel_1.42.0       cluster_2.1.8.1          
 [35] irlba_2.3.5.1             parallel_4.5.0           
 [37] R6_2.6.1                  bslib_0.9.0              
 [39] limma_3.64.0              jquerylib_0.1.4          
 [41] Rcpp_1.0.14               bookdown_0.43            
 [43] knitr_1.50                FNN_1.1.4.1              
 [45] igraph_2.1.4              tidyselect_1.2.1         
 [47] viridis_0.6.5             yaml_2.3.10              
 [49] codetools_0.2-20          curl_6.2.2               
 [51] lattice_0.22-7            tibble_3.2.1             
 [53] withr_3.0.2               KEGGREST_1.48.0          
 [55] Rtsne_0.17                evaluate_1.0.3           
 [57] BiocFileCache_2.16.0      ExperimentHub_2.16.0     
 [59] Biostrings_2.76.0         pillar_1.10.2            
 [61] BiocManager_1.30.25       filelock_1.0.3           
 [63] BiocVersion_3.21.1        sparseMatrixStats_1.20.0 
 [65] munsell_0.5.1             scales_1.3.0             
 [67] glue_1.8.0                metapod_1.16.0           
 [69] tools_4.5.0               AnnotationHub_3.16.0     
 [71] BiocNeighbors_2.2.0       ScaledMatrix_1.16.0      
 [73] locfit_1.5-9.12           XML_3.99-0.18            
 [75] cowplot_1.1.3             grid_4.5.0               
 [77] edgeR_4.6.0               AnnotationDbi_1.70.0     
 [79] colorspace_2.1-1          GenomeInfoDbData_1.2.14  
 [81] beeswarm_0.4.0            vipor_0.4.7              
 [83] cli_3.6.4                 rsvd_1.0.5               
 [85] rappdirs_0.3.3            viridisLite_0.4.2        
 [87] dplyr_1.1.4               ResidualMatrix_1.18.0    
 [89] uwot_0.2.3                gtable_0.3.6             
 [91] sass_0.4.10               digest_0.6.37            
 [93] dqrng_0.4.1               ggrepel_0.9.6            
 [95] farver_2.1.2              memoise_2.0.1            
 [97] htmltools_0.5.8.1         lifecycle_1.0.4          
 [99] httr_1.4.7                statmod_1.5.0            
[101] mime_0.13                 bit64_4.6.0-1

References

Zheng, G. X., J. M. Terry, P. Belgrader, P. Ryvkin, Z. W. Bent, R. Wilson, S. B. Ziraldo, et al. 2017. “Massively parallel digital transcriptional profiling of single cells.” Nat Commun 8 (January): 14049.