Chapter 13 Messmer human ESC (Smart-seq2)
13.1 Introduction
This performs an analysis of the human embryonic stem cell (hESC) dataset generated with Smart-seq2 (Messmer et al. 2019), which contains several plates of naive and primed hESCs. The chapter’s code is based on the steps in the paper’s GitHub repository, with some additional steps for cell cycle effect removal contributed by Philippe Boileau.
13.2 Data loading
Converting the batch to a factor, to make life easier later on.
13.3 Quality control
Let’s have a look at the QC statistics.
## low_lib_size low_n_features high_subsets_Mito_percent
## 107 99 22
## high_altexps_ERCC_percent discard
## 117 156
gridExtra::grid.arrange(
plotColData(original, x="experiment batch", y="sum",
colour_by=I(filtered$discard), other_field="phenotype") +
facet_wrap(~phenotype) + scale_y_log10(),
plotColData(original, x="experiment batch", y="detected",
colour_by=I(filtered$discard), other_field="phenotype") +
facet_wrap(~phenotype) + scale_y_log10(),
plotColData(original, x="experiment batch", y="subsets_Mito_percent",
colour_by=I(filtered$discard), other_field="phenotype") +
facet_wrap(~phenotype),
plotColData(original, x="experiment batch", y="altexps_ERCC_percent",
colour_by=I(filtered$discard), other_field="phenotype") +
facet_wrap(~phenotype),
ncol=1
)
13.4 Normalization
library(scran)
set.seed(10000)
clusters <- quickCluster(sce.mess)
sce.mess <- computeSumFactors(sce.mess, cluster=clusters)
sce.mess <- logNormCounts(sce.mess)
par(mfrow=c(1,2))
plot(sce.mess$sum, sizeFactors(sce.mess), log = "xy", pch=16,
xlab = "Library size (millions)", ylab = "Size factor",
col = ifelse(sce.mess$phenotype == "naive", "black", "grey"))
spike.sf <- librarySizeFactors(altExp(sce.mess, "ERCC"))
plot(sizeFactors(sce.mess), spike.sf, log = "xy", pch=16,
ylab = "Spike-in size factor", xlab = "Deconvolution size factor",
col = ifelse(sce.mess$phenotype == "naive", "black", "grey"))
13.5 Cell cycle phase assignment
Here, we use multiple cores to speed up the processing.
set.seed(10001)
hs_pairs <- readRDS(system.file("exdata", "human_cycle_markers.rds", package="scran"))
assigned <- cyclone(sce.mess, pairs=hs_pairs,
gene.names=rownames(sce.mess),
BPPARAM=BiocParallel::MulticoreParam(10))
sce.mess$phase <- assigned$phases
##
## G1 G2M S
## 460 406 322
13.6 Feature selection
dec <- modelGeneVarWithSpikes(sce.mess, "ERCC", block = sce.mess$`experiment batch`)
top.hvgs <- getTopHVGs(dec, prop = 0.1)
par(mfrow=c(1,3))
for (i in seq_along(dec$per.block)) {
current <- dec$per.block[[i]]
plot(current$mean, current$total, xlab="Mean log-expression",
ylab="Variance", pch=16, cex=0.5, main=paste("Batch", i))
fit <- metadata(current)
points(fit$mean, fit$var, col="red", pch=16)
curve(fit$trend(x), col='dodgerblue', add=TRUE, lwd=2)
}
13.7 Batch correction
We eliminate the obvious batch effect between batches with linear regression, which is possible due to the replicated nature of the experimental design.
We set keep=1:2
to retain the effect of the first two coefficients in design
corresponding to our phenotype of interest.
13.8 Dimensionality Reduction
We could have set d=
and subset.row=
in correctExperiments()
to automatically perform a PCA on the the residual matrix with the subset of HVGs,
but we’ll just explicitly call runPCA()
here to keep things simple.
set.seed(1101001)
sce.mess <- runPCA(sce.mess, subset_row = top.hvgs, exprs_values = "corrected")
sce.mess <- runTSNE(sce.mess, dimred = "PCA", perplexity = 40)
From a naive PCA, the cell cycle appears to be a major source of biological variation within each phenotype.
gridExtra::grid.arrange(
plotTSNE(sce.mess, colour_by = "phenotype") + ggtitle("By phenotype"),
plotTSNE(sce.mess, colour_by = "experiment batch") + ggtitle("By batch "),
plotTSNE(sce.mess, colour_by = "CDK1", swap_rownames="SYMBOL") + ggtitle("By CDK1"),
plotTSNE(sce.mess, colour_by = "phase") + ggtitle("By phase"),
ncol = 2
)
We perform contrastive PCA (cPCA) and sparse cPCA (scPCA) on the corrected log-expression data to obtain the same number of PCs. Given that the naive hESCs are actually reprogrammed primed hESCs, we will use the single batch of primed-only hESCs as the “background” dataset to remove the cell cycle effect.
library(scPCA)
is.bg <- sce.mess$`experiment batch`=="3"
target <- sce.mess[,!is.bg]
background <- sce.mess[,is.bg]
mat.target <- t(assay(target, "corrected")[top.hvgs,])
mat.background <- t(assay(background, "corrected")[top.hvgs,])
set.seed(1010101001)
con_out <- scPCA(
target = mat.target,
background = mat.background,
penalties = 0, # no penalties = non-sparse cPCA.
n_eigen = 50,
contrasts = 100
)
reducedDim(target, "cPCA") <- con_out$x
set.seed(101010101)
sparse_con_out <- scPCA(
target = mat.target,
background = mat.background,
penalties = 1e-4,
n_eigen = 50,
contrasts = 100,
alg = "rand_var_proj" # for speed.
)
reducedDim(target, "scPCA") <- sparse_con_out$x
We see greater intermingling between phases within both the naive and primed cells after cPCA and scPCA.
set.seed(1101001)
target <- runTSNE(target, dimred = "cPCA", perplexity = 40, name="cPCA+TSNE")
target <- runTSNE(target, dimred = "scPCA", perplexity = 40, name="scPCA+TSNE")
gridExtra::grid.arrange(
plotReducedDim(target, "cPCA+TSNE", colour_by = "phase") + ggtitle("After cPCA"),
plotReducedDim(target, "scPCA+TSNE", colour_by = "phase") + ggtitle("After scPCA"),
ncol=2
)
We can quantify the change in the separation between phases within each phenotype using the silhouette coefficient.
library(bluster)
naive <- target[,target$phenotype=="naive"]
primed <- target[,target$phenotype=="primed"]
N <- approxSilhouette(reducedDim(naive, "PCA"), naive$phase)
P <- approxSilhouette(reducedDim(primed, "PCA"), primed$phase)
c(naive=mean(N$width), primed=mean(P$width))
## naive primed
## 0.02032 0.03025
cN <- approxSilhouette(reducedDim(naive, "cPCA"), naive$phase)
cP <- approxSilhouette(reducedDim(primed, "cPCA"), primed$phase)
c(naive=mean(cN$width), primed=mean(cP$width))
## naive primed
## 0.007696 0.011941
scN <- approxSilhouette(reducedDim(naive, "scPCA"), naive$phase)
scP <- approxSilhouette(reducedDim(primed, "scPCA"), primed$phase)
c(naive=mean(scN$width), primed=mean(scP$width))
## naive primed
## 0.006614 0.014601
Session Info
R Under development (unstable) (2024-10-21 r87258)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 24.04.1 LTS
Matrix products: default
BLAS: /home/biocbuild/bbs-3.21-bioc/R/lib/libRblas.so
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0
locale:
[1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
[3] LC_TIME=en_GB LC_COLLATE=C
[5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
[7] LC_PAPER=en_US.UTF-8 LC_NAME=C
[9] LC_ADDRESS=C LC_TELEPHONE=C
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
time zone: America/New_York
tzcode source: system (glibc)
attached base packages:
[1] stats4 stats graphics grDevices utils datasets methods
[8] base
other attached packages:
[1] bluster_1.17.0 scPCA_1.21.0
[3] batchelor_1.23.0 scran_1.35.0
[5] scater_1.35.0 ggplot2_3.5.1
[7] scuttle_1.17.0 AnnotationHub_3.15.0
[9] BiocFileCache_2.15.0 dbplyr_2.5.0
[11] ensembldb_2.31.0 AnnotationFilter_1.31.0
[13] GenomicFeatures_1.59.1 AnnotationDbi_1.69.0
[15] scRNAseq_2.21.0 SingleCellExperiment_1.29.1
[17] SummarizedExperiment_1.37.0 Biobase_2.67.0
[19] GenomicRanges_1.59.0 GenomeInfoDb_1.43.0
[21] IRanges_2.41.0 S4Vectors_0.45.1
[23] BiocGenerics_0.53.1 generics_0.1.3
[25] MatrixGenerics_1.19.0 matrixStats_1.4.1
[27] BiocStyle_2.35.0 rebook_1.17.0
loaded via a namespace (and not attached):
[1] BiocIO_1.17.0 bitops_1.0-9
[3] filelock_1.0.3 tibble_3.2.1
[5] CodeDepends_0.6.6 graph_1.85.0
[7] XML_3.99-0.17 lifecycle_1.0.4
[9] httr2_1.0.6 Rdpack_2.6.1
[11] edgeR_4.5.0 globals_0.16.3
[13] lattice_0.22-6 alabaster.base_1.7.1
[15] magrittr_2.0.3 limma_3.63.2
[17] sass_0.4.9 rmarkdown_2.29
[19] jquerylib_0.1.4 yaml_2.3.10
[21] metapod_1.15.0 cowplot_1.1.3
[23] DBI_1.2.3 ResidualMatrix_1.17.0
[25] abind_1.4-8 zlibbioc_1.53.0
[27] Rtsne_0.17 purrr_1.0.2
[29] RCurl_1.98-1.16 rappdirs_0.3.3
[31] GenomeInfoDbData_1.2.13 ggrepel_0.9.6
[33] irlba_2.3.5.1 listenv_0.9.1
[35] alabaster.sce_1.7.0 RSpectra_0.16-2
[37] parallelly_1.39.0 dqrng_0.4.1
[39] DelayedMatrixStats_1.29.0 codetools_0.2-20
[41] DelayedArray_0.33.1 tidyselect_1.2.1
[43] UCSC.utils_1.3.0 farver_2.1.2
[45] ScaledMatrix_1.15.0 viridis_0.6.5
[47] GenomicAlignments_1.43.0 jsonlite_1.8.9
[49] BiocNeighbors_2.1.0 tools_4.5.0
[51] Rcpp_1.0.13-1 glue_1.8.0
[53] gridExtra_2.3 SparseArray_1.7.1
[55] xfun_0.49 dplyr_1.1.4
[57] HDF5Array_1.35.1 gypsum_1.3.0
[59] withr_3.0.2 BiocManager_1.30.25
[61] fastmap_1.2.0 sparsepca_0.1.2
[63] rhdf5filters_1.19.0 fansi_1.0.6
[65] digest_0.6.37 rsvd_1.0.5
[67] R6_2.5.1 mime_0.12
[69] colorspace_2.1-1 RSQLite_2.3.7
[71] utf8_1.2.4 data.table_1.16.2
[73] rtracklayer_1.67.0 httr_1.4.7
[75] S4Arrays_1.7.1 pkgconfig_2.0.3
[77] gtable_0.3.6 blob_1.2.4
[79] XVector_0.47.0 htmltools_0.5.8.1
[81] bookdown_0.41 ProtGenerics_1.39.0
[83] scales_1.3.0 alabaster.matrix_1.7.0
[85] png_0.1-8 knitr_1.49
[87] rjson_0.2.23 curl_6.0.0
[89] cachem_1.1.0 rhdf5_2.51.0
[91] stringr_1.5.1 BiocVersion_3.21.1
[93] KernSmooth_2.23-24 parallel_4.5.0
[95] vipor_0.4.7 restfulr_0.0.15
[97] pillar_1.9.0 grid_4.5.0
[99] alabaster.schemas_1.7.0 vctrs_0.6.5
[101] origami_1.0.7 BiocSingular_1.23.0
[103] beachmat_2.23.0 cluster_2.1.6
[105] beeswarm_0.4.0 evaluate_1.0.1
[107] cli_3.6.3 locfit_1.5-9.10
[109] compiler_4.5.0 Rsamtools_2.23.0
[111] rlang_1.1.4 crayon_1.5.3
[113] future.apply_1.11.3 labeling_0.4.3
[115] ggbeeswarm_0.7.2 stringi_1.8.4
[117] alabaster.se_1.7.0 viridisLite_0.4.2
[119] BiocParallel_1.41.0 assertthat_0.2.1
[121] munsell_0.5.1 Biostrings_2.75.1
[123] coop_0.6-3 lazyeval_0.2.2
[125] Matrix_1.7-1 dir.expiry_1.15.0
[127] ExperimentHub_2.15.0 future_1.34.0
[129] sparseMatrixStats_1.19.0 bit64_4.5.2
[131] Rhdf5lib_1.29.0 KEGGREST_1.47.0
[133] statmod_1.5.0 alabaster.ranges_1.7.0
[135] kernlab_0.9-33 rbibutils_2.3
[137] igraph_2.1.1 memoise_2.0.1
[139] bslib_0.8.0 bit_4.5.0