## ----install, eval=FALSE------------------------------------------------------ # if (!require("BiocManager", quietly=TRUE)) # install.packages("BiocManager") # # pkgs <- c("HDF5Array", "ExperimentHub", "DelayedMatrixStats", "RSpectra") # BiocManager::install(pkgs) ## ----load, message=FALSE------------------------------------------------------ library(HDF5Array) library(ExperimentHub) library(DelayedMatrixStats) library(RSpectra) ## ----source_make_timings_table_R, echo=FALSE, results='hide'------------------ ## Needed for the make_timings_table() function. path <- system.file(package="HDF5Array", "scripts", "make_timings_table.R", mustWork=TRUE) source(path, verbose=FALSE) ## ----ExperimentHub------------------------------------------------------------ hub <- ExperimentHub() hub["EH1039"]$description # sparse representation hub["EH1040"]$description # dense representation ## ----get_EH1039_and_EH1040, message=FALSE------------------------------------- ## Note that this will be quick if the HDF5 files are already in the ## local ExperimentHub cache. Otherwise, it will take a while! full_sparse_h5 <- hub[["EH1039"]] full_dense_h5 <- hub[["EH1040"]] ## ----TENxMatrix_constructor--------------------------------------------------- ## Use 'h5ls(full_sparse_h5)' to find out the group. full_sparse <- TENxMatrix(full_sparse_h5, group="mm10") ## ----full_sparse_class_and_dim------------------------------------------------ class(full_sparse) dim(full_sparse) ## ----HDF5Array_constructor---------------------------------------------------- ## Use 'h5ls(full_dense_h5)' to find out the name of the dataset. full_dense <- HDF5Array(full_dense_h5, name="counts") ## ----full_dense_class_and_dim------------------------------------------------- class(full_dense) dim(full_dense) ## ----set_full_dense_dimnames-------------------------------------------------- dimnames(full_dense) <- dimnames(full_sparse) ## ----create_test_datasets----------------------------------------------------- sparse1 <- full_sparse[ , 1:12500] dense1 <- full_dense[ , 1:12500] sparse2 <- full_sparse[ , 1:25000] dense2 <- full_dense[ , 1:25000] sparse3 <- full_sparse[ , 1:50000] dense3 <- full_dense[ , 1:50000] sparse4 <- full_sparse[ , 1:100000] dense4 <- full_dense[ , 1:100000] sparse5 <- full_sparse[ , 1:200000] dense5 <- full_dense[ , 1:200000] ## ----simple_normalize_function------------------------------------------------ ## Also does variable gene selection by keeping the 1000 most variable ## genes by default. simple_normalize <- function(mat, num_variable_genes=1000) { stopifnot(length(dim(mat)) == 2, !is.null(rownames(mat))) mat <- mat[rowSums(mat) > 0, ] mat <- t(t(mat) * 10000 / colSums(mat)) row_vars <- rowVars(mat) rv_order <- order(row_vars, decreasing=TRUE) variable_idx <- head(rv_order, n=num_variable_genes) mat <- log1p(mat[variable_idx, ]) mat / rowSds(mat) } ## ----simple_PCA_function------------------------------------------------------ simple_PCA <- function(mat, k=25) { stopifnot(length(dim(mat)) == 2) row_means <- rowMeans(mat) Ax <- function(x, args) (as.numeric(mat %*% x) - row_means * sum(x)) Atx <- function(x, args) (as.numeric(x %*% mat) - as.vector(row_means %*% x)) RSpectra::svds(Ax, Atrans=Atx, k=k, dim=dim(mat)) } ## ----set_block_size_for_normalize_sparse-------------------------------------- DelayedArray::setAutoBlockSize(2.5e8) ## ----normalize_sparse1-------------------------------------------------------- dim(sparse1) system.time(sparse1n <- simple_normalize(sparse1)) gc() dim(sparse1n) ## ----show_sparse1n_delayed_ops------------------------------------------------ class(sparse1n) showtree(sparse1n) ## ----writeTENxMatrix_sparse1n, results="hide"--------------------------------- sparse1n_path <- tempfile() sparse1n <- writeTENxMatrix(sparse1n, sparse1n_path, group="matrix", level=0) ## ----show_pristine_sparse1n--------------------------------------------------- class(sparse1n) showtree(sparse1n) # "pristine" object (i.e. no more delayed operations) ## ----normalize_sparse2-------------------------------------------------------- dim(sparse2) system.time(sparse2n <- simple_normalize(sparse2)) gc() dim(sparse2n) ## ----writeTENxMatrix_sparse2n, results="hide"--------------------------------- sparse2n_path <- tempfile() writeTENxMatrix(sparse2n, sparse2n_path, group="matrix", level=0) ## ----set_block_size_for_normalize_dense--------------------------------------- DelayedArray::setAutoBlockSize(4e7) ## ----normalize_dense1--------------------------------------------------------- dim(dense1) system.time(dense1n <- simple_normalize(dense1)) gc() dim(dense1n) ## ----show_dense1n_delayed_ops------------------------------------------------- class(dense1n) showtree(dense1n) ## ----writeHDF5Array_dense1n, results="hide"----------------------------------- dense1n_path <- tempfile() dense1n <- writeHDF5Array(dense1n, dense1n_path, name="normalized_counts", level=0) ## ----show_pristine_dense1n---------------------------------------------------- class(dense1n) showtree(dense1n) # "pristine" object (i.e. no more delayed operations) ## ----normalize_dense2--------------------------------------------------------- dim(dense2) system.time(dense2n <- simple_normalize(dense2)) gc() dim(dense2n) ## ----writeHDF5Array_dense2n, results="hide"----------------------------------- dense2n_path <- tempfile() writeHDF5Array(dense2n, dense2n_path, name="normalized_counts", level=0) ## ----set_block_size_for_PCA_sparse-------------------------------------------- DelayedArray::setAutoBlockSize(4e7) ## ----PCA_sparse1n------------------------------------------------------------- sparse1n <- TENxMatrix(sparse1n_path) dim(sparse1n) system.time(pca1s <- simple_PCA(sparse1n)) gc() ## ----PCA_sparse2n------------------------------------------------------------- sparse2n <- TENxMatrix(sparse2n_path) dim(sparse2n) system.time(pca2s <- simple_PCA(sparse2n)) gc() ## ----set_block_size_for_PCA_dense--------------------------------------------- DelayedArray::setAutoBlockSize() ## ----PCA_dense1n-------------------------------------------------------------- dense1n <- HDF5Array(dense1n_path, name="normalized_counts") dim(dense1n) system.time(pca1d <- simple_PCA(dense1n)) gc() ## ----PCA_dense2n-------------------------------------------------------------- dense2n <- HDF5Array(dense2n_path, name="normalized_counts") dim(dense2n) system.time(pca2d <- simple_PCA(dense2n)) gc() ## ----sanity_checks------------------------------------------------------------ stopifnot(all.equal(pca1s, pca1d)) stopifnot(all.equal(pca2s, pca2d)) ## ----xps15_timings, echo=FALSE, results='asis'-------------------------------- make_timings_table("xps15") ## ----rex3_timings, echo=FALSE, results='asis'--------------------------------- make_timings_table("rex3") ## ----kjohnson3_timings, echo=FALSE, results='asis'---------------------------- make_timings_table("kjohnson3") ## ----sessionInfo-------------------------------------------------------------- sessionInfo()