%\VignetteEngine{knitr::knitr} %\VignetteIndexEntry{1. Working with large arrays in R (slides from July 2017)} %\VignetteDepends{knitr,Matrix,DelayedArray,HDF5Array,SummarizedExperiment,airway,lobstr} % 2019-12-22: A temporary fix to avoid the following pdflatex error caused by % an issue in LaTeX package filehook-scrlfile (used by beamer): % ! Package filehook Error: Detected unknown definition of \InputIfFileExists. % Use the 'force' option of 'filehook' to overwrite it.. % The error appeared on tokay2 in Dec 2019 after reinstalling MiKTeX 2.9. % See comment by Phelype Oleinik here for the fix: % https://tex.stackexchange.com/questions/512189/problem-with-chemmacros-beamer-and-filehook-scrlfile-sty \PassOptionsToPackage{force}{filehook} \documentclass[8pt]{beamer} \mode { \usetheme{Madrid} \usecolortheme{whale} } \usepackage{slides} \renewcommand\Rclass[1]{{\texttt{#1}\index{#1 (class)}}} \AtBeginSection[] { \begin{frame} \tableofcontents[currentsection] \end{frame} } \title{Working with large arrays in R} \subtitle{A look at HDF5Array/RleArray/DelayedArray objects} \author{Herv\'e Pag\`es\\ \href{mailto:hpages.on.github@gmail.com}{hpages.on.github@gmail.com}} \institute{Bioconductor conference\\Boston} \date{July 2017} \begin{document} <>= library(knitr) opts_chunk$set(size="scriptsize") mydata_dir <- file.path(tempdir(), "mydata") if (!dir.exists(mydata_dir)) dir.create(mydata_dir) options(width=80) library(Matrix) library(DelayedArray) library(HDF5Array) library(SummarizedExperiment) library(airway) library(lobstr) @ \maketitle \frame{\tableofcontents} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Motivation and challenges} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] \frametitle{Motivation and challenges} R ordinary {\bf matrix} or {\bf array} is not suitable for big datasets: \begin{block}{} \begin{itemize} \item 10x Genomics dataset (single cell experiment): 30,000 genes x 1.3 million cells = 36.5 billion values \item in an ordinary integer matrix ==> 136G in memory! \end{itemize} \end{block} \bigskip Need for alternative containers: \begin{block}{} \begin{itemize} \item but at the same time, the object should be (almost) as easy to manipulate as an ordinary matrix or array \item {\em standard R matrix/array API}: \Rcode{dim}, \Rcode{dimnames}, \Rcode{t}, \Rcode{is.na}, \Rcode{==}, \Rcode{+}, \Rcode{log}, \Rcode{cbind}, \Rcode{max}, \Rcode{sum}, \Rcode{colSums}, etc... \item not limited to 2 dimensions ==> also support arrays of arbitrary number of dimensions \end{itemize} \end{block} \bigskip 2 approaches: {\bf in-memory data} vs {\bf on-disk data} \end{frame} \begin{frame}[fragile] \frametitle{Motivation and challenges} \centerline{\bf In-memory data} \begin{block}{} \begin{itemize} \item a 30k x 1.3M matrix might still fit in memory if the data can be efficiently compressed \item example: sparse data (small percentage of nonzero values) ==> {\em sparse representation} (storage of nonzero values only) \item example: data with long runs of identical values ==> {\em RLE compression (Run Length Encoding)} \item choose the {\em smallest type} to store the values: \Rcode{raw} (1 byte) < \Rcode{integer} (4 bytes) < \Rcode{double} (8 bytes) \item if using {\em RLE compression}: \begin{itemize} \item choose the {\em best orientation} to store the values: {\em by row} or {\em by column} (one might give better compression than the other) \item store the data by chunk ==> opportunity to pick up {\em best type} and {\em best orientation} on a chunk basis (instead of for the whole data) \end{itemize} \item size of 30k x 1.3M matrix in memory can be reduced from 136G to 16G! \end{itemize} \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Motivation and challenges} \centerline{\bf Examples of in-memory containers} \bigskip {\bf dgCMatrix} container from the \Biocpkg{Matrix} package: \begin{block}{} \begin{itemize} \item sparse matrix representation \item nonzero values stored as \Rcode{double} \end{itemize} \end{block} \bigskip {\bf RleArray} and {\bf RleMatrix} containers from the \Biocpkg{DelayedArray} package: \begin{block}{} \begin{itemize} \item use RLE compression \item arbitrary number of dimensions \item type of values: any R atomic type (\Rcode{integer}, \Rcode{double}, \Rcode{logical}, \Rcode{complex}, \Rcode{character}, and \Rcode{raw}) \end{itemize} \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Motivation and challenges} \centerline{\bf On-disk data} \bigskip However... \begin{itemize} \item if data is too big to fit in memory (even after compression) ==> must use {\em on-disk representation} \item challenge: should still be (almost) as easy to manipulate as an ordinary matrix! ({\em standard R matrix/array API}) \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{Motivation and challenges} \centerline{\bf Examples of on-disk containers} \bigskip Direct manipulation of an {\bf HDF5 dataset} via the \Biocpkg{rhdf5} API. Low level API! \bigskip {\bf HDF5Array} and {\bf HDF5Matrix} containers from the \Biocpkg{HDF5Array} package: \begin{block}{} Provide access to the HDF5 dataset via an API that mimics the standard R matrix/array API \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Memory footprint} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] \frametitle{Memory footprint} \centerline{\bf The "airway" dataset} \begin{columns}[t] \begin{column}{0.36\textwidth} \begin{exampleblock}{} <>= library(airway) data(airway) m <- unname(assay(airway)) dim(m) typeof(m) @ \end{exampleblock} \end{column} \begin{column}{0.52\textwidth} \begin{exampleblock}{} <>= head(m, n=4) tail(m, n=4) sum(m != 0) / length(m) @ \end{exampleblock} \end{column} \end{columns} \end{frame} \begin{frame}[fragile] \frametitle{Memory footprint} \centerline{{\bf dgCMatrix} vs {\bf RleMatrix} vs {\bf HDF5Matrix}} \begin{columns}[t] \begin{column}{0.60\textwidth} \begin{exampleblock}{} <>= library(lobstr) # for obj_size() obj_size(m) library(Matrix) obj_size(as(m, "dgCMatrix")) library(DelayedArray) obj_size(as(m, "RleMatrix")) obj_size(as(t(m), "RleMatrix")) library(HDF5Array) obj_size(as(m, "HDF5Matrix")) @ \end{exampleblock} \end{column} \end{columns} \end{frame} \begin{frame}[fragile] \frametitle{Memory footprint} Some limitations of the sparse matrix implementation in the \Biocpkg{Matrix} package: \begin{block}{} \begin{itemize} \item nonzero values always stored as \Rcode{double}, the most memory consuming type \item number of nonzero values must be $< 2^{31}$ \item limited to 2 dimensions: no support for arrays of arbitrary number of dimensions \end{itemize} \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{RleArray and HDF5Array objects} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] \frametitle{RleArray and HDF5Array objects} RleMatrix/RleArray and HDF5Matrix/HDF5Array provide: \begin{block}{} \begin{itemize} \item support all R atomic types \item no limits in size (but each dimension must be $< 2^{31}$) \item arbitrary number of dimensions \end{itemize} \end{block} \bigskip And also: \begin{block}{} \begin{itemize} \item {\bf delayed operations} \item {\bf block processing} (behind the scene) \item TODO: multicore block processing (sequential only at the moment) \end{itemize} \end{block} \end{frame} \begin{frame}[fragile] \frametitle{RleArray and HDF5Array objects} \centerline{\bf Delayed operations} \bigskip \centerline{We start with HDF5Matrix object \Rcode{M}:} \begin{columns}[t] \begin{column}{0.60\textwidth} \begin{exampleblock}{} <>= M <- as(m, "HDF5Matrix") M @ \end{exampleblock} \end{column} \end{columns} \end{frame} \begin{frame}[fragile] \frametitle{RleArray and HDF5Array objects} Subsetting is delayed: \begin{columns}[t] \begin{column}{0.40\textwidth} \begin{exampleblock}{} <>= M2 <- M[10:12, 1:5] M2 @ \end{exampleblock} \end{column} \begin{column}{0.48\textwidth} \begin{exampleblock}{} <>= seed(M2) @ \end{exampleblock} \end{column} \end{columns} \end{frame} \begin{frame}[fragile] \frametitle{RleArray and HDF5Array objects} Transposition is delayed: \begin{columns}[t] \begin{column}{0.40\textwidth} \begin{exampleblock}{} <<>>= M3 <- t(M2) M3 @ \end{exampleblock} \end{column} \begin{column}{0.48\textwidth} \begin{exampleblock}{} <<>>= seed(M3) @ \end{exampleblock} \end{column} \end{columns} \end{frame} \begin{frame}[fragile] \frametitle{RleArray and HDF5Array objects} \Rcode{cbind()} / \Rcode{rbind()} are delayed: \begin{columns}[t] \begin{column}{0.44\textwidth} \begin{exampleblock}{} <<>>= M4 <- cbind(M3, M[1:5, 6:8]) M4 @ \end{exampleblock} \end{column} \begin{column}{0.44\textwidth} \begin{exampleblock}{} <>= seed(M4) # Error! (more than one seed) @ \end{exampleblock} \end{column} \end{columns} \end{frame} \begin{frame}[fragile] \frametitle{RleArray and HDF5Array objects} All the operations in the following groups are delayed: \begin{itemize} \item \Rcode{Arith} (\Rcode{+}, \Rcode{-}, ...) \item \Rcode{Compare} (\Rcode{==}, \Rcode{<}, ...) \item \Rcode{Logic} (\Rcode{\&}, \Rcode{|}) \item \Rcode{Math} (\Rcode{log}, \Rcode{sqrt}) \item and more ... \end{itemize} \begin{columns}[t] \begin{column}{0.42\textwidth} \begin{exampleblock}{} <<>>= M5 <- M == 0 M5 @ \end{exampleblock} \end{column} \begin{column}{0.47\textwidth} \begin{exampleblock}{} <<>>= seed(M5) @ \end{exampleblock} \end{column} \end{columns} \end{frame} \begin{frame}[fragile] \frametitle{RleArray and HDF5Array objects} \begin{columns}[t] \begin{column}{0.44\textwidth} \begin{exampleblock}{} <<>>= M6 <- round(M[11:14, ] / M[1:4, ], digits=3) M6 @ \end{exampleblock} \end{column} \begin{column}{0.44\textwidth} \begin{exampleblock}{} <>= seed(M6) # Error! (more than one seed) @ \end{exampleblock} \end{column} \end{columns} \end{frame} \begin{frame}[fragile] \frametitle{RleArray and HDF5Array objects} \centerline{\bf Realization} \bigskip Delayed operations can be {\bf realized} by coercing the DelayedMatrix object to HDF5Array: \begin{columns}[t] \begin{column}{0.40\textwidth} \begin{exampleblock}{} <<>>= M6a <- as(M6, "HDF5Array") M6a @ \end{exampleblock} \end{column} \begin{column}{0.48\textwidth} \begin{exampleblock}{} <<>>= seed(M6a) @ \end{exampleblock} \end{column} \end{columns} \end{frame} \begin{frame}[fragile] \frametitle{RleArray and HDF5Array objects} \bigskip ... or by coercing it to RleArray: \begin{columns}[t] \begin{column}{0.44\textwidth} \begin{exampleblock}{} <<>>= M6b <- as(M6, "RleArray") M6b @ \end{exampleblock} \end{column} \begin{column}{0.44\textwidth} \begin{exampleblock}{} <<>>= seed(M6b) @ \end{exampleblock} \end{column} \end{columns} \end{frame} \begin{frame}[fragile] \frametitle{RleArray and HDF5Array objects} \centerline{\bf Controlling where HDF5 datasets are realized} \bigskip {\em HDF5 dump management utilities}: a set of utilities to control where HDF5 datasets are written to disk. \begin{columns}[t] \begin{column}{0.44\textwidth} \begin{exampleblock}{} <<>>= hdf5_dumpfile <- file.path(mydata_dir, "M6c.h5") setHDF5DumpFile(hdf5_dumpfile) setHDF5DumpName("M6c") M6c <- as(M6, "HDF5Array") @ \end{exampleblock} \end{column} \begin{column}{0.44\textwidth} \begin{exampleblock}{} <<>>= seed(M6c) h5ls(hdf5_dumpfile) @ \end{exampleblock} \end{column} \end{columns} \end{frame} \begin{frame}[fragile] \frametitle{RleArray and HDF5Array objects} \centerline{\Rcode{showHDF5DumpLog()}} \begin{exampleblock}{} <<>>= showHDF5DumpLog() @ \end{exampleblock} \end{frame} \begin{frame}[fragile] \frametitle{RleArray and HDF5Array objects} \centerline{\bf Block processing} \bigskip The following operations are NOT delayed. They are implemented via a {\em block processing} mechanism that loads and processes one block at a time: \begin{itemize} \item operations in the \Rcode{Summary} group (\Rcode{max}, \Rcode{min}, \Rcode{sum}, \Rcode{any}, \Rcode{all}) \item \Rcode{mean} \item Matrix row/col summarization operations (\Rcode{col/rowSums}, \Rcode{col/rowMeans}, ...) \item \Rcode{anyNA}, \Rcode{which} \item \Rcode{apply} \item and more ... \end{itemize} \end{frame} \begin{frame}[fragile] \frametitle{RleArray and HDF5Array objects} \begin{columns}[t] \begin{column}{0.75\textwidth} \begin{exampleblock}{} <<>>= DelayedArray:::set_verbose_block_processing(TRUE) colSums(M) @ \end{exampleblock} Control the block size: \begin{exampleblock}{} <<>>= getAutoBlockSize() setAutoBlockSize(1e6) colSums(M) @ \end{exampleblock} \end{column} \end{columns} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{Hands-on} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] \frametitle{Hands-on} \begin{block}{} 1. Load the "airway" dataset. \end{block} \begin{block}{} 2. It's wrapped in a SummarizedExperiment object. Get the count data as an ordinary matrix. \end{block} \begin{block}{} 3. Wrap it in an HDF5Matrix object: (1) using \Rcode{writeHDF5Array()}; then (2) using coercion. \end{block} \begin{block}{} 4. When using coercion, where has the data been written on disk? \end{block} \begin{block}{} 5. See \Rcode{?setHDF5DumpFile} for how to control the location of "automatic" HDF5 datasets. Try to control the destination of the data when coercing. \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Hands-on} \begin{block}{} 6. Use \Rcode{showHDF5DumpLog()} to see all the HDF5 datasets written to disk during the current session. \end{block} \bigskip \begin{block}{} 7. Try some operations on the HDF5Matrix object: (1) some delayed ones; (2) some non-delayed ones (block processing). \end{block} \bigskip \begin{block}{} 8. Use \Rcode{DelayedArray:::set\_verbose\_block\_processing(TRUE)} to see block processing in action. \end{block} \bigskip \begin{block}{} 9. Control the block size with \Rcode{setAutoBlockSize()}. \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Hands-on} \begin{block}{} 10. Stick the HDF5Matrix object back in the SummarizedExperiment object. The resulting object is an "HDF5-backed SummarizedExperiment object". \end{block} \bigskip \begin{block}{} 11. The HDF5-backed SummarizedExperiment object can be manipulated (almost) like an in-memory SummarizedExperiment object. Try \Rcode{[}, \Rcode{cbind}, \Rcode{rbind} on it. \end{block} \bigskip \begin{block}{} 12. The \Biocpkg{SummarizedExperiment} package provides \Rcode{saveHDF5SummarizedExperiment} to save a SummarizedExperiment object (HDF5-backed or not) as an HDF5-backed SummarizedExperiment object. Try it. \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \section{DelayedArray/HDF5Array: Future developments} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% \begin{frame}[fragile] \frametitle{Future developments} \centerline{\bf Block processing improvements} \begin{block}{} Block genometry: (1) better by default, (2) let the user have more control on it \end{block} \begin{block}{} Support multicore \end{block} \begin{block}{} Expose it: \Rcode{blockApply()} \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Future developments} \centerline{\bf HDF5Array improvements} \begin{block}{} Store the \Rcode{dimnames} in the HDF5 file (in {\em HDF5 Dimension Scale datasets} - \url{https://www.hdfgroup.org/HDF5/Tutor/h5dimscale.html}) \end{block} \begin{block}{} Use better automatic chunk geometry when realizing an HDF5Array object \end{block} \begin{block}{} Block processing should take advantage of the chunk geometry (e.g. \Rcode{realize()} should use blocks that are clusters of chunks) \end{block} \begin{block}{} Unfortunately: not possible to support multicore realization at the moment (HDF5 does not support concurrent writing to a dataset yet) \end{block} \end{frame} \begin{frame}[fragile] \frametitle{Future developments} \centerline{\bf RleArray improvements} \begin{block}{} Let the user have more control on the chunk geometry when constructing/realizing an RleArray object \end{block} \begin{block}{} Like for HDF5Array objects, block processing should take advantage of the chunk geometry \end{block} \begin{block}{} Support multicore realization \end{block} \begin{block}{} Provide C/C++ low-level API for direct row/column access from C/C++ code (e.g. from the \Biocpkg{beachmat} package) \end{block} \end{frame} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% <>= setHDF5DumpFile() unlink(mydata_dir, recursive=TRUE, force=TRUE) @ \end{document}