# Introduction

1. Correct before fast.
2. Correct after fast -- keep a reproducible script and test cases, so that optimizations can be verified.
3. Kernighan: "Everyone knows that debugging is twice as hard as writing a program in the first place. So if you are as clever as you can be when you write it, how will you ever debug it?"

# Profiling

Toy functions:

```r
slow <- function(n = 3) {
    for (i in seq_len(n * 1000000)) {}
    TRUE
}

fast <- fast1 <- fast2 <- function(n = 1) {
    for (i in seq_len(n * 1000000)) {}
    TRUE
}
```

Simple example -- self and total time:

```r
f1 <- function() {
    fast1()
    slow()
    fast2()
}
Rprof(); f1(); Rprof(NULL); summaryRprof()
```

Re-use -- times are cumulative across all calls to the same function:

```r
f2 <- function() {
    fast1(); fast1(); fast1()
    slow()
    fast1(); fast1(); fast1()
}
Rprof(); f2(); Rprof(NULL); summaryRprof()
```

What to do when the same function appears at several call sites? The flat summary lumps every call to `slow()` together, so it cannot tell the cheap guard from the expensive body:

```r
f3 <- function() {
    ## heavy calculation ahead, so see if it is necessary
    if (!slow())            # check, usually fails
        return()
    if (fast())             # check, usually succeeds
        return()
    slow(); slow(); slow()
}
Rprof(); f3(); Rprof(NULL); summaryRprof()
```

Solution? Line profiling -- not very mature:

```r
source("line-profiling.R", keep.source = TRUE)
Rprof(line.profiling = TRUE); f4(); Rprof(NULL)
summaryRprof(lines = "show")
```

# Memory

Memory profiling requires R compiled with `--enable-memory-profiling`.

Copy-on-write (COW):

```r
x <- 1:10
tracemem(x)
y <- x
x[1] <- 3L    # copy-on-write: x is duplicated so that y is unchanged
```

Function calls:

```r
g <- function(v) sqrt(v[1:5])
tracemem(x <- 1:10)
g(x)    # read-only access -- no copy

h <- function(v) { v[1] <- 2L; v }
tracemem(x <- 1:10)
h(x)    # writes to its argument -- copy-on-write
```

`NAMED` status -- sometimes it is safe to write without a copy:

```r
x <- 1:10
tracemem(x)
x[1] <- 3L                # no copy!

x <- 1:10
.Internal(inspect(x))     # marked safe not to copy
x[1] <- 3L
.Internal(inspect(x))

y <- x
.Internal(inspect(x))     # same point in memory as y
.Internal(inspect(y))     # marked for copy-on-write
x[1] <- 2L
.Internal(inspect(x))     # new memory, no longer copy-on-write
.Internal(inspect(y))     # old memory, still copy-on-write
```

# Common pitfalls

See, e.g., http://bioconductor.org/help/course-materials/2013/CSAMA2013/thursday/morning/morgan-big-data.pdf

Efficient code:

1. Vectorize (see the first sketch below)
2. Pre-allocate and fill (same sketch)
3. Exploit existing software
4. Use appropriate algorithms
5. Avoid expensive conveniences

Big data resource management:

1. Restriction
2. Sampling
3. Iteration (sketch below)
4. Parallelization (sketch below)
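To make the first two "efficient code" principles concrete, here is a minimal timing sketch; the function names `grow`, `fill`, and `vect` are illustrative, not part of the original notes:

```r
n <- 1e5

grow <- function(n) {        # grow the result -- copies on every iteration
    x <- integer()
    for (i in seq_len(n))
        x <- c(x, i * 2L)
    x
}

fill <- function(n) {        # pre-allocate and fill -- no repeated copying
    x <- integer(n)
    for (i in seq_len(n))
        x[i] <- i * 2L
    x
}

vect <- function(n)          # vectorize -- no interpreted loop at all
    seq_len(n) * 2L

identical(grow(n), fill(n))  # TRUE
identical(fill(n), vect(n))  # TRUE
system.time(grow(n))         # slowest
system.time(fill(n))         # much faster
system.time(vect(n))         # fastest
```

Growing with `c()` copies the accumulated result on every iteration, so its total cost is quadratic in `n`; pre-allocation removes the copying, and the vectorized form also removes the loop.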
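"Iteration" means processing data in chunks, keeping only a bounded amount in memory at once. A minimal sketch, assuming a hypothetical plain-text file `big.txt` with one numeric value per line:

```r
## Compute a mean by streaming the file in 10,000-line chunks,
## keeping only running totals in memory.
con <- file("big.txt", "r")    # 'big.txt' is a hypothetical input file
total <- 0; count <- 0
repeat {
    chunk <- readLines(con, n = 10000)
    if (length(chunk) == 0)
        break
    x <- as.numeric(chunk)
    total <- total + sum(x)
    count <- count + length(x)
}
close(con)
total / count                  # mean, without ever loading the whole file
```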
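"Parallelization" pays off for embarrassingly parallel work, where tasks are independent. A minimal sketch using `parallel::mclapply()` (fork-based, so `mc.cores > 1` is not available on Windows); the toy worker `f` is illustrative:

```r
library(parallel)

f <- function(i) { Sys.sleep(0.1); i^2 }    # stand-in for real per-task work

system.time(res1 <- lapply(1:8, f))                   # serial: ~0.8 s
system.time(res2 <- mclapply(1:8, f, mc.cores = 2))   # parallel: ~0.4 s
identical(res1, res2)                                 # same results
```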