--- title: "Normal approximation vs. empirical simulation" subtitle: '' author: "Developed by [Gabriel Hoffman](http://gabrielhoffman.github.io/)" date: "Run on `r Sys.time()`" documentclass: article output: BiocStyle::html_document vignette: > %\VignetteIndexEntry{crumblr_theory} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} %\usepackage[utf8]{inputenc} --- ```{r setup, echo=FALSE, results="hide"} knitr::opts_chunk$set( tidy = FALSE, cache = TRUE, echo = FALSE, dev = c("png", "pdf"), package.startup.message = FALSE, message = FALSE, error = FALSE, warning = FALSE # collapse = TRUE, # comment = "#>", # fig.path = "" # Added this line to the standard setup chunk) ) options(width = 100) ```

## Asymptotic normal approximation Let the vector ${\bf p}$ be the true fractions across $D$ categories. Consider $C$ total counts sampled from a Dirichlet-multinomial (DMN) distribution with overdispersion $\tau$, where $\tau=1$ reduces to the multinomial distribution. The [centered log ratio](https://rdrr.io/cran/compositions/man/clr.html) (CLR) of the $i^{th}$ estimated fraction in ${\bf \hat p}$ is \begin{equation} \tag{1} \text{clr}_i({\bf \hat p}) = \log(\hat p_i) - \frac{1}{D}\sum_{j=1}^D \log(\hat p_j) \end{equation} and we show that the sampling variance is \begin{equation} \tag{2} \text{var}[\text{clr}_i({\bf \hat p})] = \frac{\tau}{C} \left[ \frac{1}{\hat p_i} - \frac{2}{ D \hat p_i} + \frac{1}{D^2}\sum_{j=1}^D \frac{1}{\hat p_j} \right]. \label{eqn2} \end{equation} ## Simulations The sampling variance is derived from asymototic theory, so we examine its behavior for finite total counts. Here we evaluate the empirical variance from $1,000$ draws from a Dirichlet-multinomial distribution while varying $D$, $\tau$, $C$. A pseudocount of 0.5 is added to the observed counts since the asymptotic theory is not defined for counts of zero. Here we plot the standard deviation after CLR transform from the empirical DMN and the asymptotic normal approximation under a range of conditions. Results are shown for instances with at least 2 counts.

```{r define.fxn, cache=FALSE} library(ggplot2) library(crumblr) library(HMP) library(parallel) library(glue) library(tidyverse) rmultinomdir <- function(n, size, alpha) { p <- rdirichlet(n, alpha) res <- lapply(seq(1, n), function(i) { rmultinom(1, size, prob = p[i, ]) }) res <- do.call(cbind, res) t(res) } run_sim <- function(prop_other = NULL, pseudocount = .5) { n_sims <- 1000 j <- 1 # Some thoughts on counts in sequencing studies. NAR G&B # doi: 10.1093/nargab/lqaa094 df_res <- lapply(c(500, 5000), function(countTotal) { countsOther <- sum(prop_other) * countTotal # evaluate at set of equally spaced integers targetValues <- seq(1, max(countTotal - countsOther - 1, 1), length.out = 100) targetValues <- unique(targetValues) df_res <- mclapply(targetValues, function(targetCount) { df_res <- lapply(c(1, 5, 10), function(tau) { targetCount2 <- max(countTotal - targetCount - countsOther, 0) # create vector of alpha values if (!is.null(prop_other)) { expectedCounts <- c(targetCount, targetCount2, prop_other * countTotal) } else { expectedCounts <- c(targetCount, targetCount2) } p <- expectedCounts / sum(expectedCounts) if (tau == 1) { # large a0 corresponds to multinomial alpha <- p * 1e9 } else if (tau < 1) { stop("tau must be greater than 1") } else { # convert tau overdispersion value to alpha from p a0 <- (countTotal - tau) / (tau - 1) alpha <- p * a0 } # Multinomial-Dirichlet counts <- Dirichlet.multinomial(rep(countTotal, n_sims), alpha) keep <- colMeans(counts) > 2 keep[1:2] <- TRUE counts <- counts[, keep, drop = FALSE] expectedCounts <- expectedCounts[keep] # transform x_clr <- clr(counts, pseudocount) # mu_sample_md = colMeans(x_clr) v_empirical <- apply(x_clr, 2, var) # expected variance based on *true* values D <- ncol(counts) p <- (expectedCounts + pseudocount) / sum(expectedCounts + pseudocount) v_theoretical_est <- tau * (1 / p - 2 / (p * D) + sum(1 / p) / D^2) / countTotal data.frame( targetCount = targetCount, countTotal = countTotal, tau = tau, countsOther = countsOther, v_empirical = v_empirical[j], v_theoretical_est = v_theoretical_est[j] ) }) do.call(rbind, df_res) }, mc.cores = 10) do.call(rbind, df_res) }) df_res <- do.call(rbind, df_res) df_res } ``` ### D=2 categories ```{r sim1, cache=TRUE} df_res <- run_sim() # "v_theoretical", cols <- c("tau", "targetCount", "countTotal", "v_empirical", "v_theoretical_est") df_v <- reshape2::melt(df_res[, cols], id.vars = cols[1:3]) # set facets df_v$facet_x <- with(df_v, glue('tau*" = {tau}"')) df_v$facet_x <- factor(df_v$facet_x, unique(df_v$facet_x)) df_v$facet_y <- with(df_v, glue('C*" = {countTotal}"')) df_v$facet_y <- factor(df_v$facet_y, unique(df_v$facet_y)) ``` ```{r plot1, fig.height=6, fig.align = 'center'} df_sub <- df_v %>% filter(targetCount >= 2 & (countTotal - targetCount) >= 2) ggplot() + geom_point(data = filter(df_sub, variable == "v_empirical"), aes(targetCount / countTotal, sqrt(value), color = variable)) + geom_line(data = filter(df_sub, variable != "v_empirical"), aes(targetCount / countTotal, sqrt(value), color = variable), linewidth = 1) + theme_classic() + theme(aspect.ratio = 1, plot.title = element_text(hjust = 0.5), legend.position = "bottom") + facet_grid(facet_y ~ facet_x, labeller = label_parsed, scales = "free_y") + xlab("Proportion") + ylab("Standard Deviation") + ggtitle("Standard deviations from empirical DMN and asymptotic normal approximation") + scale_color_manual(name = "Method", values = c("blue4", "red"), labels = c("Empirical (DMN)", "Asymptotic normal approx.")) + scale_linetype_manual(name = "Method", values = c(1, 2), labels = c("Empirical (DMN)", "Asymptotic normal approx.")) + scale_y_continuous(limits = c(0, NA)) ``` ### D=15 categories ```{r sim2, cache=FALSE} prop_other <- rep(.05, 15) df_res <- run_sim(prop_other) cols <- c("tau", "targetCount", "countTotal", "countsOther", "v_empirical", "v_theoretical_est") df_v <- reshape2::melt(df_res[, cols], id.vars = cols[1:4]) # set facets df_v$facet_x <- with(df_v, glue('tau*" = {tau}"')) df_v$facet_x <- factor(df_v$facet_x, unique(df_v$facet_x)) df_v$facet_y <- with(df_v, glue('C*" = {countTotal}"')) df_v$facet_y <- factor(df_v$facet_y, unique(df_v$facet_y)) ``` ```{r plot2, fig.height=6, cache=FALSE, fig.align = 'center'} df_sub <- df_v %>% filter(targetCount >= 2 & (countTotal - targetCount - countsOther) >= 2) ggplot() + geom_point(data = filter(df_sub, variable == "v_empirical"), aes(targetCount / countTotal, sqrt(value), color = variable)) + geom_line(data = filter(df_sub, variable != "v_empirical"), aes(targetCount / countTotal, sqrt(value), color = variable), linewidth = 1) + theme_classic() + theme(aspect.ratio = 1, plot.title = element_text(hjust = 0.5), legend.position = "bottom") + facet_grid(facet_y ~ facet_x, labeller = label_parsed, scales = "free_y") + xlab("Proportion") + ylab("Standard Deviation") + ggtitle("Standard deviations from empirical DMN and asymptotic normal approximation") + scale_color_manual(name = "Method", values = c("blue4", "red"), labels = c("Empirical (DMN)", "Asymptotic normal approx.")) + scale_linetype_manual(name = "Method", values = c(1, 2), labels = c("Empirical (DMN)", "Asymptotic normal approx.")) + scale_y_continuous(limits = c(0, NA)) ```

### Interpretation The asymptotic standard deviation shows good agreement with the empirical results even for small values of $C$, *when at least 2 counts are observed*. In practice, it is often reasonable to assume a sufficient number of counts before a variable is included in an analysis. Importantly, with less than 2 counts the asymptotic theory gives a *larger* standard deviation than the emprical results (results not shown). Therefore, this approach is conservative and should not underestimate the true amount of variation. The asymptotic normal approximation is most accurate for large total counts $C$, large proportions $p$, and small overdispersion $\tau$.

```{r for.manuscript, eval=FALSE, cache=FALSE} df_sub <- df_v %>% filter(targetCount >= 2 & (countTotal - targetCount - countsOther) >= 2) %>% filter(tau == 10 & countTotal == 5000) fig <- ggplot() + geom_point(data = filter(df_sub, variable == "v_empirical"), aes(targetCount / countTotal, sqrt(value), color = variable)) + geom_line(data = filter(df_sub, variable != "v_empirical"), aes(targetCount / countTotal, sqrt(value), color = variable), linewidth = 1) + theme_classic() + theme(aspect.ratio = 1, plot.title = element_text(hjust = 0.5), legend.position = c(.4, .7), legend.justification = c(0, 0)) + xlab("Proportion") + ylab("Standard Deviation") + scale_color_manual(name = "Method", values = c("blue4", "red"), labels = c("Empirical (DMN)", "Asymptotic normal approx.")) + scale_linetype_manual(name = "Method", values = c(1, 2), labels = c("Empirical (DMN)", "Asymptotic normal approx.")) + ggtitle(expression(tau == 10 ~ C == 5000 ~ D == 15)) + scale_y_continuous(limits = c(0, NA), expand = c(0, 0)) ggsave("sim_figure.pdf", fig, height = 4, width = 4) ``` #### Consideration of overdispersion Based on Equation (2), the variance of the CLR-transformed proportions is a _linear_ function of $\tau$. Importantly, downstream analysis of the CLR-transformed proportions with a precision-weighted linear (mixed) model or a variance stabilizing transform depends only on the _relative_ variances. Since relative variances are invariant to the scale of $\tau$, for these applications the value of $\tau$ can be set to 1 instead of being estimated from the data. For other applications, `crumblr` can estimate $\tau$ from the data by using `crumblr(counts, tau=NULL)`. This calls `dmn.mle()` to estimate the parameters of the DMN distribution and is substantially faster than alternatives. # Session Info

```{r session, echo=FALSE} sessionInfo() ```