## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----echo=FALSE, results='asis'----------------------------------------------- cat(' ') ## ----fig.cap="This figure shows the mapping from a parameter to a collection of repro samples via the generating function. In this case, the blue and red parameters generate repro samples that are 'close' (lies in the prediction set $B_\\alpha$) to the observed statistic, so they are included in the confidence set. On the other hand, the green parameter generates repro samples that are not 'close enough' to the observed statistic, hence it's not included in the confidence set $\\Gamma_\\alpha$. For additional details on the prediction set $B_\\alpha$ and confidence set $\\Gamma_\\alpha$, please refer to the paper. (Taken from Figure 1 in @Awan25.)", out.width="100%", echo=FALSE---- knitr::include_graphics("figs/figure1.png") ## ----setup-------------------------------------------------------------------- library(SimBaRepro) ## ----------------------------------------------------------------------------- n <- 50 # sample size repro_size <- 200 # number of repro samples ## ----------------------------------------------------------------------------- seeds <- matrix(rnorm(repro_size * n), nrow = repro_size, ncol = n) ## ----------------------------------------------------------------------------- s_sample <- function(seeds, theta) { # generate the raw data points data <- theta[1] + sqrt(theta[2]) * seeds[, 1:n] # compute the regular statistics s_mean <- apply(data, 1, mean) s_var <- apply(data, 1, var) return(cbind(s_mean, s_var)) } ## ----------------------------------------------------------------------------- s_obs <- c(1.12, 0.67) ## ----------------------------------------------------------------------------- lower_bds = c(0, 1e-6) upper_bds = c(0, 10) result <- p_value(lower_bds, upper_bds, seeds, s_sample, s_obs) print(result$p_val) # p_value ## ----------------------------------------------------------------------------- lower_bds = c(-1, 1) # lower bounds for mean and variance upper_bds = c(2, 5) # upper bounds for mean and variance result <- p_value(lower_bds, upper_bds, seeds, s_sample, s_obs) print(result$p_val) # the largest p_value found print(result$theta_hat) # the parameter corresponding to the largest p value ## ----------------------------------------------------------------------------- lower_bds <- c(-5, 1e-6) upper_bds <- c(5, 10) parameter_index <- 1 # an integer specifying the parameter of interest alpha <- .05 # significance level tol <- 1e-3 # tolerance # choosing j=1 indicates that we're obtaining a confidence interval for the first argument in s_obs. In this case, it's the mean. mean_CI <- get_CI(alpha, lower_bds, upper_bds, parameter_index, seeds, s_sample, s_obs, tol) print(mean_CI) ## ----------------------------------------------------------------------------- parameter_index <- 2 var_CI <- get_CI(alpha, lower_bds, upper_bds, parameter_index, seeds, s_sample, s_obs, tol) print(var_CI) ## ----------------------------------------------------------------------------- lower_bds <- c(0.5, 1e-6) upper_bds <- c(1.5, 1.5) resolution <- 13 result <- confidence_grid(alpha, lower_bds, upper_bds, seeds, s_sample, s_obs, tol, resolution) indicator_array <- result$ind_array print(result) ## ----------------------------------------------------------------------------- print(c(result$search_lower_bds[1], result$search_upper_bds[1])) print(c(result$search_lower_bds[2], result$search_upper_bds[2])) ## ----------------------------------------------------------------------------- parameter_names <- c("mean", "variance") plot_grid(indicator_array, lower_bds, upper_bds, parameter_names) ## ----------------------------------------------------------------------------- n <- 100 # sample size repro_size <- 200 # number of repro samples seeds <- matrix(rnorm(repro_size * (2 * n)), nrow = repro_size, ncol = 2 * n) s_sample <- function(seeds, theta) { n_obs <- ncol(seeds) / 2 mu1 <- theta[1] mu2 <- theta[2] sigma <- sqrt(theta[3]) seeds1 <- seeds[, 1:n_obs, drop = FALSE] # first n columns for X seeds2 <- seeds[, (n_obs + 1):(2 * n_obs), drop = FALSE] # next n columns for Y # generate bivariate data data1 <- mu1 + sigma * seeds1 # X ~ N(mu_1, sigma^2) data2 <- mu2 + sigma * seeds2 # Y ~ N(mu_2, sigma^2) # compute statistics s_mean1 <- rowMeans(data1) # sample mean of X s_mean2 <- rowMeans(data2) # sample mean of Y s_var1 <- apply(data1, 1, var) # sample variance of X s_var2 <- apply(data2, 1, var) # sample variance of Y return(cbind(s_mean1, s_mean2, s_var1, s_var2)) } s_obs <- c(-0.1, 0.1, 1.05, 0.9) alpha <- 0.05 tol <- 1e-3 lower_bds <- c(-0.5, -0.5, 1e-6) upper_bds <- c(0.5, 0.5, 2) resolution <- 10 result <- confidence_grid(alpha, lower_bds, upper_bds, seeds, s_sample, s_obs, tol, resolution) indicator_array <- result$ind_array ## ----------------------------------------------------------------------------- index_set <- c(1,2) means_array <- grid_projection(indicator_array, index_set) ## ----------------------------------------------------------------------------- means_lower_bds <- lower_bds[index_set] means_upper_bds <- upper_bds[index_set] parameter_names <- c("mu1", "mu2") plot_grid(means_array, means_lower_bds, means_upper_bds, parameter_names) ## ----------------------------------------------------------------------------- index_set2 <- c(1,3) mu1_var_array <- grid_projection(indicator_array, index_set2) index_set3 <- c(2,3) mu2_var_array <- grid_projection(indicator_array, index_set3) ## ----fig.show='hold'---------------------------------------------------------- mu1_var_lower_bds <- lower_bds[index_set2] mu1_var_upper_bds <- upper_bds[index_set2] parameter_names <- c("mu1", "variance") plot_grid(mu1_var_array, mu1_var_lower_bds, mu1_var_upper_bds, parameter_names) mu2_var_lower_bds <- lower_bds[index_set3] mu2_var_upper_bds <- upper_bds[index_set3] parameter_names <- c("mu2", "variance") plot_grid(mu2_var_array, mu2_var_lower_bds, mu2_var_upper_bds, parameter_names) ## ----------------------------------------------------------------------------- n <- 50 # sample size repro_size <- 200 # number of repro samples ## ----------------------------------------------------------------------------- upper_clamp <- 3 lower_clamp <- -3 eps <- 4 # privacy guarantee parameter ## ----------------------------------------------------------------------------- regular_seeds <- matrix(rnorm(repro_size * n), nrow = repro_size, ncol = n) dp_seeds <- matrix((2 * rbinom(n = repro_size * 2, size = 1, prob = 1/2) - 1) * rexp(repro_size * 2), nrow = repro_size, ncol = 2) seeds <- cbind(regular_seeds, dp_seeds) ## ----------------------------------------------------------------------------- s_sample <- function(seeds, theta) { # generate the i.i.d. normal r.v.s using the first n columns of the seeds raw_data <- theta[1] + sqrt(theta[2]) * seeds[, 1:n] # clamp the raw data using the clamps defined above clamped <- pmin(pmax(raw_data, lower_clamp), upper_clamp) # compute the private statistics (adding a properly scaled Laplace noise to the sample mean and sample variance) s_mean <- apply(clamped, 1, mean) + (upper_clamp - lower_clamp) / (n * eps / 2) * seeds[, n+1] s_var <- apply(clamped, 1, var) + (upper_clamp - lower_clamp)^2 / (n * eps / 2) * seeds[, n+2] return(cbind(s_mean, s_var)) } ## ----------------------------------------------------------------------------- s_DP <- c(1.12, 1.07) # define the observed statistic as a vector ## ----------------------------------------------------------------------------- lower_bds <- c(0, 1e-6) upper_bds <- c(1, 10) ## ----------------------------------------------------------------------------- result <- p_value(lower_bds, upper_bds, seeds, s_sample, s_DP) print(result$p_val) # the largest p_value found print(result$theta_hat) # the parameter corresponding to the largest p value ## ----------------------------------------------------------------------------- # search lower bounds and upper bounds for mean and variance alpha <- .05 # significance level tol <- 1e-3 # tolerance lower_bds <- c(-5, 1e-6) upper_bds <- c(5, 10) ## ----------------------------------------------------------------------------- # choose parameter_index=1 indicates that we're obtaining a confidence interval for the first argument in s_DP. In this case, it's the mean. parameter_index <- 1 mean_CI <- get_CI(alpha, lower_bds, upper_bds, parameter_index, seeds, s_sample, s_DP, tol) print(mean_CI) ## ----------------------------------------------------------------------------- parameter_index <- 2 var_CI <- get_CI(alpha, lower_bds, upper_bds, parameter_index, seeds, s_sample, s_DP, tol) print(var_CI) ## ----------------------------------------------------------------------------- lower_bds <- c(0, 1e-6) upper_bds <- c(2, 5) resolution <- 20 result <- confidence_grid(alpha, lower_bds, upper_bds, seeds, s_sample, s_DP, tol, resolution) indicator_array <- result$ind_array print(result) ## ----------------------------------------------------------------------------- parameter_names <- c("mean", "variance") # optional input plot_grid(indicator_array, lower_bds, upper_bds, parameter_names)