## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----setup, message=FALSE, warning=FALSE, echo=FALSE-------------------------- library(dplyr) # For data manipulation library(survey) # For complex survey analysis library(srvyr) # For complex survey analysis with dplyr syntax library(svrep) ## ----------------------------------------------------------------------------- library(survey) # For complex survey analysis library(svrep) set.seed(2022) # Load an example dataset from a multistage sample, with two stages of SRSWOR data("mu284", package = 'survey') multistage_srswor_design <- svydesign(data = mu284, ids = ~ id1 + id2, fpc = ~ n1 + n2) bootstrap_rep_design <- as_bootstrap_design(multistage_srswor_design, type = "Rao-Wu-Yue-Beaumont", replicates = 500) svytotal(x = ~ y1, design = multistage_srswor_design) svytotal(x = ~ y1, design = bootstrap_rep_design) ## ----eval=FALSE--------------------------------------------------------------- # # Load example dataset of U.S. counties and states with 2004 Presidential vote counts # data("election", package = 'survey') # pps_wor_design <- svydesign(data = election_pps, # pps = HR(), # fpc = ~ p, # Inclusion probabilities # ids = ~ 1) # bootstrap_rep_design <- as_bootstrap_design(pps_wor_design, # type = "Rao-Wu-Yue-Beaumont", # replicates = 100) # # svytotal(x = ~ Bush + Kerry, design = pps_wor_design) # svytotal(x = ~ Bush + Kerry, design = bootstrap_rep_design) ## ----------------------------------------------------------------------------- # Declare a multistage design # where first-stage probabilities are PPSWOR sampling # and second-stage probabilities are based on SRSWOR multistage_design <- svydesign( data = library_multistage_sample, ids = ~ PSU_ID + SSU_ID, probs = ~ PSU_SAMPLING_PROB + SSU_SAMPLING_PROB, pps = "brewer" ) # Convert to a bootstrap replicate design boot_design <- as_bootstrap_design( design = multistage_design, type = "Rao-Wu-Yue-Beaumont", samp_method_by_stage = c("PPSWOR", "SRSWOR"), replicates = 1000 ) # Compare variance estimates svytotal(x = ~ TOTCIR, na.rm = TRUE, design = multistage_design) svytotal(x = ~ TOTCIR, na.rm = TRUE, design = boot_design) ## ----------------------------------------------------------------------------- make_quad_form_matrix( variance_estimator = "SD2", cluster_ids = c(1,2,3,4,5) |> data.frame(), strata_ids = c(1,1,1,1,1) |> data.frame(), sort_order = c(1,2,3,4,5) ) ## ----eval=TRUE---------------------------------------------------------------- # Load an example dataset of a stratified systematic sample data('library_stsys_sample', package = 'svrep') # First, sort the rows in the order used in sampling library_stsys_sample <- library_stsys_sample |> dplyr::arrange(SAMPLING_SORT_ORDER) # Create a survey design object survey_design <- svydesign( data = library_stsys_sample, ids = ~ 1, strata = ~ SAMPLING_STRATUM, fpc = ~ STRATUM_POP_SIZE ) # Obtain the quadratic form for the target estimator sd2_quad_form <- get_design_quad_form( design = survey_design, variance_estimator = "SD2" ) class(sd2_quad_form) dim(sd2_quad_form) ## ----------------------------------------------------------------------------- # Obtain weighted values wtd_y <- as.matrix(library_stsys_sample[['LIBRARIA']] / library_stsys_sample[['SAMPLING_PROB']]) wtd_y[is.na(wtd_y)] <- 0 # Obtain point estimate for a population total point_estimate <- sum(wtd_y) # Obtain the variance estimate using the quadratic form variance_estimate <- t(wtd_y) %*% sd2_quad_form %*% wtd_y std_error <- sqrt(variance_estimate[1,1]) # Summarize results sprintf("Estimate: %s", round(point_estimate)) sprintf("Standard Error: %s", round(std_error)) ## ----------------------------------------------------------------------------- # Load example data from stratified systematic sample data('library_stsys_sample', package = 'svrep') # First, ensure data are sorted in same order as was used in sampling library_stsys_sample <- library_stsys_sample[ order(library_stsys_sample$SAMPLING_SORT_ORDER), ] # Create a survey design object design_obj <- svydesign( data = library_stsys_sample, strata = ~ SAMPLING_STRATUM, ids = ~ 1, fpc = ~ STRATUM_POP_SIZE ) ## ----------------------------------------------------------------------------- # Convert to generalized bootstrap replicate design gen_boot_design_sd2 <- as_gen_boot_design( design = design_obj, variance_estimator = "SD2", replicates = 2000 ) # Estimate sampling variances svymean(x = ~ TOTSTAFF, na.rm = TRUE, design = gen_boot_design_sd2) ## ----eval=FALSE--------------------------------------------------------------- # # Load example data of a PPS survey of counties and states # data('election', package = 'survey') # # # Create survey design object # pps_design_ht <- svydesign( # data = election_pps, # id = ~1, fpc = ~p, # pps = ppsmat(election_jointprob), # variance = "HT" # ) # # # # Convert to generalized bootstrap replicate design # gen_boot_design_ht <- pps_design_ht |> # as_gen_boot_design(variance_estimator = "Horvitz-Thompson", # replicates = 5000, tau = "auto") # # # Compare sampling variances from bootstrap vs. Horvitz-Thompson estimator # svytotal(x = ~ Bush + Kerry, design = pps_design_ht) # svytotal(x = ~ Bush + Kerry, design = gen_boot_design_ht) ## ----------------------------------------------------------------------------- library(dplyr) # For data manipulation # Create a multistage survey design multistage_design <- svydesign( data = library_multistage_sample |> mutate(Weight = 1/SAMPLING_PROB), ids = ~ PSU_ID + SSU_ID, fpc = ~ PSU_POP_SIZE + SSU_POP_SIZE, weights = ~ Weight ) # Convert to a generalized bootstrap design multistage_boot_design <- as_gen_boot_design( design = multistage_design, variance_estimator = "Stratified Multistage SRS" ) # Compare variance estimates svytotal(x = ~ TOTCIR, na.rm = TRUE, design = multistage_design) svytotal(x = ~ TOTCIR, na.rm = TRUE, design = multistage_boot_design) ## ----------------------------------------------------------------------------- # View overall scale factor overall_scale_factor <- multistage_boot_design$scale print(overall_scale_factor) # Check that the scale factor was calculated correctly tau <- multistage_boot_design$tau print(tau) B <- ncol(multistage_boot_design$repweights) print(B) print( (tau^2) / B ) ## ----------------------------------------------------------------------------- # Load an example dataset of a stratified systematic sample data('library_stsys_sample', package = 'svrep') # Represent the SD2 successive-difference estimator as a quadratic form, # and obtain the matrix of that quadratic form sd2_quad_form <- make_quad_form_matrix( variance_estimator = 'SD2', cluster_ids = library_stsys_sample |> select(FSCSKEY), strata_ids = library_stsys_sample |> select(SAMPLING_STRATUM), strata_pop_sizes = library_stsys_sample |> select(STRATUM_POP_SIZE), sort_order = library_stsys_sample |> pull("SAMPLING_SORT_ORDER") ) ## ----------------------------------------------------------------------------- rep_adj_factors <- make_gen_boot_factors( Sigma = sd2_quad_form, num_replicates = 500, tau = "auto" ) ## ----------------------------------------------------------------------------- tau <- attr(rep_adj_factors, 'tau') B <- ncol(rep_adj_factors) ## ----------------------------------------------------------------------------- # Retrieve value of 'scale' rep_adj_factors |> attr('scale') # Compare to manually-calculated value (tau^2) / B # Retrieve value of 'rscales' rep_adj_factors |> attr('rscales') |> head() # Only show first 5 values ## ----------------------------------------------------------------------------- gen_boot_design <- svrepdesign( data = library_stsys_sample |> mutate(SAMPLING_WEIGHT = 1/SAMPLING_PROB), repweights = rep_adj_factors, weights = ~ SAMPLING_WEIGHT, combined.weights = FALSE, type = "other", scale = attr(rep_adj_factors, 'scale'), rscales = attr(rep_adj_factors, 'rscales') ) ## ----------------------------------------------------------------------------- gen_boot_design |> svymean(x = ~ TOTSTAFF, na.rm = TRUE, deff = TRUE) ## ----------------------------------------------------------------------------- library(survey) data('api', package = 'survey') # Declare a bootstrap survey design object ---- boot_design <- svydesign( data = apistrat, weights = ~pw, id = ~1, strata = ~stype, fpc = ~fpc ) |> svrep::as_bootstrap_design(replicates = 5000) # Produce estimates of interest, and save the estimate from each replicate ---- estimated_means_and_proportions <- svymean(x = ~ api00 + api99 + stype, design = boot_design, return.replicates = TRUE) # Estimate the number of replicates needed to obtain a target simulation CV ---- estimate_boot_reps_for_target_cv( svrepstat = estimated_means_and_proportions, target_cv = c(0.01, 0.05, 0.10) ) ## ----------------------------------------------------------------------------- estimate_boot_sim_cv(estimated_means_and_proportions)