## ----setup, include = FALSE--------------------------------------------------- library(ordbetareg) library(dplyr) library(ggplot2) library(haven) library(brms) library(tidyr) library(stringr) library(Hmisc) library(modelsummary) library(marginaleffects) knitr::opts_chunk$set( collapse = TRUE, comment = "#>", warn=FALSE ) knitr::opts_chunk$set( fig.align="center", fig.width=7, fig.height=3 ) options(rmarkdown.html_vignette.check_title = FALSE) options(modelsummary_factory_default = "gt") set.seed(628871) ## ----runmod------------------------------------------------------------------- # whether to run models from scratch run_model <- F # number of iterations for sampling niters <- 200 # number of warmup iterations nwarmup <- 100 ## ----load_data---------------------------------------------------------------- data("pew") pew %>% ggplot(aes(x=as.numeric(therm))) + geom_histogram(bins=100) + theme_minimal() + theme(panel.grid=element_blank()) + scale_x_continuous(breaks=c(0,25,50,75,100), labels=c("0","Colder","50","Warmer","100")) + ylab("") + xlab("") + labs(caption=paste0("Figure shows the distribution of ",sum(!is.na(pew$therm))," non-missing survey responses.")) ## ----munge_data--------------------------------------------------------------- model_data <- select(pew,therm,age="F_AGECAT_FINAL", sex="F_SEX_FINAL", income="F_INCOME_FINAL", ideology="F_IDEO_FINAL", race="F_RACETHN_RECRUITMENT", education="F_EDUCCAT2_FINAL", region="F_CREGION_FINAL", approval="POL1DT_W28", born_again="F_BORN_FINAL", relig="F_RELIG_FINAL", news="NEWS_PLATFORMA_W28") %>% mutate_at(c("race","ideology","income","approval","sex","education","born_again","relig"), function(c) { factor(c, exclude=levels(c)[length(levels(c))]) }) %>% # need to make these ordered factors for BRMS mutate(education=ordered(education), income=ordered(income)) ## ----run_ordbetareg----------------------------------------------------------- if(run_model) { ord_fit_mean <- ordbetareg(formula=therm ~ mo(education)*mo(income) + (1|region), data=model_data, control=list(adapt_delta=0.95), cores=1,chains=1,iter=niters,warmup=nwarmup, refresh=0) # NOTE: to do parallel processing within chains # add the options below #threads=5, #backend="cmdstanr" #where threads is the number of cores per chain # you must have cmdstanr set up to do so # see https://mc-stan.org/cmdstanr/ } else { data("ord_fit_mean") } ## ----plot_cut----------------------------------------------------------------- all_draws <- prepare_predictions(ord_fit_mean) cutzero <- plogis(all_draws$dpars$cutzero) cutone <- plogis(all_draws$dpars$cutzero + exp(all_draws$dpars$cutone)) pew %>% ggplot(aes(x=therm)) + geom_histogram(bins=100) + theme_minimal() + theme(panel.grid=element_blank()) + scale_x_continuous(breaks=c(0,25,50,75,100), labels=c("0","Colder","50","Warmer","100")) + geom_vline(xintercept = mean(cutzero)*100,linetype=2) + geom_vline(xintercept = mean(cutone)*100,linetype=2) + ylab("") + xlab("") + labs(caption=paste0("Figure shows the distribution of ",sum(!is.na(pew$therm))," non-missing survey responses.")) ## ----post_predict,message=F--------------------------------------------------- # new theme option will add in new ggplot2 themes or themes # from other packages plots <- pp_check_ordbeta(ord_fit_mean, ndraws=100, outcome_label="Thermometer Rating", new_theme=ggthemes::theme_economist()) plots$discrete plots$continuous ## ----posterior_epred_ordbeta-------------------------------------------------- # average probability of a respondent choosing either 0 or 100 # i.e., the top and bottom components of the scale posterior_epred_ordbeta(ord_fit_mean,component = "top") %>% hist posterior_epred_ordbeta(ord_fit_mean,component = "bottom") %>% hist ## ----plot_heiss--------------------------------------------------------------- # turn off caption as it doesn't fit well in vignette mode plot_heiss(ord_fit_mean, grouping_fac="education", ndraws = 100, plot_caption = "") ## ----coef_plot---------------------------------------------------------------- library(modelsummary) modelsummary(ord_fit_mean,statistic = "conf.int", metrics = "RMSE", coef_map=c("b_Intercept"="Intercept", "bsp_moeducation"="Education", "bsp_moincome"="Income", "bsp_moeducation:moincome"="EducationXIncome")) ## ----marg_effect-------------------------------------------------------------- avg_slopes(ord_fit_mean, variables="education") %>% select(Variable="term", Level="contrast", `5% Quantile`="conf.low", `Posterior Mean`="estimate", `95% Quantile`="conf.high") %>% knitr::kable(caption = "Marginal Effect of Education on Professor Thermometer", format.args=list(digits=2), align=c('llccc')) ## ----mult_variate------------------------------------------------------------- # generate a new Gaussian/Normal outcome with same predictor X and mediator # Z X <- runif(n = 100,0,1) Z <- rnorm(100, mean=3*X) # use logit function to map unbounded continuous data to [0,1] interval # X is mediated by Z outcome <- rordbeta(n=100, mu = plogis(.4 * X + 1.5 * Z)) # use the bf function from brms to specify two formulas/responses # set_rescor must be FALSE as one distribution is not Gaussian (ordered beta) # OLS for mediator mod1 <- bf(Z ~ X,family = gaussian) # ordered beta mod2 <- bf(outcome ~ X + Z) if(run_model) { fit_multivariate <- ordbetareg(formula=mod1 + mod2 + set_rescor(FALSE), data=tibble(outcome=outcome, X=X,Z=Z), cores=1,chains=1, iter=niters,warmup=nwarmup) } # need to calculate each sub-model's marginal effects separately knitr::kable(avg_slopes(fit_multivariate,resp="outcome")) knitr::kable(avg_slopes(fit_multivariate, resp="Z")) suppressWarnings(modelsummary(fit_multivariate,statistic = "conf.int", metrics="none")) ## ----mediation---------------------------------------------------------------- bayestestR::mediation(fit_multivariate) ## ----run_brms_phi------------------------------------------------------------- if(run_model) { ord_fit_phi <- ordbetareg(bf(therm ~ 1, phi ~ age + sex), phi_reg = "only", data=model_data, cores=2,chains=2,iter=niters,warmup=nwarmup, refresh=0) # NOTE: to do parallel processing within chains # add the options below #threads=threading(5), #backend="cmdstanr" #where threads is the number of cores per chain # you must have cmdstanr set up to do so # see https://mc-stan.org/cmdstanr/ } else { data("ord_fit_phi") } ## ----phicoef------------------------------------------------------------------ summary(ord_fit_phi) ## ----plot_phi_sim------------------------------------------------------------- # we can use some dplyr functions to make this really easy female_data <- distinct(model_data, age) %>% mutate(sex="Female") male_data <- distinct(model_data, age) %>% mutate(sex="Male") to_predict <- bind_rows(female_data, male_data) %>% filter(!is.na(age)) pred_post <- posterior_predict(ord_fit_phi, newdata=to_predict) # better with iterations as rows pred_post <- t(pred_post) colnames(pred_post) <- 1:ncol(pred_post) # need to convert to a data frame data_pred <- as_tibble(pred_post) %>% mutate(sex=to_predict$sex, age=to_predict$age) %>% gather(key="iter",value='estimate',-sex,-age) data_pred %>% ggplot(aes(x=estimate)) + geom_density(aes(fill=sex),alpha=0.5,colour=NA) + scale_fill_viridis_d() + theme(panel.background = element_blank(), panel.grid=element_blank()) ## ----cutprior_model----------------------------------------------------------- #| eval: false ## # we'll use our data, although we won't fit this model ## ## cutpoint_mod <- ordbetareg(formula=bf(therm ~ education + ## (1|region), ## cutzero ~ education, ## cutone ~ education), ## data=model_data, ## manual_prior=set_prior("normal(0,5)") + ## set_prior("normal(0,5)", class="b",dpar="cutone") + ## set_prior("normal(0,5)", class="b",dpar="cutzero"), ## control=list(adapt_delta=0.95), ## cores=1,chains=1,iter=500, ## refresh=0) ## ## ----check_data,eval=FALSE---------------------------------------------------- ## ## # NOT RUN IN THE VIGNETTE ## ## single_data <- sim_ordbeta(N=100,iter=1, ## return_data=T) ## ## # examine the first dataset ## ## knitr::kable(head(single_data$data[[1]])) ## ## ----sim_data_full------------------------------------------------------------ if(run_model) { sim_data <- sim_ordbeta(N=c(250,500,750), k=1, beta_coef = .5, iter=100,cores=10, beta_type="binary", treat_assign=0.3) } else { data("sim_data") } ## ----sim_plot----------------------------------------------------------------- sim_data %>% select(`Proportion S Errors`="s_err",N,Power="power", `M Errors`="m_err",Variance="var_marg") %>% gather(key = "type",value="estimate",-N) %>% ggplot(aes(y=estimate,x=N)) + #geom_point(aes(colour=model),alpha=0.1) + stat_summary(fun.data="mean_cl_boot") + ylab("") + xlab("N") + scale_x_continuous(breaks=c(250,500,750)) + scale_color_viridis_d() + facet_wrap(~type,scales="free_y",ncol = 2) + labs(caption=stringr::str_wrap("Summary statistics calculated as mean with bootstrapped confidence interval from simulation draws. M Errors and S errors are magnitude of bias (+1 equals no bias) and incorrect sign of the estimated marginal effect respectively. Variance refers to estimated posterior variance (uncertainty) of the marginal effect(s).",width=50)) + guides(color=guide_legend(title=""), linetype=guide_legend(title="")) + theme_minimal() + theme(plot.caption = element_text(size=7), axis.text.x=element_text(size=8)) ## ----------------------------------------------------------------------------- library(DeclareDesign) # helper function for glmmTMB ordbetareg fit tidy_avg_slopes <- function(x) { tidy(avg_slopes(x)) } # first, a simulated proportion using rordbeta (ordered beta distribution) # see https://osf.io/preprints/socarxiv/2sx6y # cutpoints = number of observations at the bound (i.e. 0 or 1) # phi = dispersion, a value of 1 means relatively "flat" design_rordbeta <- declare_model( N = 100, potential_outcomes(Y ~ rordbeta(N, mu = .5 + .05*Z,phi = 1, cutpoints=c(-3,3) )) ) + declare_inquiry(ATE = 0.05) + declare_assignment(Z = complete_ra(N, m = 50)) + declare_measurement(Y = reveal_outcomes(Y ~ Z)) + declare_estimator(Y ~ Z, .method = glmmTMB::glmmTMB, inquiry = "ATE", family=glmmTMB::ordbeta, .summary= tidy_avg_slopes, term="Z") ## ----eval=FALSE--------------------------------------------------------------- ## ## # create two designs with identical treatment effects/expected values ## # first uses rordbeta to generate proportion in [0,1] ## # second uses rordbeta to generate proportion in [0,1], then ## # dichotomizes to 0 or 1 by rounding at 0.5 ## # compare to see which has greater power given same number of obsevations ## # & check for bias ## ## # equivalent to dichotomizing (if a proportion) ## design_dichot <- ## declare_model( ## N = 100, ## potential_outcomes(Y ~ as.numeric(rordbeta(N, mu = .5 + .05*Z,phi = 1, ## cutpoints=c(-3,3))>0.5)) ## ) + ## declare_inquiry(ATE = 0.05) + ## declare_assignment(Z = complete_ra(N, m = 50)) + ## declare_measurement(Y = reveal_outcomes(Y ~ Z)) + ## declare_estimator(Y ~ Z, .method = lm_robust, inquiry = "ATE") ## ## # NB: DeclareDesign is using lm_robust as it's default estimation ## # However, it should be unbiased for the mean/expected value for the ## # binomial/dichotomized model ## ## diagnosands <- ## declare_diagnosands(bias = mean(estimate - estimand), ## power = mean(p.value <= 0.05)) ## ## # compare in terms of bias on the ATE & Power ## ## diagnose_design(design_rordbeta, diagnosands = diagnosands) ## diagnose_design(design_dichot, diagnosands = diagnosands) ##