## ----knitr, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----load libraries----------------------------------------------------------- library(mgcv) # for datasets and the gam function library(dplyr) # for data manipulation library(ale) ## ----data setup--------------------------------------------------------------- # Create and prepare the data # Specific seed chosen to illustrate the spuriousness of the random variable set.seed(6) math <- # Start with math achievement scores per student MathAchieve |> as_tibble() |> mutate( school = School |> as.character() |> as.integer(), minority = Minority == 'Yes', female = Sex == 'Female' ) |> # summarize the scores to give per-school values summarize( .by = school, minority_ratio = mean(minority), female_ratio = mean(female), math_avg = mean(MathAch), ) |> # merge the summarized student data with the school data inner_join( MathAchSchool |> mutate(school = School |> as.character() |> as.integer()), by = c('school' = 'school') ) |> mutate( public = Sector == 'Public', high_minority = HIMINTY == 1, ) |> select(-School, -Sector, -HIMINTY) |> rename( size = Size, academic_ratio = PRACAD, discrim = DISCLIM, mean_ses = MEANSES, ) |> # Remove ID column for analysis select(-school) |> select( math_avg, size, public, academic_ratio, female_ratio, mean_ses, minority_ratio, high_minority, discrim, everything() ) |> mutate( rand_norm = rnorm(nrow(MathAchSchool)) ) glimpse(math) ## ----y summary---------------------------------------------------------------- summary(math$math_avg) ## ----enable progressr, eval = FALSE------------------------------------------- # # Run this in an R console; it will not work directly within an R Markdown or Quarto block # progressr::handlers(global = TRUE) # progressr::handlers('cli') ## ----train gam model---------------------------------------------------------- gam_math <- gam( math_avg ~ public + high_minority + s(size) + s(academic_ratio) + s(female_ratio) + s(mean_ses) + s(minority_ratio) + s(discrim) + s(rand_norm), data = math ) gam_math ## ----p_funs------------------------------------------------------------------- # # To generate the code, uncomment the following lines. # # But it is slow because it retrains the model 1000 times, # # so this vignette loads a pre-created p-values object. # gam_math_p_funs <- create_p_funs( # math, # gam_math # ) # saveRDS(gam_math_p_funs, file.choose()) gam_math_p_funs <- url('https://github.com/tripartio/ale/raw/main/download/gam_math_p_funs.rds') |> readRDS() ## ----model bootstrap---------------------------------------------------------- mb_gam_math <- model_bootstrap( math, gam_math, # Pass the p_funs object so that p-values will be generated ale_options = list(p_values = gam_math_p_funs), # For the GAM model coefficients, show details of all variables, parametric or not tidy_options = list(parametric = TRUE), # tidy_options = list(parametric = NULL), boot_it = 40, # 100 by default but reduced here for a faster demonstration parallel = 2 # CRAN limit (delete this line on your own computer for faster speed) ) ## ----model_stats-------------------------------------------------------------- mb_gam_math$model_stats ## ----model_coefs-------------------------------------------------------------- mb_gam_math$model_coefs ## ----model_coefs stat sig variables------------------------------------------- mb_gam_math$model_coefs |> # filter is TRUE if conf.low and conf.high are both positive or both negative because # multiplying two numbers of the same sign results in a positive number. filter((conf.low * conf.high) > 0) ## ----all-ALE-plots, fig.width=7, fig.height=10-------------------------------- mb_gam_math$ale$plots |> patchwork::wrap_plots(ncol = 2) ## ----model-bootstrap-without-p_funs, fig.width=7, fig.height=10--------------- mb_gam_no_p <- model_bootstrap( math, gam_math, # For the GAM model coefficients, show details of all variables, parametric or not tidy_options = list(parametric = TRUE), # tidy_options = list(parametric = NULL), boot_it = 40, # 100 by default but reduced here for a faster demonstration parallel = 2 # CRAN limit (delete this line on your own computer) ) mb_gam_no_p$ale$plots |> patchwork::wrap_plots(ncol = 2) ## ----ALE-effects-plot, fig.width=8, fig.height=6------------------------------ mb_gam_math$ale$stats$effects_plot ## ----ALE-plot-for-public-1---------------------------------------------------- mb_gam_math$ale$plots$public ## ----ALE stats for public----------------------------------------------------- mb_gam_math$ale$stats$by_term$public ## ----ALE-plot-forl-academic_ratio-1------------------------------------------- mb_gam_math$ale$plots$academic_ratio ## ----ALE stats for academic_ratio--------------------------------------------- mb_gam_math$ale$stats$by_term$academic_ratio ## ----math-rand_norm-ALE-plot, fig.width=3.5, fig.height=3--------------------- mb_gam_math$ale$plots$rand_norm ## ----ALE stats for rand_norm-------------------------------------------------- mb_gam_math$ale$stats$by_term$rand_norm ## ----ale data for public------------------------------------------------------ mb_gam_math$ale$data$public ## ----ale data for academic_ratio---------------------------------------------- mb_gam_math$ale$data$academic_ratio ## ----ALE-plot-for-mean_ses---------------------------------------------------- mb_gam_math$ale$plots$mean_ses ## ----conf_regions for mean_ses------------------------------------------------ mb_gam_math$ale$conf_regions$by_term$mean_ses ## ----text conf_regions for mean_ses------------------------------------------- ale:::summarize_conf_regions_in_words(mb_gam_math$ale$conf_regions$by_term$mean_ses) ## ----ALE-plot-for-public-2---------------------------------------------------- mb_gam_math$ale$plots$public ## ----conf_regions for public-------------------------------------------------- mb_gam_math$ale$conf_regions$by_term$public ## ----ALE-plot-for-rand_norm--------------------------------------------------- mb_gam_math$ale$plots$rand_norm ## ----conf_regions for rand_norm----------------------------------------------- mb_gam_math$ale$conf_regions$by_term$rand_norm ## ----significant conf_regions------------------------------------------------- mb_gam_math$ale$conf_regions$significant ## ----variables with significant conf_regions---------------------------------- mb_gam_math$ale$conf_regions$significant$term |> unique()