## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
 collapse = TRUE,
 comment = "#>",
 fig.width = 7,
 fig.height = 4,
 fig.align = "center"
)

## ----load libraries-----------------------------------------------------------
library(riskdiff)
library(dplyr)
library(ggplot2)

## ----load example data--------------------------------------------------------
data(cachar_sample)

# Basic structure
glimpse(cachar_sample)

# Key variables for our analysis
cachar_sample %>%
  select(abnormal_screen, areca_nut, smoking, alcohol, age, sex, residence) %>%
  summary()

## ----simple unadjusted analysis-----------------------------------------------
# Simple unadjusted analysis
rd_simple <- calc_risk_diff(
  data = cachar_sample,
  outcome = "abnormal_screen",
  exposure = "areca_nut"
)

print(rd_simple)

## ----visualisation------------------------------------------------------------
# Let's visualize what this risk difference means
exposure_summary <- cachar_sample %>%
  group_by(areca_nut) %>%
  summarise(
    n = n(),
    cases = sum(abnormal_screen),
    risk = mean(abnormal_screen),
    se = sqrt(risk * (1 - risk) / n)
  ) %>%
  mutate(
    risk_percent = risk * 100,
    se_percent = se * 100
  )

print(exposure_summary)

# The risk difference is simply:
rd_value <- diff(exposure_summary$risk)
cat("Risk difference:", round(rd_value * 100, 1), "percentage points\n")

## ----adjusted analysis--------------------------------------------------------
# Age and sex adjusted analysis
rd_adjusted <- calc_risk_diff(
  data = cachar_sample,
  outcome = "abnormal_screen",
  exposure = "areca_nut",
  adjust_vars = c("age", "sex")
)
print(rd_adjusted)

# Show comparison in a readable format
cat("\n=== COMPARISON: UNADJUSTED vs ADJUSTED ===\n")
cat("Unadjusted Risk Difference:", sprintf("%.2f%%", rd_simple$rd * 100), 
    sprintf("(%.2f%%, %.2f%%)", rd_simple$ci_lower * 100, rd_simple$ci_upper * 100), "\n")
cat("Adjusted Risk Difference:  ", sprintf("%.2f%%", rd_adjusted$rd * 100), 
    sprintf("(%.2f%%, %.2f%%)", rd_adjusted$ci_lower * 100, rd_adjusted$ci_upper * 100), "\n")
cat("Difference in estimates:   ", sprintf("%.2f%%", (rd_adjusted$rd - rd_simple$rd) * 100), "percentage points\n")

## ----stratified analysis------------------------------------------------------
# Stratified by residence (urban vs rural)
rd_stratified <- calc_risk_diff(
  data = cachar_sample,
  outcome = "abnormal_screen",
  exposure = "areca_nut",
  strata = "residence"
)

print(rd_stratified)

## ----visualise stratified results---------------------------------------------

# Stratified analysis by residence
rd_stratified <- calc_risk_diff(
  data = cachar_sample,
  outcome = "abnormal_screen",
  exposure = "areca_nut",
  strata = "residence"
)

print(rd_stratified)

# Check if estimates are stable enough for visualization
rd_summary <- rd_stratified %>%
  summarise(
    n_valid = sum(!is.na(rd)),
    max_ci_width = max((ci_upper - ci_lower) * 100, na.rm = TRUE),
    .groups = "drop"
  )

# Only create plot if we have reasonable estimates
if (rd_summary$n_valid > 0 && rd_summary$max_ci_width < 200) {
  # Use the fixed forest plot function
  plot_obj <- create_forest_plot(
    rd_stratified, 
    title = "Risk Difference for Cancer by Areca Nut Chewing",
    max_ci_width = 30
  )
  
  if (!is.null(plot_obj)) {
    print(plot_obj)
  } else {
    cat(paste0(riskdiff:::.safe_warning()), "Could not create plot due to unstable estimates.\n")
  }
} else {
  cat(paste0(riskdiff:::.safe_warning()), "Estimates too unstable for meaningful visualization.\n")
  cat("Consider:\n")
  cat(paste0(riskdiff:::.safe_bullet()), "Larger sample sizes\n") 
  cat(paste0(riskdiff:::.safe_bullet()), "Different stratification variables\n")
  cat(paste0(riskdiff:::.safe_bullet()), "Pooled analysis instead of stratification\n")
  
  # Show tabular results instead using the robust summary function
  formatted_results <- create_summary_table(
    rd_stratified, 
    caption = "Risk Differences by Residence"
  )
  
  knitr::kable(formatted_results, caption = "Risk Differences by Residence")
}

## ----convergence handling-----------------------------------------------------
# Force different link functions
rd_identity <- calc_risk_diff(
  cachar_sample, "abnormal_screen", "areca_nut", 
  link = "identity"
)

rd_log <- calc_risk_diff(
  cachar_sample, "abnormal_screen", "areca_nut", 
  link = "log"
)

# Compare model types used
cat("Identity link model type:", rd_identity$model_type, "\n")
cat("Log link model type:", rd_log$model_type, "\n")

## ----rare outcomes------------------------------------------------------------
# Create a rare outcome (1% prevalence)
cachar_sample$rare_outcome <- rbinom(nrow(cachar_sample), 1, 0.01)

rd_rare <- calc_risk_diff(
  cachar_sample, 
  "rare_outcome", 
  "areca_nut"
)

print(rd_rare)

## ----missing data-------------------------------------------------------------
# Create a copy with some missing data for demonstration
set.seed(123)  # For reproducibility
cachar_with_missing <- cachar_sample %>%
  mutate(
    # Introduce more modest missing data (~3% in age, ~2% in alcohol)
    age_with_missing = ifelse(runif(n()) < 0.03, NA, age),
    alcohol_with_missing = ifelse(runif(n()) < 0.02, NA, alcohol)
  )

# Check the missing data patterns
missing_summary <- cachar_with_missing %>%
  summarise(
    total_observations = n(),
    age_missing = sum(is.na(age_with_missing)),
    alcohol_missing = sum(is.na(alcohol_with_missing)),
    total_missing_any = sum(!complete.cases(select(., age_with_missing, alcohol_with_missing, abnormal_screen, areca_nut))),
    complete_cases = sum(complete.cases(select(., age_with_missing, alcohol_with_missing, abnormal_screen, areca_nut)))
  )

print(missing_summary)

# Analysis with variables that have missing data
rd_missing <- calc_risk_diff(
  cachar_with_missing,
  "abnormal_screen",
  "areca_nut",
  adjust_vars = c("age_with_missing", "alcohol_with_missing")
)

# Compare with complete case analysis
rd_complete <- calc_risk_diff(
  cachar_sample,
  "abnormal_screen", 
  "areca_nut",
  adjust_vars = c("age", "alcohol")
)

cat("\n=== IMPACT OF MISSING DATA ===\n")
cat("Complete data analysis (n=", rd_complete$n_obs, "): ", sprintf("%.2f%%", rd_complete$rd * 100), 
    sprintf(" (%.2f%%, %.2f%%)", rd_complete$ci_lower * 100, rd_complete$ci_upper * 100), "\n")

# Check if missing data analysis succeeded
if (!is.na(rd_missing$rd)) {
  cat("Missing data analysis (n=", rd_missing$n_obs, "):  ", sprintf("%.2f%%", rd_missing$rd * 100), 
      sprintf(" (%.2f%%, %.2f%%)", rd_missing$ci_lower * 100, rd_missing$ci_upper * 100), "\n")
  cat("Cases lost to missing data: ", rd_complete$n_obs - rd_missing$n_obs, "\n")
} else {
  cat("Missing data analysis: FAILED (insufficient data or convergence issues)\n")
  cat("Attempted to use n =", rd_missing$n_obs, "complete cases\n")
  cat("Cases lost to missing data: ", nrow(cachar_with_missing) - rd_missing$n_obs, "\n\n")
  
  cat("LESSON: This demonstrates why missing data can be problematic:\n")
  cat(paste0(riskdiff:::.safe_bullet()), "Listwise deletion can dramatically reduce sample size\n")
  cat(paste0(riskdiff:::.safe_bullet()), "Small samples may cause model convergence failures\n") 
  cat(paste0(riskdiff:::.safe_bullet()), "Consider multiple imputation for better missing data handling\n")
  cat(paste0(riskdiff:::.safe_bullet()), "The riskdiff package gracefully handles these failures\n")
}

## ----basic syntax example, eval=FALSE-----------------------------------------
# # Example usage:
# result <- calc_risk_diff(
#   data = cachar_sample,           # Your dataset
#   outcome = "abnormal_screen",    # Binary outcome variable (0/1)
#   exposure = "areca_nut",         # Exposure of interest
#   adjust_vars = c("age", "sex"),  # Variables to adjust for
#   strata = "residence",           # Stratification variables
#   link = "auto",                  # Link function: "auto", "identity", "log", "logit"
#   alpha = 0.05,                   # Significance level (0.05 = 95% CI)
#   verbose = FALSE                 # Print diagnostic messages if TRUE
# )