## ---- include = FALSE---------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

Prepare_data <- function(dat, population_incidence, breaks){

  ### Input:

  #' @param dat Data.frame with one row per individual which at least includes
  #' a column **d** with event indicator (1 for event, 0 for censored), a column
  #' **y** with event/censoring time.
  #' @param population_incidence A vector (in combination with breaks) or
  #' a data.frame (columns 1) 'start age group', 2) 'end age group', 3)'S_pop')
  #' with population incidence per 100,000 per interval k.
  #' @param breaks Cut-points for the (age/time) groups. Only needed when
  #' population_incidence is a vector.

  ### Output:

  #' @return Data.frame one row per individual and a.o. columns *id* unique ID;
  #' *d* non-censoring indicator; **k** interval of (age) group; **S_k**
  #' population interval-based proportion of individuals experiencing the
  #' event in intervals later than k; **S_k.** sample
  #' proportion of individuals experiencing the event in intervals later
  #' than k.

  # --------------- Load packages.

#  require(survival)
#  require(dplyr)
#  require(tidyr)

  # --------------- Organize external information (population incidence rate).
  # N.B.: there are different options regarding the breaks (1-4).

  if(is.vector(population_incidence)){

    if(is.numeric(breaks)){ # Option 1: two vectors population_incidence and breaks.
      if(length(breaks)!=(length(population_incidence)+1) ) warning("Number of
                                                                  breaks should
                                                                  equal the
                                                                  number of
                                                                  groups plus
                                                                  one.")
      n_agegroups <- length(population_incidence)
      from <- breaks[-(n_agegroups + 1)]  # Remove last
      to <- breaks[-1] # Remove first
      dat_inc <- data.frame(start = from, end = to, incidence_rate =
                              population_incidence/100000)

    } else if(breaks == "5 years"){ # Option 2: 5 years age groups.
      n_agegroups <- length(population_incidence)
      breaks <- seq(0, 5*(n_agegroups), by = 5)
      from <- breaks[-(n_agegroups + 1)]  # Remove last
      to <- breaks[-1] # Remove first
      dat_inc <- data.frame(start = from, end = to, incidence_rate =
                              population_incidence/100000)

    } else if(breaks == "10 years"){ # Option 3: 10 years age groups.
      n_agegroups <- length(population_incidence)
      breaks <- seq(0, 10*(n_agegroups), by = 10)
      from <- breaks[-(n_agegroups + 1)]  # Remove last
      to <- breaks[-1] # Remove first
      dat_inc <- data.frame(start = from, end = to, incidence_rate =
                              population_incidence/100000)
    }

  } else { # Option 4: if population incidence is given as a data.frame.
    dat_inc <- data.frame(start = population_incidence[,1],
                          end = population_incidence[,2],
                          incidence_rate = population_incidence[,3]/100000)
    breaks <- population_incidence[,1]
  }

  dat_inc <- dat_inc %>% mutate( k = paste("(", start, ",", end, "]", sep=""),
                                 t = end - start)
  # t is the width of the age interval, in years.
  # Note that in the paper, this is referred to
  # as (alpha_k - alpha_({k-1}).

  # --------------- Calculate S_k population from population incidence.

  dat_inc$S_k <- c(exp(-(dat_inc$t * dat_inc$incidence_rate)))

  # --------------- Calculate S_k. based on sample data.

  dat$y_cat <- cut(dat$y, breaks = breaks)
  index <- which(is.na(dat$y_cat)==T)
  if(length(index)>0){
    dat<- dat[-index,]}  # Include only those that fall within an age category.

  # Store number of cases and unaffected individuals per age group.
  agegroups.info <- aggregate(id ~ y_cat + d, data = dat, length)

  agegroups.info <- agegroups.info %>% tidyr::spread(d, -y_cat)
  colnames(agegroups.info) <- c("k","s_k","r_k")

  # Transform NA to 0 (its real meaning).
  agegroups.info$s_k <- ifelse(!is.na(agegroups.info$s_k), agegroups.info$s_k, 0)
  agegroups.info$r_k <- ifelse(!is.na(agegroups.info$r_k), agegroups.info$r_k, 0)

  # Calculate person years per outcome and age group.
  personyears <- aggregate(y ~ y_cat + d, data =
                             dat[,c("d","y_cat", "y"),], sum) %>%
    tidyr::spread(d, -y_cat)
  colnames(personyears) <- c("k","q_k","p_k")
  agegroups.info <- merge(agegroups.info, personyears, by = "k")  # Merge.

  agegroups.info$q_k <- ifelse(!is.na(agegroups.info$q_k),
                               agegroups.info$q_k, 0)
  agegroups.info$p_k <- ifelse(!is.na(agegroups.info$p_k),
                               agegroups.info$p_k, 0)

  agegroups.info$years.before <- breaks[which(breaks != max(breaks))]

  # Person years among censored individuals.
  q <- agegroups.info$q_k - (agegroups.info$years.before * agegroups.info$s_k)

  # Person years among cases.
  p <- agegroups.info$p_k - (agegroups.info$years.before * agegroups.info$r_k)

  # Years per age group.
  # N.B. in the paper referred to as (alpha_k - alpha_{k-1}).
  t <- agegroups.info$t <- dat_inc$t

  # Number of censored.
  s <- agegroups.info$s_k

  # Number of cases.
  r <- agegroups.info$r_k

  # Age group label.
  k <- agegroups.info$k

  # Obtain incidence rate (mu) based on sample data.
  n_agegroups <- nrow(agegroups.info)
  for (k in 1:n_agegroups){
    if(k!=n_agegroups){calcsum <- sum(agegroups.info$r_k[(k+1):n_agegroups],
                                      agegroups.info$s_k[(k+1):n_agegroups])
    } else {calcsum <- 0}
    agegroups.info$mu_k[k] = agegroups.info$r_k[k]/
      (agegroups.info$p_k[k]+agegroups.info$q_k[k]+agegroups.info$t[k]*calcsum)

  }

  # Recalculate S_k. (in the sample!) based on the intervals.
  S_k. <- c()
  S_k. <- c(exp(-(agegroups.info$mu_k*t)))

  agegroups.info$S_k. <- S_k.
  agegroups.info$S_k <- dat_inc$S_k

  # --------------- Add S_k. to the sample data.

  dat_out <- merge(dat, agegroups.info, by.x = "y_cat", by.y = "k") %>%
    mutate(k = y_cat) %>%
    mutate(population_incidence = mu_k)

  dat_out$years.before <- NULL; dat_out$t <- NULL;
  dat_out$y_cat <- NULL; dat_out$mu_k <- NULL;
  dat_out$s_k <- dat_out$r_k <- dat_out$p_k <- dat_out$q_k <- NULL

  # --------------- Output the data.frame.

  dat_out

}
#' Calculate inverse probability of selection weights.
#'
#' @description
#' This function calculates weights to correct for ascertainment bias in
#' time-to-event data where clusters are outcome-dependently sampled,
#' for example high-risk families in genetic epidemiological studies in
#' cancer research.
#'
#' @details
#' Weights are based on a comparison between the survival between sample and
#' population. Therefore, besides the sample data, the population incidence rate
#' (per 100 000) is needed as input, as well as the cut-offs of the
#' (age/time-to-event) groups for which this is available. The function provides
#' two options for the latter: cut-offs can be provided manually or using the
#' standard 5- or 10-years (age) categories (0-4, 5-9, ... or 0-9, 10-14, ...).
#' Note that resulting intervals are of the form [xx, xx).
#'
#' @export


Calculate_weights <- function(dat){

### Input:

#' @param dat Data.frame with one row per individual with columns *d*
#' non-censoring indicator; **k** interval of (age) group; **S_k**
#' population interval-based proportion of individuals experiencing the
#' event in intervals later than k; **S_k.** sample
#' proportion of individuals experiencing the event in intervals later
#' than k.

### Output:

#' @return Vector with weights.

  # --------------- Extract variables from input data.frame.

  # Group/interval.
  k <- dat$k

  # Population proportion of individuals experiencing the event in intervals
  # later than k.
  S_k <- dat$S_k

  # Sample proportion of individuals experiencing the event in intervals
  # later than k.
  S_k. <- dat$S_k.

  # --------------- Create empty containers.

  v <- w <- rep(NA, nrow(dat))

  # --------------- Calculate the weights.

  for (n in 1:nrow(dat)){

    # Weights for unaffected.
    v[n] = 1

    # Weights for cases.
    w[n] = ( (1 - S_k[n]) / S_k[n] ) * (S_k.[n] / ( 1 - S_k.[n]))
  }

  merged <- cbind(dat, w, v)
  merged$weight <- NA

  # --------------- Assign w for uncensored, v for censored (interval-wise).

  merged$weight[which(merged$d == 1)] <- merged$w[which(merged$d == 1)]
  merged$weight[which(merged$d == 0)] <- merged$v[which(merged$d == 0)]

  merged$w <- merged$v <- NULL  # Remove old variable.

  # --------------- Collect output.

  vec_weights <- merged$weight

  # If weight is 0, add very small value (coxph does not accept weights of 0).
  vec_weights[which(vec_weights==0)] <- 0.0000001

  # --------------- Print warning if weights are invalid.

  ifelse((sum(vec_weights<0)>0), print("Invalid (negative) weights!"),
         print("No negative weights"))

  # --------------- Return output.

  vec_weights

}


## ----Loading, message = F-----------------------------------------------------

library(wcox)

require(dplyr)
require(tidyr)
require(survival)

## ----Load data set, message = F-----------------------------------------------
# Load toy data.
data("fam_dat")

## ----Show first lines data----------------------------------------------------
# Show the first few lines of the toy data.
head(fam_dat)

## ----Distribution of family size----------------------------------------------
# Show the number of families and top rows.
cat( "There are", unique(fam_dat$family_id) %>% length() , "families in data set. ")

# Examine family sizes.
fam_sizes <- fam_dat %>% group_by(family_id) %>% count()

cat( "Family size is on average ", 
     fam_sizes$n %>% mean() %>% round(1), 
     " and ranges from ", 
     fam_sizes$n %>% min(), " to ", 
     fam_sizes$n %>% max(), ".", sep = "")

## ----External information-----------------------------------------------------

# Enter the incidence by age group per 100 000, starting with the youngest age group.
incidence_rate <- c(2882, 1766, 1367, 1155, 987, 845, 775, 798, 636, 650)

# Define the age group cutoffs.
breaks_manually <- c(0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100)
breaks_10yrs <- "10 years"

## ----Rename variables---------------------------------------------------------
# Rename variables.
my_dat <- rename(fam_dat, id = individual_id, d = event_indicator, y = age)

## ----Prepare data, message = F------------------------------------------------

# Using option 1 (manual cut-offs):
my_dat_out <- Prepare_data(dat = my_dat, population_incidence = incidence_rate, 
                           breaks = breaks_manually)

# Unhash to use option 2 (pre-set cut-offs):
# my_dat_out <- Prepare_data(dat = my_dat, population_incidence = incidence_rate, 
# breaks = "10 years")

## ----Look at prepared data----------------------------------------------------

# Select the newly add columns.
my_dat_out %>% select(id, k, d, S_k, S_k.) %>% arrange(id) %>% head(8)

## ----Calculate weights--------------------------------------------------------
# Calculate weights using the object that is output from the Prepare_data() function.

w <- Calculate_weights(dat = my_dat_out)

## ----Show weights-------------------------------------------------------------
# Show the weights.
my_dat_out %>% mutate(weight = w) %>% 
  select(id, d, weight) %>% 
  arrange(id) %>% 
  filter(id %in% c(1,3))

## ----Fit model using weights, message = F-------------------------------------
# Fit the model.
fit <- coxph(Surv(y, d==1) ~ x + cluster(family_id), weights = w, data =  my_dat_out)

fit

## ----Examine estimates of fitted model----------------------------------------
# Extract estimates.
fit.summary <- summary(fit)

# Summarize findings.
cat("Covariate effect (95% CI): ",
exp(fit.summary$coefficients[1]), " (",
exp(fit.summary$coefficients[1] - 1.96*fit.summary$coefficients[4]), ", ",
exp(fit.summary$coefficients[1] + 1.96*fit.summary$coefficients[4]), ").", sep = "")