## ----include=FALSE------------------------------------------------------------ knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.align='center', dpi = 92, fig.retina = 2 ) options(tibble.print_min = 4L, tibble.print_max = 4L) ## ----eval=FALSE--------------------------------------------------------------- # # Uncomment: # # install.packages("groupdata2") ## ----eval=FALSE--------------------------------------------------------------- # # Uncomment: # # install.packages("devtools") # # devtools::install_github("LudvigOlsen/groupdata2") ## ----error=FALSE, message=FALSE, warning=FALSE-------------------------------- # Attaching groupdata2 library(groupdata2) # Attaching other packages used in this vignette library(dplyr) library(tidyr) require(ggplot2, quietly = TRUE) # Attach if installed library(knitr) # We will also be using plyr a few times, but we don't attach this # because of possible conflicts with dplyr. Instead we use its functions # like so: plyr::count() ## ----------------------------------------------------------------------------- df <- data.frame( "x" = c(1:12), "species" = factor(rep(c('cat', 'pig', 'human'), 4)), "age" = sample(c(1:100), 12) ) ## ----------------------------------------------------------------------------- groups <- group_factor(df, 5, method = 'n_dist') groups df$groups <- groups df %>% kable(align = 'c') ## ----------------------------------------------------------------------------- df %>% group_by(groups) %>% summarize(mean_age = mean(age)) %>% kable(align = 'c') ## ----------------------------------------------------------------------------- df <- data.frame( "x" = c(1:12), "species" = factor(rep(c('cat', 'pig', 'human'), 4)), "age" = sample(c(1:100), 12) ) ## ----------------------------------------------------------------------------- groups <- group_factor(df, 5, method = 'n_dist', force_equal = TRUE) groups plyr::count(groups) %>% rename(group = x, size = freq) %>% kable(align = 'c') ## ----------------------------------------------------------------------------- df <- head(df, length(groups)) %>% mutate(group = groups) df %>% kable(align = 'c') ## ----------------------------------------------------------------------------- df <- data.frame( "x" = c(1:12), "species" = factor(rep(c('cat', 'pig', 'human'), 4)), "age" = sample(c(1:100), 12) ) ## ----------------------------------------------------------------------------- df_grouped <- group(df, 5, method = 'n_dist') df_grouped %>% kable(align = 'c') ## ----------------------------------------------------------------------------- df_means <- df %>% group(5, method = 'n_dist') %>% summarise(mean_age = mean(age)) df_means %>% kable(align = 'c') ## ----------------------------------------------------------------------------- df <- data.frame( "x" = c(1:12), "species" = factor(rep(c('cat', 'pig', 'human'), 4)), "age" = sample(c(1:100), 12) ) ## ----------------------------------------------------------------------------- df_grouped <- df %>% group(5, method = 'n_dist', force_equal = TRUE) df_grouped %>% kable(align = 'c') ## ----------------------------------------------------------------------------- df <- data.frame( "x" = c(1:12), "species" = factor(rep(c('cat', 'pig', 'human'), 4)), "age" = sample(c(1:100), 12) ) ## ----------------------------------------------------------------------------- df_list <- splt(df, 5, method = 'n_dist') df_list %>% kable(align = 'c') ## ----------------------------------------------------------------------------- v <- c(1:6) splt(v, 3, method = 'n_dist') ## ----------------------------------------------------------------------------- df <- data.frame( "x" = c(1:12), "species" = factor(rep(c('cat', 'pig', 'human'), 4)), "age" = sample(c(1:100), 12) ) ## ----------------------------------------------------------------------------- df_list <- splt(df, 5, method = 'n_dist', force_equal = TRUE) df_list %>% kable(align = 'c') ## ----------------------------------------------------------------------------- df <- data.frame( "participant" = factor(rep(c('1', '2', '3', '4', '5', '6'), 3)), "age" = rep(sample(c(1:100), 6), 3), "diagnosis" = factor(rep(c('a', 'b', 'a', 'a', 'b', 'b'), 3)), "score" = sample(c(1:100), 3 * 6) ) df <- df %>% arrange(participant) # Remove index rownames(df) <- NULL # Add session info df$session <- rep(c('1','2', '3'), 6) kable(df, align = 'c') ## ----------------------------------------------------------------------------- df_folded <- fold(df, 3, method = 'n_dist') # Order by folds df_folded <- df_folded %>% arrange(.folds) kable(df_folded, align = 'c') ## ----------------------------------------------------------------------------- df_folded <- fold(df, 3, cat_col = 'diagnosis', method = 'n_dist') # Order by folds df_folded <- df_folded %>% arrange(.folds) kable(df_folded, align = 'c') ## ----------------------------------------------------------------------------- df_folded %>% count(.folds, diagnosis) %>% kable(align='c') ## ----------------------------------------------------------------------------- df_folded <- fold(df, 3, id_col = 'participant', method = 'n_dist') # Order by folds df_folded <- df_folded %>% arrange(.folds) # Remove index (Looks prettier in the table!) rownames(df_folded) <- NULL kable(df_folded, align = 'c') ## ----------------------------------------------------------------------------- df_folded %>% count(.folds, participant) %>% kable(align='c') ## ----------------------------------------------------------------------------- df_folded <- fold(df, 3, cat_col = 'diagnosis', id_col = 'participant', method = 'n_dist') # Order by folds df_folded <- df_folded %>% arrange(.folds) kable(df_folded, align = 'c') ## ----------------------------------------------------------------------------- df_folded %>% count(.folds, diagnosis, participant) %>% kable(align='c') ## ----------------------------------------------------------------------------- df <- data.frame( "participant" = factor(rep(c('1', '2', '3', '4', '5', '6'), 3)), "age" = rep(sample(c(1:100), 6), 3), "diagnosis" = factor(rep(c('a', 'b', 'a', 'a', 'b', 'b'), 3)), "score" = sample(c(1:100), 3 * 6) ) df <- df %>% arrange(participant) # Remove index rownames(df) <- NULL # Add session info df$session <- rep(c('1','2', '3'), 6) kable(df, align = 'c') ## ----------------------------------------------------------------------------- df_partitioned <- partition(df, 0.3, list_out = FALSE) # Order by partitions df_partitioned <- df_partitioned %>% arrange(.partitions) # Partition Sizes df_partitioned %>% count(.partitions) %>% kable(align = 'c') kable(df_partitioned, align = 'c') ## ----------------------------------------------------------------------------- df_partitioned <- partition(df, 0.3, cat_col = 'diagnosis', list_out = FALSE) # Order by partitions df_partitioned <- df_partitioned %>% arrange(.partitions) kable(df_partitioned, align = 'c') ## ----------------------------------------------------------------------------- df_partitioned %>% count(.partitions, diagnosis) %>% kable(align='c') ## ----------------------------------------------------------------------------- df_partitioned <- partition(df, 0.5, id_col = 'participant', list_out = FALSE) # Order by partitions df_partitioned <- df_partitioned %>% arrange(.partitions) kable(df_partitioned, align = 'c') ## ----------------------------------------------------------------------------- df_partitioned %>% count(.partitions, participant) %>% kable(align='c') ## ----------------------------------------------------------------------------- df_partitioned <- partition( data = df, p = 0.5, cat_col = 'diagnosis', id_col = 'participant', list_out = FALSE ) # Order by folds df_partitioned <- df_partitioned %>% arrange(.partitions) kable(df_partitioned, align = 'c') ## ----------------------------------------------------------------------------- df_partitioned %>% count(.partitions, diagnosis, participant) %>% kable(align='c') ## ----echo=FALSE--------------------------------------------------------------- set.seed(2) ## ----------------------------------------------------------------------------- df <- data.frame( "participant" = factor(rep(c('1', '2', '3', '4', '5', '6'), 3)), "age" = rep(sample(c(1:100), 6), 3), "diagnosis" = factor(rep(c('a', 'b', 'a', 'a', 'b', 'b'), 3)), "score" = sample(c(1:100), 3 * 6) ) df <- df %>% arrange(participant) # Add session info df$session <- rep(c('1','2', '3'), 6) # Sample dataset to get imbalances df <- df %>% sample_frac(0.7) %>% arrange(participant) # Remove index rownames(df) <- NULL # Counts df %>% count(diagnosis, participant) %>% kable(align = 'c') df %>% count(diagnosis) %>% kable(align = 'c') kable(df, align = 'c') ## ----------------------------------------------------------------------------- df_balanced <- balance(df, "min", cat_col = "diagnosis") %>% arrange(diagnosis, participant) # Counts df_balanced %>% count(diagnosis) %>% kable(align = 'c') kable(df_balanced, align = 'c') ## ----------------------------------------------------------------------------- df_balanced <- balance(df, "min", cat_col = "diagnosis", id_col = "participant", id_method = "n_rows_c") %>% arrange(diagnosis, participant) # Partition Sizes df_balanced %>% count(diagnosis) %>% kable(align = 'c') kable(df_balanced, align = 'c') ## ----------------------------------------------------------------------------- df_balanced %>% count(diagnosis, participant) %>% kable(align='c') ## ----------------------------------------------------------------------------- df_balanced <- balance(df, "max", cat_col = "diagnosis") %>% arrange(diagnosis, participant) # Counts df_balanced %>% count(diagnosis) %>% kable(align = 'c') kable(df_balanced, align = 'c') ## ----------------------------------------------------------------------------- df_balanced <- balance(df, "max", cat_col = "diagnosis", id_col = "participant", id_method = "n_rows_c") %>% arrange(diagnosis, participant) # Partition Sizes df_balanced %>% count(diagnosis) %>% kable(align = 'c') kable(df_balanced, align = 'c') ## ----------------------------------------------------------------------------- df_balanced %>% count(diagnosis, participant) %>% kable(align='c') ## ----------------------------------------------------------------------------- df_balanced <- balance(df, 3, cat_col = "diagnosis") %>% arrange(diagnosis, participant) # Counts df_balanced %>% count(diagnosis) %>% kable(align = 'c') kable(df_balanced, align = 'c') ## ----------------------------------------------------------------------------- df_balanced %>% count(diagnosis, participant) %>% kable(align='c') ## ----------------------------------------------------------------------------- df_balanced <- balance(df, 3, cat_col = "diagnosis", id_col = "participant", id_method = "n_rows_c") %>% arrange(diagnosis, participant) # Partition Sizes df_balanced %>% count(diagnosis) %>% kable(align = 'c') kable(df_balanced, align = 'c') ## ----------------------------------------------------------------------------- df_balanced %>% count(diagnosis, participant) %>% kable(align='c') ## ----------------------------------------------------------------------------- df <- data.frame( "x" = c(1:12), "species" = factor(rep(c('cat', 'pig', 'human'), 4)), "age" = sample(c(1:100), 12) ) ## ----------------------------------------------------------------------------- groups <- group_factor(df, 5, method = 'n_dist', randomize = TRUE) groups ## ----------------------------------------------------------------------------- df_list <- splt(df, 5, method = 'n_dist', randomize = TRUE) df_list %>% kable(align = 'c') ## ----echo=FALSE, eval=requireNamespace("ggplot2")----------------------------- # # Examples to show difference between methods # This could be made interactive! This way you could test what happens in different situations by # by simply moving a slider! # vec <- c(1:57) n <- 6 if (exists ('n_meth_v57n6')){ rm(n_meth_v57n6) } for (meth in c('n_dist', 'n_fill' ,'n_last','n_rand')){ data_temp <- data.frame(plyr::count(group_factor(vec, n, method = meth))) names(data_temp)[names(data_temp)=="freq"] <- meth if (exists ('n_meth_v57n6')) { n_meth_v57n6 <- cbind(n_meth_v57n6, data_temp) } else { n_meth_v57n6 <- data_temp } } forced_equal <- plyr::count(group_factor(vec, n, method = 'n_last', force_equal = TRUE)) n_meth_v57n6$forced_equal <- forced_equal$freq n_meth_v57n6 <- n_meth_v57n6[ , !duplicated(colnames(n_meth_v57n6))] # gather() data frame for plotting data_plot <- n_meth_v57n6 %>% gather(method, group_size,-1) upper_limit <- max(data_plot$group_size)+1 lower_limit <- min(data_plot$group_size)-1 v57n6_plot <- ggplot(data_plot, aes(x, group_size)) ## Output # Data frame n_meth_v57n6 # Plot v57n6_plot + geom_point() + scale_y_continuous(limit = c(lower_limit, upper_limit), breaks = round(seq(lower_limit, upper_limit, by = 2),1)) + #scale_y_continuous(limit = c(lower_limit, upper_limit))+ facet_wrap('method', ncol=1) + labs(x = 'group', y = 'group Size', title = 'Distribution of Elements in groups')+ theme_bw()+ theme(axis.text.y = element_text(size=9), axis.text.x = element_text(size=9)) ## ----echo=FALSE, eval=requireNamespace("ggplot2")----------------------------- vec <- c(1:117) n <- 11 if (exists ('n_meth_v117n11')){ rm(n_meth_v117n11) } for (meth in c('n_dist', 'n_fill' ,'n_last','n_rand')){ data_temp <- data.frame(plyr::count(group_factor(vec, n, method = meth))) names(data_temp)[names(data_temp)=="freq"] <- meth if (exists ('n_meth_v117n11')) { n_meth_v117n11 <- cbind(n_meth_v117n11, data_temp) } else { n_meth_v117n11 <- data_temp } } forced_equal <- plyr::count(group_factor(vec, n, method = 'n_last', force_equal = TRUE)) n_meth_v117n11$forced_equal <- forced_equal$freq n_meth_v117n11 <- n_meth_v117n11[ , !duplicated(colnames(n_meth_v117n11))] # gather() data frame for plotting data_plot <- n_meth_v117n11 %>% gather(method, group_size,-1) v117n11_plot <- ggplot(data_plot, aes(x, group_size)) upper_limit <- max(data_plot$group_size)+1 lower_limit <- min(data_plot$group_size)-1 ## Output # Data frame n_meth_v117n11 # Plot v117n11_plot + geom_point() + scale_y_continuous(limit = c(lower_limit, upper_limit), breaks = round(seq(lower_limit, upper_limit, by = 2),1)) + facet_wrap('method', ncol=1) + labs(x = 'group', y = 'group Size', title = 'Distribution of Elements in groups')+ theme_bw()+ theme(axis.text.y = element_text(size=9), axis.text.x = element_text(size=9)) ## ----echo=FALSE, eval=requireNamespace("ggplot2")----------------------------- vec <- c(1:100) if (exists ('greedy_data')){ rm(greedy_data) } for (n in c(8,15,20)){ group_sizes <- plyr::count(group_factor(vec, n, method='greedy')) data_temp <- data.frame(group_sizes, 'Size' = factor(n)) if (exists ('greedy_data')) { greedy_data <- rbind(greedy_data, data_temp) } else { greedy_data <- data_temp } } greedy_plot <- ggplot(greedy_data, aes(x, freq, color=Size)) greedy_plot + geom_point() + labs(x = 'group', y = 'group Size', title = 'Greedy Distribution of Elements in groups', color = 'Size') + theme_bw()+ theme(plot.margin = unit(c(1,1,1,1), "cm"))+ theme(axis.text.y = element_text(size=9), axis.text.x = element_text(size=9)) ## ----echo=FALSE, eval=requireNamespace("ggplot2")----------------------------- vec <- c(1:1000) if (exists ('staircase_data')){ rm(staircase_data) } for (n in c(2, 5, 11)){ group_sizes <- plyr::count(group_factor(vec, n, method='staircase')) data_temp <- data.frame(group_sizes, 'step_size' = factor(n)) if (exists ('staircase_data')) { staircase_data <- rbind(staircase_data, data_temp) } else { staircase_data <- data_temp } } staircase_plot <- ggplot(staircase_data, aes(x, freq, color=step_size)) staircase_plot + geom_point() + #scale_x_continuous(breaks = round(seq(1, max(data_temp$x), by = 2),1))+ labs(x = 'group', y = 'group Size', title = 'Staircasing Distribution of Elements in groups', color = 'Step Size') + theme_bw()+ theme(axis.text.y = element_text(size=9), axis.text.x = element_text(size=7)) ## ----echo=FALSE, eval=requireNamespace("ggplot2")----------------------------- staircase_data <- staircase_data %>% group_by(step_size) %>% mutate(cumsum = cumsum(freq)) staircase_cumulative_plot <- ggplot(staircase_data, aes(x, cumsum, color=step_size)) staircase_cumulative_plot + geom_point() + labs(x = 'group', y = 'Cumulative sum of group sizes', title = 'Staircasing Cumulative Sum of group Sizes', color = 'Step Size') + theme_bw()+ theme(axis.text.y = element_text(size=9), axis.text.x = element_text(size=7)) ## ----echo=FALSE, eval=requireNamespace("ggplot2")----------------------------- vec <- c(1:1000) if (exists ('primes_data')){ rm(primes_data) } for (n in c(2, 5, 11)){ group_sizes <- plyr::count(group_factor(vec, n, method='primes')) data_temp <- data.frame(group_sizes, 'start_at' = factor(n)) if (exists ('primes_data')) { primes_data <- rbind(primes_data, data_temp) } else { primes_data <- data_temp } } primes_plot <- ggplot(primes_data, aes(x, freq, color=start_at)) primes_plot + geom_point() + #scale_x_continuous(breaks = round(seq(1, max(data_temp$x), by = 2),1))+ labs(x = 'group', y = 'group Size', title = 'Prime numbers method - Elements per groups', color = 'Start at') + theme_bw()+ theme(axis.text.y = element_text(size=9), axis.text.x = element_text(size=7)) ## ----echo=FALSE, eval=requireNamespace("ggplot2")----------------------------- primes_data <- primes_data %>% group_by(start_at) %>% mutate(cumsum = cumsum(freq)) primes_cumulative_plot <- ggplot(primes_data, aes(x, cumsum, color=start_at)) primes_cumulative_plot + geom_point() + labs(x = 'group', y = 'Cumulative sum of group sizes', title = 'Primes Cumulative Sum of group Sizes', color = 'Start At') + theme_bw()+ theme(axis.text.y = element_text(size=9), axis.text.x = element_text(size=7))