## ---- echo = FALSE, message=FALSE--------------------------------------------- knitr::opts_chunk$set( fig.width = 7, fig.height = 5 ) ## ---- eval=FALSE-------------------------------------------------------------- # library(rubias) # library(tidyverse) ## ---- echo=FALSE-------------------------------------------------------------- # this is what we actually evaluate. library(rubias) # all the following libraries can be loaded with "library(tidyverse)" # but then you have to put tidyverse in the Suggests because this is # in the vignette, and that is bad practice, so, load the packages separately... library(tibble) library(dplyr) library(tidyr) library(stringr) library(ggplot2) ## ----head-chinook------------------------------------------------------------- head(chinook[, 1:8]) ## ----head-chinook-mix--------------------------------------------------------- head(chinook_mix[, 1:8]) ## ----------------------------------------------------------------------------- # combine small_chinook_ref and small_chinook_mix into one big data frame, # but drop the California_Coho collection because Coho all # have pretty much the same genotype at these loci! small_chinook_all <- bind_rows(small_chinook_ref, small_chinook_mix) %>% filter(collection != "California_Coho") # then toss them into a function. matchy_pairs <- close_matching_samples(D = small_chinook_all, gen_start_col = 5, min_frac_non_miss = 0.85, min_frac_matching = 0.94 ) # see that that looks like: matchy_pairs %>% arrange(desc(num_non_miss), desc(num_match)) ## ----------------------------------------------------------------------------- # then toss them into a function. This takes half a minute or so... matchy_pairs2 <- close_matching_samples(D = small_chinook_all, gen_start_col = 5, min_frac_non_miss = 0.85, min_frac_matching = 0.80 ) # see that that looks like: matchy_pairs2 %>% arrange(desc(num_non_miss), desc(num_match)) ## ----infer_mixture1----------------------------------------------------------- mix_est <- infer_mixture(reference = chinook, mixture = chinook_mix, gen_start_col = 5) ## ----look-at-mix-est---------------------------------------------------------- lapply(mix_est, head) ## ----------------------------------------------------------------------------- prior_tibble <- chinook %>% count(repunit, collection) %>% filter(repunit == "CentralValleyfa") %>% select(collection) %>% mutate(pi_param = 2) # see what it looks like: prior_tibble ## ----------------------------------------------------------------------------- set.seed(12) mix_est_with_prior <- infer_mixture(reference = chinook, mixture = chinook_mix, gen_start_col = 5, pi_prior = prior_tibble) ## ----------------------------------------------------------------------------- comp_mix_ests <- list( `pi (default prior)` = mix_est$mixing_proportions, `pi (cv fall gets 2s prior)` = mix_est_with_prior$mixing_proportions ) %>% bind_rows(.id = "prior_type") %>% filter(mixture_collection == "rec1") %>% select(prior_type, repunit, collection, pi) %>% spread(prior_type, pi) %>% mutate(coll_group = ifelse(repunit == "CentralValleyfa", "CV_fall", "Not_CV_fall")) ggplot(comp_mix_ests, aes(x = `pi (default prior)`, y = `pi (cv fall gets 2s prior)`, colour = coll_group )) + geom_point() + geom_abline(slope = 1, intercept = 0, linetype = "dashed") ## ----------------------------------------------------------------------------- comp_mix_ests %>% group_by(coll_group) %>% summarise(with_explicit_prior = sum(`pi (cv fall gets 2s prior)`), with_default_prior = sum(`pi (default prior)`)) ## ----aggregating-------------------------------------------------------------- # for mixing proportions rep_mix_ests <- mix_est$mixing_proportions %>% group_by(mixture_collection, repunit) %>% summarise(repprop = sum(pi)) # adding mixing proportions over collections in the repunit # for individuals posteriors rep_indiv_ests <- mix_est$indiv_posteriors %>% group_by(mixture_collection, indiv, repunit) %>% summarise(rep_pofz = sum(PofZ)) ## ----plot-6------------------------------------------------------------------- # find the top 6 most abundant: top6 <- rep_mix_ests %>% filter(mixture_collection == "rec1") %>% arrange(desc(repprop)) %>% slice(1:6) # check how many MCMC sweeps were done: nsweeps <- max(mix_est$mix_prop_traces$sweep) # keep only rec1, then discard the first 200 sweeps as burn-in, # and then aggregate over reporting units # and then keep only the top6 from above trace_subset <- mix_est$mix_prop_traces %>% filter(mixture_collection == "rec1", sweep > 200) %>% group_by(sweep, repunit) %>% summarise(repprop = sum(pi)) %>% filter(repunit %in% top6$repunit) # now we can plot those: ggplot(trace_subset, aes(x = repprop, colour = repunit)) + geom_density() ## ----------------------------------------------------------------------------- top6_cis <- trace_subset %>% group_by(repunit) %>% summarise(loCI = quantile(repprop, probs = 0.025), hiCI = quantile(repprop, probs = 0.975)) top6_cis ## ----plot-zs------------------------------------------------------------------ # get the maximum-a-posteriori population for each individual map_rows <- mix_est$indiv_posteriors %>% group_by(indiv) %>% top_n(1, PofZ) %>% ungroup() ## ----z-score-density---------------------------------------------------------- normo <- tibble(z_score = rnorm(1e06)) ggplot(map_rows, aes(x = z_score)) + geom_density(colour = "blue") + geom_density(data = normo, colour = "black") ## ----------------------------------------------------------------------------- no_kc <- infer_mixture(small_chinook_ref, small_chinook_mix, gen_start_col = 5) ## ----------------------------------------------------------------------------- no_kc$mixing_proportions %>% arrange(mixture_collection, desc(pi)) ## ----------------------------------------------------------------------------- # make reference file that includes the known_collection column kc_ref <- small_chinook_ref %>% mutate(known_collection = collection) %>% select(known_collection, everything()) # see what that looks like kc_ref[1:10, 1:8] ## ----------------------------------------------------------------------------- kc_mix <- small_chinook_mix %>% mutate(known_collection = NA) %>% select(known_collection, everything()) kc_mix$known_collection[kc_mix$collection == "rec1"][1:8] <- "Deer_Cr_sp" # here is what that looks like now (dropping most of the genetic data columns) kc_mix[1:20, 1:7] ## ----------------------------------------------------------------------------- # note that the genetic data start in column 6 now with_kc <- infer_mixture(kc_ref, kc_mix, 6) ## ----------------------------------------------------------------------------- with_kc$mixing_proportions %>% arrange(mixture_collection, desc(pi)) ## ---- eval=FALSE-------------------------------------------------------------- # full_model_results <- infer_mixture( # reference = chinook, # mixture = chinook_mix, # gen_start_col = 5, # method = "BR" # ) ## ----self-ass----------------------------------------------------------------- sa_chinook <- self_assign(reference = chinook, gen_start_col = 5) ## ----self-ass-results--------------------------------------------------------- head(sa_chinook, n = 100) ## ----summ2repu---------------------------------------------------------------- sa_to_repu <- sa_chinook %>% group_by(indiv, collection, repunit, inferred_repunit) %>% summarise(repu_scaled_like = sum(scaled_likelihood)) head(sa_to_repu, n = 200) ## ----chin-sims, message=FALSE------------------------------------------------- chin_sims <- assess_reference_loo(reference = chinook, gen_start_col = 5, reps = 5, mixsize = 200) ## ----show-chin-sims----------------------------------------------------------- chin_sims ## ----top6list----------------------------------------------------------------- top6 ## ----roundem------------------------------------------------------------------ round(top6$repprop * 350) ## ----make-arep---------------------------------------------------------------- arep <- top6 %>% ungroup() %>% mutate(dirichlet = 10 * repprop) %>% select(repunit, dirichlet) arep ## ----chin-sim-top6, message=FALSE--------------------------------------------- chin_sims_repu_top6 <- assess_reference_loo(reference = chinook, gen_start_col = 5, reps = 5, mixsize = 200, alpha_repunit = arep) ## ----summ-top6---------------------------------------------------------------- # now, call those repunits that we did not specify in arep "OTHER" # and then sum up over reporting units tmp <- chin_sims_repu_top6 %>% mutate(repunit = ifelse(repunit %in% arep$repunit, repunit, "OTHER")) %>% group_by(iter, repunit) %>% summarise(true_repprop = sum(true_pi), reprop_posterior_mean = sum(post_mean_pi), repu_n = sum(n)) %>% mutate(repu_n_prop = repu_n / sum(repu_n)) ## ----plot-top6---------------------------------------------------------------- # then plot them ggplot(tmp, aes(x = true_repprop, y = reprop_posterior_mean, colour = repunit)) + geom_point() + geom_abline(intercept = 0, slope = 1) + facet_wrap(~ repunit) ## ----plot-top6-n-------------------------------------------------------------- ggplot(tmp, aes(x = repu_n_prop, y = reprop_posterior_mean, colour = repunit)) + geom_point() + geom_abline(intercept = 0, slope = 1) + facet_wrap(~ repunit) ## ----chin-sim-top6-indiv, message=FALSE--------------------------------------- set.seed(100) chin_sims_with_indivs <- assess_reference_loo(reference = chinook, gen_start_col = 5, reps = 5, mixsize = 200, alpha_repunit = arep, return_indiv_posteriors = TRUE) # print out the indiv posteriors chin_sims_with_indivs$indiv_posteriors ## ----boxplot-pofz-indiv-sim--------------------------------------------------- # summarise things repu_pofzs <- chin_sims_with_indivs$indiv_posteriors %>% filter(repunit == simulated_repunit) %>% group_by(iter, indiv, simulated_collection, repunit) %>% # first aggregate over reporting units summarise(repu_PofZ = sum(PofZ)) %>% ungroup() %>% arrange(repunit, simulated_collection) %>% mutate(simulated_collection = factor(simulated_collection, levels = unique(simulated_collection))) # also get the number of simulated individuals from each collection num_simmed <- chin_sims_with_indivs$indiv_posteriors %>% group_by(iter, indiv) %>% slice(1) %>% ungroup() %>% count(simulated_collection) # note, the last few steps make simulated collection a factor so that collections within # the same repunit are grouped together in the plot. # now, plot it ggplot(repu_pofzs, aes(x = simulated_collection, y = repu_PofZ)) + geom_boxplot(aes(colour = repunit)) + geom_text(data = num_simmed, mapping = aes(y = 1.025, label = n), angle = 90, hjust = 0, vjust = 0.5, size = 3) + theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 9, vjust = 0.5)) + ylim(c(NA, 1.05)) ## ---- message=FALSE----------------------------------------------------------- set.seed(101) # for reproducibility # do the simulation chin_sims_by_gc <- assess_reference_loo(reference = chinook, gen_start_col = 5, reps = 5, mixsize = 200, alpha_repunit = arep, return_indiv_posteriors = TRUE, resampling_unit = "gene_copies") ## ----boxplot-pofz-gc-sim------------------------------------------------------ # summarise things repu_pofzs_gc <- chin_sims_by_gc$indiv_posteriors %>% filter(repunit == simulated_repunit) %>% group_by(iter, indiv, simulated_collection, repunit) %>% # first aggregate over reporting units summarise(repu_PofZ = sum(PofZ)) %>% ungroup() %>% arrange(repunit, simulated_collection) %>% mutate(simulated_collection = factor(simulated_collection, levels = unique(simulated_collection))) # also get the number of simulated individuals from each collection num_simmed_gc <- chin_sims_by_gc$indiv_posteriors %>% group_by(iter, indiv) %>% slice(1) %>% ungroup() %>% count(simulated_collection) # note, the last few steps make simulated collection a factor so that collections within # the same repunit are grouped together in the plot. # now, plot it ggplot(repu_pofzs_gc, aes(x = simulated_collection, y = repu_PofZ)) + geom_boxplot(aes(colour = repunit)) + geom_text(data = num_simmed_gc, mapping = aes(y = 1.025, label = n), angle = 90, hjust = 0, vjust = 0.5, size = 3) + theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 9, vjust = 0.5)) + ylim(c(NA, 1.05)) ## ----------------------------------------------------------------------------- arep ## ----------------------------------------------------------------------------- arep_subs <- tribble( ~collection, ~sub_ppn, "Eel_R", 0.1, "Russian_R", 0.9, "Butte_Cr_fa", 0.7, "Feather_H_sp", 0.3 ) ## ---- message=FALSE----------------------------------------------------------- chin_sims_sub_ppn <- assess_reference_loo(reference = chinook, gen_start_col = 5, reps = 5, mixsize = 200, alpha_repunit = arep, alpha_collection = arep_subs, return_indiv_posteriors = FALSE) # don't bother returning individual posteriors ## ----------------------------------------------------------------------------- chin_sims_sub_ppn %>% group_by(repunit, collection) %>% summarise(mean_pi = mean(true_pi)) %>% group_by(repunit) %>% mutate(repunit_mean_pi = sum(mean_pi), fract_within = mean_pi / repunit_mean_pi) %>% mutate(fract_within = ifelse(fract_within < 1e-06, 0, fract_within)) %>% # anything less than 1 in a million gets called 0 filter(repunit_mean_pi > 0.0) ## ----------------------------------------------------------------------------- arep$repunit ## ----six-hundy1--------------------------------------------------------------- six_hundy_scenarios <- lapply(arep$repunit, function(x) tibble(repunit = x, ppn = 1.0)) names(six_hundy_scenarios) <- paste("All", arep$repunit, sep = "-") ## ----six-hundy2, message=FALSE------------------------------------------------ repu_hundy_results <- assess_reference_loo(reference = chinook, gen_start_col = 5, reps = 5, mixsize = 50, alpha_repunit = six_hundy_scenarios, alpha_collection = 10) repu_hundy_results ## ----hundy-colls1------------------------------------------------------------- set.seed(10) hundy_colls <- sample(unique(chinook$collection), 5) hundy_colls ## ----hundy-colls2------------------------------------------------------------- hundy_coll_list <- lapply(hundy_colls, function(x) tibble(collection = x, ppn = 1.0)) %>% setNames(paste("100%", hundy_colls, sep = "_")) ## ----hundy-colls-do, message=FALSE-------------------------------------------- hundy_coll_results <- assess_reference_loo(reference = chinook, gen_start_col = 5, reps = 5, mixsize = 50, alpha_collection = hundy_coll_list) hundy_coll_results ## ----infer_mixture_pb, eval=FALSE--------------------------------------------- # mix_est_pb <- infer_mixture(reference = chinook, # mixture = chinook_mix, # gen_start_col = 5, # method = "PB") ## ---- eval=FALSE-------------------------------------------------------------- # mix_est_pb$mixing_proportions %>% # group_by(mixture_collection, repunit) %>% # summarise(repprop = sum(pi)) %>% # left_join(mix_est_pb$bootstrapped_proportions) %>% # ungroup() %>% # filter(mixture_collection == "rec1") %>% # arrange(desc(repprop)) %>% # slice(1:10)