## ----include = FALSE----------------------------------------------------------
run_vignette <- Sys.getenv("RUNNER_OS") == "" && slendr::check_dependencies(python = TRUE, slim = TRUE, quit = FALSE) 

knitr::opts_chunk$set(
  collapse = FALSE,
  comment = "#>",
  fig.width = 8,
  fig.height = 6,
  dpi = 60,
  eval = run_vignette
)

RERUN <- FALSE

## ----collapse = TRUE, message = FALSE-----------------------------------------
library(slendr)
init_env()

library(dplyr)
library(ggplot2)
library(readr)

seed <- 42
set.seed(seed)

## ----eval=FALSE---------------------------------------------------------------
#  afr <- population("AFR", time = 100000, N = 20000)
#  eur <- population("EUR", time = 60000, N = 2000, parent = afr)
#  
#  # <... compile model and simulate a tree sequence `ts` ...>

## ----eval=FALSE---------------------------------------------------------------
#  # compute heterozygosity in the individual "EUR"
#  ts_diversity(ts, "EUR_1")
#  
#  # compute genetic divergence between selected Africans and Europeans
#  afr_samples <- c("AFR_1", "AFR_2", "AFR_3")
#  eur_samples <- c("EUR_1", "EUR_2", "EUR_3")
#  ts_divergence(ts, list(afr = afr_samples, eur = eur_samples))

## ----eval=FALSE---------------------------------------------------------------
#  afr <- population("AFR", time = 90000, N = 20000)
#  eur <- population("EUR", time = 60000, N = 2000, parent = afr)

## ----eval=FALSE---------------------------------------------------------------
#  model <- compile_model(populations = list(afr, eur), generation_time = 30)
#  
#  slim(model, sequence_length = 100000, recombination_rate = 1e-8, method = "gui")

## ----eval=FALSE---------------------------------------------------------------
#  pop <- population("pop", time = 1, N = 1000)
#  simple_model <- compile_model(pop, generation_time = 1, simulation_length = 1000)
#  
#  slim(simple_model, sequence_length = 1000, recombination_rate = 0, method = "gui")

## ----eval=FALSE---------------------------------------------------------------
#  pop <- population("pop", time = 1, N = 1000)
#  simple_model <- compile_model(pop, generation_time = 1, simulation_length = 1000)
#  
#  slim(simple_model, sequence_length = 1000, recombination_rate = 0, burnin = 100, method = "gui")

## -----------------------------------------------------------------------------
library(slendr)
init_env()

# African ancestral population
afr <- population("AFR", time = 90000, N = 3000)

# first migrants out of Africa
ooa <- population("OOA", parent = afr, time = 60000, N = 500, remove = 23000) %>%
  resize(N = 2000, time = 40000, how = "step")

# Eastern hunter-gatherers
ehg <- population("EHG", parent = ooa, time = 28000, N = 1000, remove = 6000)

# European population
eur <- population("EUR", parent = ehg, time = 25000, N = 5000)

# Anatolian farmers
ana <- population("ANA", time = 28000, N = 3000, parent = ooa, remove = 4000)

# Yamnaya steppe population
yam <- population("YAM", time = 8000, N = 500, parent = ehg, remove = 2500)

# define gene-flow events
gf <- list(
  gene_flow(from = ana, to = yam, rate = 0.4, start = 7900, end = 7800),
  gene_flow(from = ana, to = eur, rate = 0.5, start = 6000, end = 5000),
  gene_flow(from = yam, to = eur, rate = 0.65, start = 4000, end = 3500)
)

## ----model_plot, echo=FALSE---------------------------------------------------
model_for_plot <- compile_model(
  populations = list(afr, ooa, ehg, eur, ana, yam),
  gene_flow = gf, generation_time = 30
)

plot_model(model_for_plot, order = c("EUR", "EHG", "YAM", "ANA", "OOA", "AFR"), proportions = TRUE)

## -----------------------------------------------------------------------------
extension_path <- system.file("extdata", "extension_trajectory.txt", package = "slendr")

## ----echo=FALSE---------------------------------------------------------------
Sys.setenv(EXTENSION1 = extension_path)

## -----------------------------------------------------------------------------
model <- compile_model(
  populations = list(afr, ooa, ehg, eur, ana, yam),
  gene_flow = gf, generation_time = 30,
  extension = extension_path  # <--- include the SLiM extension snippet
)

## ----eval=FALSE---------------------------------------------------------------
#  slim(model, sequence_length = 1e6, recombination_rate = 0, ts = FALSE, method = "gui")

## ----echo=FALSE---------------------------------------------------------------
slim(model, sequence_length = 1e6, recombination_rate = 0, ts = FALSE, path = tempdir())

## -----------------------------------------------------------------------------
extension_path <- system.file("extdata", "extension_trajectory_params.txt", package = "slendr")

## ----echo=FALSE---------------------------------------------------------------
Sys.setenv(EXTENSION = extension_path)

## -----------------------------------------------------------------------------
extension <- substitute_values(
  extension_path,
  s = 0.1, onset_time = 15000,
  origin_pop = "EUR", target_pop = "EUR"
)

## ----eval = FALSE-------------------------------------------------------------
#  extension <- substitute_values(
#    extension_path,
#    onset_time = 15000,
#    origin_pop = "EUR", target_pop = "EUR"
#  )
#  
#  # Error: The extension script contains the following unsubstituted patterns: {{s}}

## -----------------------------------------------------------------------------
model <- compile_model(
  populations = list(afr, ooa, ehg, eur, ana, yam),
  gene_flow = gf, generation_time = 30,
  extension = extension
)

## ----echo = FALSE-------------------------------------------------------------
slim(model, sequence_length = 1e6, recombination_rate = 1e-8, ts = FALSE, path = tempdir())

## ----eval = FALSE-------------------------------------------------------------
#  slim(model, sequence_length = 1e6, recombination_rate = 0, path = tempdir(), ts = FALSE, random_seed = 42)

## -----------------------------------------------------------------------------
run_model <- function(origin_pop, onset_time) {
  extension <- substitute_values(
    extension_path,
    s = 0.1, onset_time = onset_time,
    origin_pop = origin_pop, target_pop = "EUR"
  )
  
  model <- compile_model(
    populations = list(afr, ooa, ehg, eur, ana, yam),
    gene_flow = gf, generation_time = 30,
    extension = extension
  )
  
  slim(model, sequence_length = 1e6, recombination_rate = 0,
       path = tempdir(), ts = FALSE, random_seed = 42)
}

run_model(origin_pop = "EUR", onset_time = 15000)
run_model(origin_pop = "ANA", onset_time = 15000)
run_model(origin_pop = "EHG", onset_time = 15000)
run_model(origin_pop = "YAM", onset_time = 8000)

## ----trajectories-------------------------------------------------------------
load_traj <- function(origin_pop) {
  df <- read.table(paste0(tempdir(), "/traj_EUR_", origin_pop, ".tsv"), header = TRUE)
  df$origin <- origin_pop
  df$target <- "EUR"
  df
}

traj <- rbind(load_traj("EUR"), load_traj("ANA"), load_traj("EHG"), load_traj("YAM"))

library(ggplot2)

ggplot(traj) +
  geom_line(aes(time, freq_target, linetype = "EUR"), color = "black") +
  geom_line(aes(time, freq_origin, color = origin), linetype = "dashed") +
  xlim(15000, 0) +
  labs(title = "Allele frequency in EUR given the origin in another population",
       x = "years before present", y = "allele frequency",
       color = "frequency\nin original\npopulation",
       linetype = "frequency\nin target\npopulation") +
  scale_linetype_manual(values = c("solid", "dashed")) +
  facet_wrap(~ origin)

## -----------------------------------------------------------------------------
# extension "template" provided as a single string (this contains the same code
# as the script used just above, except specified directly in R)
extension_template <- r"(
// Because we want to simulate non-neutral evolution, we have to provide a
// custom initialization callback -- slendr will use it to replace its default
// neutral genomic architecture (i.e. the initialize() {...} callback it uses
// by default for neutral simulations). Note that we can refer to slendr's
// constants SEQUENCE_LENGTH and RECOMBINATION_RATE, which will carry values
// passed through from R via slendr's slim() R function.
initialize() {
    initializeMutationType("m1", 0.5, "f", 0.0);

    initializeGenomicElementType("g1", m1, 1.0);
    initializeGenomicElement(g1, 0, SEQUENCE_LENGTH - 1);

    initializeMutationRate(0);
    initializeRecombinationRate(RECOMBINATION_RATE);
}

// Define model constants (to be substituted) all in one place
// (each {{placeholder}} will be replaced by a value passed from R).
// Note that string constant template patterns are surrounded by "quotes"!
initialize() {
    defineConstant("s", {{s}});
    defineConstant("onset_time", {{onset_time}});
    defineConstant("target_pop", "{{target_pop}}");
    defineConstant("origin_pop", "{{origin_pop}}");

    // compose the path to a trajectory file based on given parameters
    defineConstant("traj_file",
                   PATH + "/" + "traj_" + target_pop + "_" + origin_pop + ".tsv");
}

function (void) add_mutation(void) {
    // sample one target carrier of the new mutation...
    target = sample(population(origin_pop).genomes, 1);
    // ... and add the mutation in the middle of it
    mut = target.addNewMutation(m1, s, position = asInteger(SEQUENCE_LENGTH / 2));

    // save the mutation for later reference
    defineGlobal("MUTATION", mut);

    write_log("adding beneficial mutation to population " + origin_pop);

    writeFile(traj_file, "tick\ttime\tfreq_origin\tfreq_target");
}

tick(onset_time) late() {
    // save simulation state in case we need to restart if the mutation is lost
    save_state();

    add_mutation();
}

tick(onset_time):SIMULATION_END late() {
    // the mutation is not segregating and is not fixed either -- we must restart
    if (!MUTATION.isSegregating & !MUTATION.isFixed) {
        write_log("mutation lost -- restarting");

        reset_state();

        add_mutation();
    }

    // compute the frequency of the mutation of interest and save it (if the
    // mutation is missing at this time, save its frequency as NA)
    freq_origin = "NA";
    freq_target = "NA";
    if (population(origin_pop, check = T))
      freq_origin = population(origin_pop).genomes.mutationFrequenciesInGenomes();
    if (population(target_pop, check = T))
      freq_target = population(target_pop).genomes.mutationFrequenciesInGenomes();

    writeFile(traj_file,
              community.tick + "\t" +
              model_time(community.tick) + "\t" +
              freq_origin + "\t" +
              freq_target, append = T);
}
)"

## -----------------------------------------------------------------------------
run_model <- function(origin_pop, onset_time) {
  extension <- substitute_values(
    extension_template, # <--- template SLiM code string directly (not as a file!)
    s = 0.1, onset_time = onset_time,
    origin_pop = origin_pop, target_pop = "EUR"
  )
  
  model <- compile_model(
    populations = list(afr, ooa, ehg, eur, ana, yam),
    gene_flow = gf, generation_time = 30,
    extension = extension
  )
  
  slim(model, sequence_length = 1e6, recombination_rate = 0,
       path = tempdir(), ts = FALSE, random_seed = 42)
}

run_model("EUR", onset_time = 15000)

head(load_traj("EUR"))

## -----------------------------------------------------------------------------
# create the ancestor of everyone and a chimpanzee outgroup
# (we set both N = 1 to reduce the computational time for this model)
chimp <- population("CH", time = 6.5e6, N = 1000)

# two populations of anatomically modern humans: Africans and Europeans
afr <- population("AFR", parent = chimp, time = 6e6, N = 10000)
eur <- population("EUR", parent = afr, time = 70e3, N = 5000)

# Neanderthal population splitting at 600 ky ago from modern humans
# (becomes extinct by 40 ky ago)
nea <- population("NEA", parent = afr, time = 600e3, N = 1000, remove = 40e3)

# 5% Neanderthal introgression into Europeans between 55-50 ky ago
gf <- gene_flow(from = nea, to = eur, rate = 0.05, start = 55000, end = 45000)

model <- compile_model(
  populations = list(chimp, nea, afr, eur), gene_flow = gf,
  generation_time = 30
)

## ----introgression_model, echo=FALSE, fig.width=6, fig.height=4---------------
cowplot::plot_grid(
  plot_model(model, sizes = FALSE, proportions = TRUE),
  plot_model(model, sizes = FALSE, log = TRUE, proportions = TRUE),
  nrow = 1
)

## ----echo=FALSE---------------------------------------------------------------
eur <- population("EUR", time = 100e3, N = 10000)
nea <- population("NEA", time = 100e3, N = 1000, remove = 40000)

gf <- gene_flow(from = nea, to = eur, rate = 0.05, start = 55000, end = 54500)

## -----------------------------------------------------------------------------
extension <- r"(
initialize() {
  // model parameters to be substitute_values()'d from R below
  defineConstant("gene_length", {{gene_length}});
  defineConstant("n_genes", {{n_genes}});
  defineConstant("n_markers", {{n_markers}});
  defineConstant("introgression_time", {{introgression_time}});
  defineConstant("freq_file", PATH + "/{{freq_file}}");

  // total length of the genome to be simulated
  defineConstant("total_length", n_genes * gene_length);

  // positions of neutral Neanderthal markers along the genome
  defineConstant("neutral_pos", seq(0, total_length - 1, by = gene_length / n_markers));
  // positions of deleterious mutations in the center of each gene
  defineConstant("selected_pos", seq(gene_length / 2, total_length - 1, by = gene_length));
}

// Because we want to simulate non-neutral evolution, we have to provide a
// custom initialization callback -- slendr will use it to replace its default
// neutral genomic architecture (i.e. the initialize() {...} callback it uses
// by default for neutral simulations).
initialize() {
  initializeMutationType("m0", 0.5, "f", 0.0); // neutral Neanderthal marker mutation type
  initializeMutationType("m1", 0.5, "f", {{s}}); // deleterious Neanderthal mutation type
  initializeGenomicElementType("g1", m0, 1.0); // genomic type of 'genes'

  genes = c(); // gene start-end coordinates
  breakpoints = c(); // recombination breakpoints
  rates = c(); // recombination rates

  // compute coordinates of genes, as well as the recombination map breakpoints
  // between each gene
  start = 0;
  for (i in seqLen(n_genes)) {
    // end of the next gene
    end = start + gene_length - 1;
    genes = c(genes, start, end);

    // uniform recombination within a gene, followed by a 0.5 recombination breakpoint
    rates = c(rates, RECOMBINATION_RATE, 0.5);
    breakpoints = c(breakpoints, end, end + 1);

    // start of the following genes
    start = end + 1;
  }

  // odd elements --> starts of genes
  gene_starts = integerMod(seqAlong(genes), 2) == 0;
  // even elements --> ends of genes
  gene_ends = integerMod(seqAlong(genes), 2) == 1;

  // set up all the genes at once
  initializeGenomicElement(g1, genes[gene_starts], genes[gene_ends]);

  // set up the recombination map
  initializeRecombinationRate(rates, breakpoints);

  // no mutation rate (we will add neutral variation after the SLiM run)
  initializeMutationRate(0);
}

// Add Neanderthal-specific mutations one tick prior to the introgression
tick(introgression_time) - 1 late() {
  // get all Neanderthal chromosomes just prior to the introgression
  target = population("NEA").genomes;

  write_log("adding neutral Neanderthal markers");

  mutations = target.addNewDrawnMutation(m0, position = asInteger(neutral_pos));
  defineConstant("MARKERS", target.mutations);

  write_log("adding deleterious Neanderthal mutation");

  target.addNewDrawnMutation(m1, position = asInteger(selected_pos));
}

// At the end, write Neanderthal ancestry proportions along the genome to a file
// (in our R analysis code we will compare the results of this with the
// equivalent computation using a tree sequence)
SIMULATION_END late() {
  df = DataFrame(
    "gene", asInteger(MARKERS.position / gene_length),
    "pos", MARKERS.position,
    "freq", sim.mutationFrequencies(population("EUR"), MARKERS)
  );
  writeFile(freq_file, df.serialize(format = "tsv"));
}
)" %>%
  substitute_values(
    introgression_time = 55000, s = -0.003,
    gene_length = 5e6, n_genes = 200, n_markers = 100,
    freq_file = "frequencies.tsv"
  )

## -----------------------------------------------------------------------------
model <- compile_model(
  populations = list(nea, eur), gene_flow = gf,
  generation_time = 30,
  extension = extension
)

## -----------------------------------------------------------------------------
nea_samples <- schedule_sampling(model, times = 50000, list(nea, 1))
modern_samples <- schedule_sampling(model, times = 0, list(eur, 100))

samples <- rbind(nea_samples, modern_samples)

## ----echo=FALSE, eval = RERUN & run_vignette----------------------------------
#  x = Sys.time()
#  data_dir <- "~/Projects/introgression_data/"
#  slim(model, recombination_rate = 1e-8, samples = samples, path = data_dir)
#  y = Sys.time()
#  y - x # Time difference of 45.62365 mins

## ----eval=FALSE---------------------------------------------------------------
#  slim(model, recombination_rate = 1e-8, samples = samples, path = "~/Projects/introgression_data/")

## -----------------------------------------------------------------------------
n_genes <- 200
gene_length <- 5e6
window_length <- 100e3

## ----eval = run_vignette & file.exists("~/Projects/introgression_data/frequencies.tsv")----
freqs <- read_tsv("~/Projects/introgression_data/frequencies.tsv")

## ----introgression_freqs, eval = run_vignette & file.exists("~/Projects/introgression_data/frequencies.tsv")----
freqs %>%
mutate(pos = pos %% 5e6) %>%
group_by(pos) %>%
summarise(freq = 100 * mean(freq)) %>%
ggplot() +
  geom_line(aes(pos, freq)) +
  geom_vline(xintercept = gene_length / 2, linetype = 2) +
  labs(x = "coordinate along a gene", y = "Neanderthal ancestry proportion [%]",
       title = "Proportion of Neanderthal ancestry in Europeans along 5Mb independent genes",
       subtitle = "(dashed line indicates introgressed deleterious Neanderthal allele)") +
  coord_cartesian(ylim = c(0, 3))

## ----eval = run_vignette & file.exists("~/Projects/introgression_data/slim.trees")----
# load a tree sequence and extract the names of recorded individuals
ts <- ts_read(file = "~/Projects/introgression_data/slim.trees", model)
samples <- ts_names(ts, split = "pop")

samples

# compute coordinates of sliding windows along the genome
windows <- seq(from = 0, to = n_genes * gene_length - 1, by = window_length)

head(windows)
tail(windows)

# compute divergence from the tree sequence in each window separately
divergence <- ts_divergence(ts, samples, windows = windows, mode = "branch")$divergence[[1]]

## ----eval = run_vignette & file.exists("~/Projects/introgression_data/slim.trees")----
# compute average divergence at each position in a gene, and a 95% C.I.
div_df <- tibble(
  pop = "eur",
  pos = windows %% gene_length,
  div = divergence
) %>%
  group_by(pop, pos) %>%
  summarise(
    mean = mean(div), n = n(), std = sd(div),
    ci_low = mean - 2 * std / sqrt(n),
    ci_up = mean + 2 * std / sqrt(n)
  ) 

## ----introgression_divergence, eval = run_vignette & file.exists("~/Projects/introgression_data/slim.trees")----
ggplot(div_df) +
  geom_ribbon(aes(pos, ymin = ci_low, ymax = ci_up), fill = "grey70") +
  geom_line(aes(pos, mean)) +
  geom_vline(aes(xintercept = gene_length / 2), linetype = 2) +
  labs(x = "coordinate along a gene", y = "divergence to Neanderthal",
       title = "Divergence of Europeans to a Neanderthal genome along 5Mb independent genes",
       subtitle = "(dashed line indicates introgressed deleterious Neanderthal allele)")