## ---- include = FALSE--------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ---- cache = FALSE, include = FALSE------------------------------------------ ## copied from examples from the "simmer" R package ## after: https://www.enchufa2.es/archives/suggests-and-vignettes.html ## by Iñaki Úcar required <- c("bnpsd", "popkin", "kinship2", "RColorBrewer") if (!all(sapply(required, requireNamespace, quietly = TRUE))) knitr::opts_chunk$set(eval = FALSE) ## ----------------------------------------------------------------------------- library(simfam) # data dimensions n <- 15 # number of individuals per generation G <- 3 # number of generations # the result changes in every run unless we set the seed set.seed(1) # draw the random pedigree with those properties data <- sim_pedigree( n, G ) # save these objects separately fam <- data$fam ids <- data$ids kinship_local_G <- data$kinship_local ## ----------------------------------------------------------------------------- # the top of the table head( fam ) # and the bottom tail( fam ) ## ----------------------------------------------------------------------------- kinship_local_G ## ---- fig.width = 7, fig.height = 4, fig.align = 'center'--------------------- # load the package library(kinship2) # Convert the `fam` table into a `pedigree` object required by the `kinship2` package. obj <- pedigree( id = fam$id, dadid = fam$pat, momid = fam$mat, sex = fam$sex, affected = fam$pheno ) # plot the pedigree plot( obj, cex = 0.7 ) ## ---- fig.width = 7, fig.height = 4, fig.align = 'center'--------------------- # replace original with pruned FAM table # containing only ancestors of last generation fam <- prune_fam( fam, ids[[ G ]] ) # inspect fam # and plot pedigree obj <- pedigree( id = fam$id, dadid = fam$pat, momid = fam$mat, sex = fam$sex, affected = fam$pheno ) plot( obj, cex = 0.7 ) ## ----------------------------------------------------------------------------- # number of loci to draw m <- 5000 # draw uniform ancestral allele frequencies p_anc <- runif( m ) # draw independent (unstructured) genotypes for founders (includes founders without descendants) X_1 <- matrix( rbinom( n * m, 2, p_anc ), nrow = m, ncol = n ) # `simfam` requires names for the founders on `X_1`, to validate identities and align colnames( X_1 ) <- ids[[ 1 ]] ## ---- fig.width = 7, fig.height = 3.4, fig.align = 'center'------------------- library(popkin) # estimate kinship kinship_est_1 <- popkin( X_1 ) # true/expected kinship kinship_1 <- diag( n ) / 2 # assign same IDs as `X_1` and `fam`; `simfam` generally requires these to validate/align colnames( kinship_1 ) <- colnames( X_1 ) rownames( kinship_1 ) <- colnames( X_1 ) # plot them together for comparison plot_popkin( list( kinship_1, kinship_est_1 ), titles = c('Expected', 'Estimated'), names = TRUE, leg_width = 0.2, mar = c(3, 2) ) ## ----------------------------------------------------------------------------- X <- geno_fam( X_1, fam ) # expected kinship of entire pruned pedigree, starting from true kinship of founders. # Called "local" kinship because it ignores the ancestral population structure (will be reused) # NOTE: agrees with `kinship2::kinship( fam )`, but only because founders are unstructured kinship_local <- kinship_fam( kinship_1, fam ) ## ---- fig.width = 7, fig.height = 3.4, fig.align = 'center'------------------- # estimate kinship kinship_local_est <- popkin( X ) # plot them together for comparison plot_popkin( list( kinship_local, kinship_local_est ), titles = c('Expected', 'Estimated'), names = TRUE, names_cex = 0.8, leg_width = 0.2, mar = c(3, 2) ) ## ---- fig.width = 7, fig.height = 2.3, fig.align = 'center'------------------- # indexes (really logicals) marking individuals in last generation, # to subset data (which is aligned with `fam`) indexes_G <- fam$id %in% ids[[ G ]] # estimate kinship kinship_local_est <- popkin( X ) # plot them together for comparison plot_popkin( list( kinship_local[ indexes_G, indexes_G ], kinship_local_est[ indexes_G, indexes_G ], kinship_local_G ), titles = c('Expected', 'Estimated', 'Local'), names = TRUE, mar = c(3, 2) ) ## ----------------------------------------------------------------------------- library(bnpsd) # additional admixture parameters: # number of ancestries K <- 3 # define population structure # FST values for k=3 subpopulations inbr_subpops <- c(0.2, 0.4, 0.6) # admixture proportions from 1D geography (founders) admix_proportions_1 <- admix_prop_1d_linear( n, K, sigma = 1 ) # add founder names to `admix_proportions_1` (will propagate to other vars of interest) rownames( admix_proportions_1 ) <- ids[[ 1 ]] # also name subpopulations simply as S1, S2, S3 colnames( admix_proportions_1 ) <- paste0( 'S', 1:K ) # calculate true kinship matrix of the admixed founders # NOTE: overwrites `kinship` for unstructured founders kinship_1 <- coanc_to_kinship( coanc_admix( admix_proportions_1, inbr_subpops ) ) # draw genotypes from admixture model # NOTE: overwrites `X_1` for unstructured founders X_1 <- draw_all_admix( admix_proportions_1, inbr_subpops, m )$X ## ---- fig.width = 7, fig.height = 3.4, fig.align = 'center'------------------- # estimate kinship (NOTE: overwrites `kinship_est_1` for unstructured founders) kinship_est_1 <- popkin( X_1 ) # plot them together for comparison plot_popkin( list( kinship_1, kinship_est_1 ), titles = c('Expected', 'Estimated'), names = TRUE, leg_width = 0.2, mar = c(3, 2) ) ## ----------------------------------------------------------------------------- X <- geno_fam( X_1, fam ) # expected kinship of entire pruned pedigree, starting from true kinship of founders # NOTE: DOESN'T agree with `kinship2::kinship( fam )` anymore because founders are structured! kinship_total <- kinship_fam( kinship_1, fam ) ## ---- fig.width = 7, fig.height = 3.4, fig.align = 'center'------------------- # estimate kinship kinship_total_est <- popkin( X ) # plot them together for comparison plot_popkin( list( kinship_total, kinship_total_est ), titles = c('Expected', 'Estimated'), names = TRUE, names_cex = 0.8, leg_width = 0.2, mar = c(3, 2) ) ## ---- fig.width = 7, fig.height = 2.3, fig.align = 'center'------------------- plot_popkin( list( kinship_total[ indexes_G, indexes_G ], kinship_total_est[ indexes_G, indexes_G ], kinship_local_G ), titles = c('Total Expected', 'Total Estimated', 'Local'), names = TRUE, mar = c(3, 2) ) ## ---- fig.width = 7, fig.height = 2.5, fig.align = 'center'------------------- # admixture proportions of pruned family admix_proportions <- admix_fam( admix_proportions_1, fam ) # visualize as bar plot # for nice colors library(RColorBrewer) # colors for independent subpopulations col_subpops <- brewer.pal( K, "Paired" ) # shrink default margins par_orig <- par(mar = c(4, 4, 0.5, 0) + 0.2) barplot( t( admix_proportions ), col = col_subpops, legend.text = TRUE, args.legend = list( cex = 0.7, bty = 'n' ), border = NA, space = 0, ylab = 'Admixture prop.', las = 2 ) mtext('Individuals', side = 1, line = 3) par( par_orig ) # reset `par` ## ----------------------------------------------------------------------------- # this is the admixture-only relatedness kinship_admix <- coanc_to_kinship( coanc_admix( admix_proportions, inbr_subpops ) ) ## ---- fig.width = 7, fig.height = 6.8, fig.align = 'center'------------------- # this is the approximation of the total kinship: # NOTE: a bit complicated because diagonal has to be transformed. # Approximation operates on inbreeding coefficients, not self-kinship, # so `popkin::inbr_diag` converts self-kinship to inbreeding, # `bnpsd::coanc_to_kinship` converts inbreeding back to self-kinship. # Off-diagonal values are unmodified by both functions. kinship_total_approx <- coanc_to_kinship( inbr_diag( kinship_local ) + inbr_diag( kinship_admix ) - inbr_diag( kinship_admix ) * inbr_diag( kinship_local ) ) # plot all model kinship matrices (no estimates from genotypes this time) plot_popkin( list( kinship_admix, kinship_local, kinship_total, kinship_total_approx ), titles = c('Admixture only', 'Family only', 'Total', 'Total approx.'), names = TRUE, names_cex = 0.8, leg_width = 0.2, layout_rows = 2, mar = c(3, 2) ) ## ----------------------------------------------------------------------------- library(simfam) # data dimensions n <- 500 # number of individuals per generation G <- 10 # number of generations K <- 3 # number of ancestries m <- 5000 # number of loci # draw the random pedigree with those properties. data <- sim_pedigree( n, G ) fam <- data$fam ids <- data$ids # use this local kinship calculation in plot kinship_local_G <- data$kinship_local # prune pedigree to speed up simulations/etc fam <- prune_fam( fam, ids[[ G ]] ) ### admixture model # define population structure # FST values for k=3 subpopulations inbr_subpops <- c(0.2, 0.4, 0.6) # admixture proportions from 1D geography (founders) admix_proportions_1 <- admix_prop_1d_linear( n, K, sigma = 1 ) # add founder names to `admix_proportions_1` (will propagate to other vars of interest) rownames( admix_proportions_1 ) <- ids[[ 1 ]] # also name subpopulations simply as S1, S2, S3 colnames( admix_proportions_1 ) <- paste0( 'S', 1:K ) # draw genotypes from admixture model # admixed founders X_1 <- draw_all_admix( admix_proportions_1, inbr_subpops, m )$X # draw genotypes, returning last generation only X_G <- geno_last_gen( X_1, fam, ids ) # total kinship estimated from genotypes kinship_total_G_est <- popkin( X_G ) # calculate true total kinship matrix kinship_total_1 <- coanc_to_kinship( coanc_admix( admix_proportions_1, inbr_subpops ) ) kinship_total_G <- kinship_last_gen( kinship_total_1, fam, ids ) # kinship due to admixture only admix_proportions_G <- admix_last_gen( admix_proportions_1, fam, ids ) kinship_admix_G <- coanc_to_kinship( coanc_admix( admix_proportions_G, inbr_subpops ) ) # total kinship approximation from components kinship_total_approx_G <- coanc_to_kinship( inbr_diag( kinship_local_G ) + inbr_diag( kinship_admix_G ) - inbr_diag( kinship_admix_G ) * inbr_diag( kinship_local_G ) ) ## ---- fig.width = 7, fig.height = 3.4, fig.align = 'center'------------------- plot_popkin( list( kinship_total_G, kinship_total_G_est ), titles = c('Expected', 'Estimated'), leg_width = 0.2, mar = c(3, 2) ) ## ---- fig.width = 7, fig.height = 6.8, fig.align = 'center'------------------- # plot all model kinship matrices (no estimates from genotypes this time) plot_popkin( list( kinship_admix_G, kinship_local_G, kinship_total_G, kinship_total_approx_G ), titles = c('Admixture only', 'Family only', 'Total', 'Total approx.'), leg_width = 0.2, layout_rows = 2, mar = c(3, 2) ) ## ---- fig.width = 7, fig.height = 4, fig.align = 'center'--------------------- # the IDs in `fam` (the pruned FAM table) contain the individuals of interest frac_per_gen <- vector('numeric', G) for ( g in 1 : G ) { # this is the desired fraction frac_per_gen[g] <- sum( fam$id %in% ids[[ g ]] ) / n } # visualize as a barplot barplot( frac_per_gen, names.arg = 1 : G, xlab = 'Generation', ylab = 'Frac. individuals with descendants' ) # add a line marking the maximum fraction of individuals per generation abline( h = 1, lty = 2, col = 'red' ) ## ----------------------------------------------------------------------------- # real human recombination map is also used to define chromosome numbers and lengths # however, to keep example brief we use only first two chromosomes map <- recomb_map_hg38[ 1L:2L ] # tricky part is need to define ancestral haplotypes, following the desired genetic structure # will simulate each chromosome separately haplos_anc <- vector( 'list', length( map ) ) for ( chr in 1L : length( map ) ) { # Positions matter for genetic map, so for this example choose random positions with a # desired density, here 100 per Kb on average, keep it sparse for small example pos_max <- max( map[[ chr ]]$pos ) m_loci_chr <- round( pos_max / 100000L ) pos_chr <- sample.int( pos_max, m_loci_chr ) # Draw haplotypes. Here we use admixture code to get genotypes # with same structure as before for first generation X_chr <- draw_all_admix( admix_proportions_1, inbr_subpops, m_loci_chr )$X # and randomly split genotypes into haplotypes # (there was no LD/phase so this works well enough for our purposes) H1_chr <- matrix( rbinom( X_chr, 1L, X_chr/2L ), nrow = m_loci_chr ) # second haplotype is complement of first given genotypes H2_chr <- X_chr - H1_chr # column names are required for code later, identifying ancestors (need _pat/mat suffixes) colnames( H1_chr ) <- paste0( colnames( X_chr ), '_pat' ) colnames( H2_chr ) <- paste0( colnames( X_chr ), '_mat' ) # final data all together X_chr <- cbind( H1_chr, H2_chr ) # add to structure, in a list haplos_anc[[ chr ]] <- list( X = X_chr, pos = pos_chr ) } # initialize trivial founders chromosomes (unrecombined chromosomes with unique labels) # must use IDs of first generation from pedigree as input founders <- recomb_init_founders( ids[[ 1 ]], map ) # draw recombination breaks along pedigree, with coordinates in genetic distance (centiMorgans) inds <- recomb_last_gen( founders, fam, ids ) # map recombination break coordinates to base pairs inds <- recomb_map_inds( inds, map ) # determine haplotypes of descendants given ancestral haplotypes haplos <- recomb_haplo_inds( inds, haplos_anc ) # and reduce haplotype data to genotypes, same standard matrix format as previous examples X_G_recomb <- recomb_geno_inds( haplos ) # total kinship estimated from genotypes (simulated with recombination) kinship_total_G_recomb_est <- popkin( X_G_recomb ) ## ---- fig.width = 7, fig.height = 2.3, fig.align = 'center'------------------- plot_popkin( list( kinship_total_G, kinship_total_G_est, kinship_total_G_recomb_est ), titles = c('Expected', 'Estimated (indep loci)', 'Estimated (recomb)'), mar = c(3, 2) ) ## ---- fig.width = 7, fig.height = 3.4, fig.align = 'center'------------------- # Notice that the number of loci is similar for both simulations # (rows of the genotype matrices) dim( X_G ) dim( X_G_recomb ) # Calculate correlation matrices between loci (hence transposition). # Exclude fixed loci to prevent warnings about zero standard deviations. ld_G <- abs( cor( t( X_G[ !fixed_loci( X_G ), ] ) ) ) ld_G_recomb <- abs( cor( t( X_G_recomb[ !fixed_loci( X_G_recomb ), ] ) ) ) # cap extreme values, to be able to see the relatively weak LD signal ld_max <- 0.3 ld_G[ ld_G > ld_max ] <- ld_max ld_G_recomb[ ld_G_recomb > ld_max ] <- ld_max # plot! plot_popkin( list( ld_G, ld_G_recomb ), titles = c('Indep loci', 'Recomb'), leg_width = 0.2, mar = c(3, 2), leg_title = 'Absolute Correlation', ylab = 'Loci' )