## ----setup, include=FALSE----------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 8, fig.height = 6, warning = FALSE, message = FALSE ) ## ----install, eval=FALSE------------------------------------------------------ # # From Bioconductor (when available) # if (!require("BiocManager", quietly = TRUE)) # install.packages("BiocManager") # BiocManager::install("ClonalSim") # # # From GitHub (development version) # BiocManager::install("gbucci/ClonalSim") ## ----library------------------------------------------------------------------ library(ClonalSim) ## ----basic_sim---------------------------------------------------------------- # Set seed for reproducibility set.seed(123) sim <- simulateTumor( subclone_freqs = c(0.15, 0.25, 0.30, 0.30), n_mut_per_clone = c(20, 25, 30, 15), n_mut_founder = 10 ) # Display summary sim ## ----access_results----------------------------------------------------------- # Get mutation data mutations <- getMutations(sim) head(mutations) # Get simulation parameters params <- getSimParams(sim) params$subclone_freqs # Get true vs observed VAF true_vaf <- getTrueVAF(sim) obs_vaf <- getObservedVAF(sim) # Compare summary(true_vaf) summary(obs_vaf) ## ----plot_vaf_density, fig.cap="VAF density plot showing mixed tumor sample"---- # ggplot2 is already loaded above plot(sim, type = "vaf_density") ## ----plot_vaf_scatter, fig.cap="VAF scatter plot colored by mutation type"---- plot(sim, type = "vaf_scatter") ## ----plot_depth, fig.cap="Sequencing depth distribution"---------------------- plot(sim, type = "depth_histogram") ## ----plot_matrix, fig.cap="Clonal matrix showing mutation presence"----------- plot(sim, type = "clone_matrix") ## ----bio_noise_demo, fig.cap="Effect of biological noise concentration parameter"---- # High heterogeneity (low concentration) set.seed(123) sim_high_het <- simulateTumor( subclone_freqs = c(0.3, 0.4, 0.3), n_mut_per_clone = c(30, 40, 30), biological_noise = list(enabled = TRUE, concentration = 20) ) # Low heterogeneity (high concentration) set.seed(123) sim_low_het <- simulateTumor( subclone_freqs = c(0.3, 0.4, 0.3), n_mut_per_clone = c(30, 40, 30), biological_noise = list(enabled = TRUE, concentration = 100) ) # Compare VAF spread par(mfrow = c(1, 2)) hist(getTrueVAF(sim_high_het), main = "High Heterogeneity\n(concentration = 20)", xlab = "True VAF", breaks = 30, col = "skyblue") hist(getTrueVAF(sim_low_het), main = "Low Heterogeneity\n(concentration = 100)", xlab = "True VAF", breaks = 30, col = "lightcoral") ## ----depth_demo, fig.cap="Depth distributions: negative binomial vs Poisson"---- # Realistic (negative binomial) set.seed(123) sim_nb <- simulateTumor( sequencing_noise = list( mean_depth = 100, depth_variation = "negative_binomial", depth_dispersion = 20 ) ) # Unrealistic (Poisson) set.seed(124) sim_poisson <- simulateTumor( sequencing_noise = list( mean_depth = 100, depth_variation = "poisson" ) ) par(mfrow = c(1, 2)) hist(getMutations(sim_nb)$Depth, main = "Negative Binomial\n(realistic)", xlab = "Depth", breaks = 30, col = "skyblue") hist(getMutations(sim_poisson)$Depth, main = "Poisson\n(unrealistic)", xlab = "Depth", breaks = 30, col = "lightcoral") ## ----binomial_demo, fig.cap="Stochastic variation from binomial sampling"----- # With binomial sampling (realistic) set.seed(123) sim_binom <- simulateTumor( sequencing_noise = list(binomial_sampling = TRUE) ) # Without binomial sampling (deterministic) set.seed(123) sim_det <- simulateTumor( sequencing_noise = list(binomial_sampling = FALSE) ) # Compare True_VAF vs observed VAF par(mfrow = c(1, 2)) with(getMutations(sim_binom), { plot(True_VAF, VAF, main = "With Binomial Sampling", xlab = "True VAF", ylab = "Observed VAF", pch = 16, col = rgb(0,0,1,0.3)) abline(0, 1, col = "red", lty = 2) }) with(getMutations(sim_det), { plot(True_VAF, VAF, main = "Without Binomial Sampling", xlab = "True VAF", ylab = "Observed VAF", pch = 16, col = rgb(0,0,1,0.3)) abline(0, 1, col = "red", lty = 2) }) ## ----low_purity--------------------------------------------------------------- set.seed(123) sim_low_purity <- simulateTumor( subclone_freqs = c(0.05, 0.10, 0.15), # Sum = 0.30 (30% tumor) n_mut_per_clone = c(20, 25, 30) ) cat("Tumor purity:", sum(getSimParams(sim_low_purity)$subclone_freqs), "\n") plot(sim_low_purity, type = "vaf_density") ## ----high_coverage------------------------------------------------------------ set.seed(123) sim_deep <- simulateTumor( sequencing_noise = list( enabled = TRUE, mean_depth = 500, depth_dispersion = 100, # More uniform error_rate = 0.0005 ) ) summary(getMutations(sim_deep)$Depth) plot(sim_deep, type = "depth_histogram") ## ----low_coverage------------------------------------------------------------- set.seed(123) sim_exome <- simulateTumor( sequencing_noise = list( mean_depth = 50, depth_dispersion = 10, # More variable error_rate = 0.002 ) ) summary(getMutations(sim_exome)$Depth) ## ----no_noise----------------------------------------------------------------- set.seed(123) sim_ideal <- simulateTumor( biological_noise = list(enabled = FALSE), sequencing_noise = list(enabled = FALSE) ) # VAF exactly matches expected frequencies head(getMutations(sim_ideal)[, c("Clone", "True_VAF", "VAF")]) ## ----germline_variants, fig.cap="VAF distribution showing germline peak at 0.5"---- # 70% purity tumor with germline variants set.seed(789) sim_germline <- simulateTumor( subclone_freqs = c(0.3, 0.4), # 70% tumor purity n_mut_per_clone = c(40, 50), germline_variants = list( enabled = TRUE, n_variants = 150, # Number of germline SNPs vaf_expected = 0.5 # Heterozygous diploid ) ) # Check mutation types table(getMutations(sim_germline)$Type) # Compare VAFs germline_vafs <- getMutations(sim_germline)[getMutations(sim_germline)$Type == "germline", "VAF"] somatic_vafs <- getMutations(sim_germline)[getMutations(sim_germline)$Type != "germline", "VAF"] cat("Germline VAF (expected ~0.5):", round(mean(germline_vafs), 3), "\n") cat("Somatic VAF mean:", round(mean(somatic_vafs), 3), "\n") # Plot showing germline peak plot(sim_germline, type = "vaf_density") ## ----granges------------------------------------------------------------------ library(GenomicRanges) gr <- toGRanges(sim) gr # Access metadata mcols(gr)[1:5, ] # Subset by chromosome gr_chr1 <- gr[seqnames(gr) == "chr1"] length(gr_chr1) ## ----vcf---------------------------------------------------------------------- # Get VRanges object vr <- suppressWarnings(toVCF(sim, sample_name = "TumorSample")) vr # Write to VCF file (using tempdir to avoid polluting user's filesystem) vcf_file <- tempfile(fileext = ".vcf") suppressWarnings(toVCF(sim, output_file = vcf_file)) file.exists(vcf_file) ## ----pyclone------------------------------------------------------------------ pyclone_file <- tempfile(fileext = ".tsv") toPyClone(sim, file = pyclone_file, sample_id = "sample1") head(read.table(pyclone_file, header = TRUE, sep = "\t")) ## ----sciclone----------------------------------------------------------------- sciclone_file <- tempfile(fileext = ".tsv") toSciClone(sim, file = sciclone_file) head(read.table(sciclone_file, header = TRUE, sep = "\t")) ## ----csv---------------------------------------------------------------------- df <- toDataFrame(sim) csv_file <- tempfile(fileext = ".csv") write.csv(df, csv_file, row.names = FALSE) head(read.csv(csv_file)) ## ----benchmark_workflow, eval=FALSE------------------------------------------- # # 1. Generate ground truth # set.seed(42) # sim <- simulateTumor( # subclone_freqs = c(0.2, 0.3, 0.5), # n_mut_per_clone = c(50, 75, 50) # ) # # # 2. Export to VCF (ground truth) # toVCF(sim, output_file = "ground_truth.vcf") # # # 3. Get true positive set # true_mutations <- getMutations(sim) # # # 4. Run your variant caller on simulated data # # (your variant caller code here) # # # 5. Compare results # # - Sensitivity: TP / (TP + FN) # # - Precision: TP / (TP + FP) # # - F1 score: 2 * (Precision * Sensitivity) / (Precision + Sensitivity) # # # 6. Evaluate VAF estimation accuracy # # cor(true_mutations$VAF, called_vaf) ## ----complex_hierarchy-------------------------------------------------------- set.seed(123) sim_complex <- simulateTumor( subclone_freqs = c(0.1, 0.15, 0.2, 0.25, 0.3), n_mut_per_clone = c(30, 40, 50, 40, 30), n_mut_shared = list( "2 3 4 5" = 20, # Shared by clones 2,3,4,5 "3 4 5" = 15, # Shared by clones 3,4,5 "4 5" = 10, # Shared by clones 4,5 "1 2" = 8 # Shared by clones 1,2 ) ) plot(sim_complex, type = "vaf_density") ## ----clonal_structure--------------------------------------------------------- structure <- getClonalStructure(sim_complex) print(structure) ## ----sessioninfo-------------------------------------------------------------- sessionInfo()