## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", crop = NULL ) ## ----installation, eval = FALSE----------------------------------------------- # if(!requireNamespace("BiocManager", quietly = TRUE)) { # install.packages("BiocManager") # } # # BiocManager::install("doubletrouble") # # ## Check that you have a valid Bioconductor installation # BiocManager::valid() ## ----load_package------------------------------------------------------------- library(doubletrouble) ## ----data1-------------------------------------------------------------------- # Load list of DIAMOND tabular output data(yeast_seq) head(yeast_seq) ## ----ex_data------------------------------------------------------------------ # Load annotation list processed with syntenet::process_input() data(yeast_annot) head(yeast_annot) ## ----process_input------------------------------------------------------------ library(syntenet) # Process input data pdata <- process_input(yeast_seq, yeast_annot) # Inspect the output names(pdata) pdata$seq pdata$annotation ## ----run_diamond_intraspecies------------------------------------------------- data(diamond_intra) # Run DIAMOND in sensitive mode for S. cerevisiae only if(diamond_is_installed()) { diamond_intra <- run_diamond( seq = pdata$seq["Scerevisiae"], compare = "intraspecies", outdir = file.path(tempdir(), "diamond_intra"), ... = "--sensitive" ) } # Inspect output names(diamond_intra) head(diamond_intra$Scerevisiae_Scerevisiae) ## ----binary_classification---------------------------------------------------- # Binary classification c_binary <- classify_gene_pairs( blast_list = diamond_intra, annotation = pdata$annotation, binary = TRUE ) # Inspecting the output names(c_binary) head(c_binary$Scerevisiae) # How many pairs are there for each duplication mode? table(c_binary$Scerevisiae$type) ## ----expanded_classification-------------------------------------------------- # Expanded classification c_expanded <- classify_gene_pairs( blast_list = diamond_intra, annotation = pdata$annotation, binary = FALSE ) # Inspecting the output names(c_expanded) head(c_expanded$Scerevisiae) # How many pairs are there for each duplication mode? table(c_expanded$Scerevisiae$type) ## ----blast_interspecies------------------------------------------------------- data(diamond_inter) # load pre-computed output in case DIAMOND is not installed # Create data frame of comparisons to be made comparisons <- data.frame( species = "Scerevisiae", outgroup = "Cglabrata" ) comparisons # Run DIAMOND for the comparison we specified if(diamond_is_installed()) { diamond_inter <- run_diamond( seq = pdata$seq, compare = comparisons, outdir = file.path(tempdir(), "diamond_inter"), ... = "--sensitive" ) } names(diamond_inter) head(diamond_inter$Scerevisiae_Cglabrata) ## ----full_classification------------------------------------------------------ # Full classification c_full <- classify_gene_pairs( blast_list = diamond_intra, annotation = pdata$annotation, binary = FALSE, blast_inter = diamond_inter ) # Inspecting the output names(c_full) head(c_full$Scerevisiae) # How many pairs are there for each duplication mode? table(c_full$Scerevisiae$type) ## ----classify_genes----------------------------------------------------------- # Classify genes into unique modes of duplication c_genes <- classify_genes(c_full) # Inspecting the output names(c_genes) head(c_genes$Scerevisiae) # Number of genes per mode table(c_genes$Scerevisiae$type) ## ----kaks_calculation--------------------------------------------------------- data(cds_scerevisiae) head(cds_scerevisiae) # Store DNAStringSet object in a list cds_list <- list(Scerevisiae = cds_scerevisiae) # Keep only top give gene pairs for demonstration purposes gene_pairs <- list(Scerevisiae = c_binary$Scerevisiae[seq(1, 5, by = 1), ]) # Calculate Ka, Ks, and Ka/Ks kaks <- pairs2kaks(gene_pairs, cds_list) # Inspect the output head(kaks) ## ----ks_eda------------------------------------------------------------------- # Load data and inspect it data(gmax_ks) head(gmax_ks) # Plot distribution library(ggplot2) ggplot(gmax_ks, aes(x = Ks)) + geom_histogram(color = "black", fill = "grey90", bins = 50) + theme_bw() ## ----find_ks_peaks------------------------------------------------------------ # Find 2 and 3 peaks and test which one has more support peaks <- find_ks_peaks(gmax_ks$Ks, npeaks = c(2, 3), verbose = TRUE) names(peaks) str(peaks) # Visualize Ks distribution plot_ks_peaks(peaks) ## ----find_peaks_explicit------------------------------------------------------ # Find 2 peaks ignoring Ks values > 1 peaks <- find_ks_peaks(gmax_ks$Ks, npeaks = 2, max_ks = 1) plot_ks_peaks(peaks) ## ----sizer-------------------------------------------------------------------- # Get numeric vector of Ks values <= 1 ks <- gmax_ks$Ks[gmax_ks$Ks <= 1] # Get SiZer map feature::SiZer(ks) ## ----split_by_peak------------------------------------------------------------ # Gene pairs without age-based classification head(gmax_ks) # Classify gene pairs by age group pairs_age_group <- split_pairs_by_peak(gmax_ks, peaks) # Inspecting the output names(pairs_age_group) # Take a look at the classified gene pairs head(pairs_age_group$pairs) # Visualize Ks distro with age boundaries pairs_age_group$plot ## ----session_info------------------------------------------------------------- sessioninfo::session_info()