## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>",
    crop = NULL
)

## ----installation, eval = FALSE-----------------------------------------------
#  if(!requireNamespace("BiocManager", quietly = TRUE)) {
#      install.packages("BiocManager")
#  }
#  
#  BiocManager::install("doubletrouble")
#  
#  ## Check that you have a valid Bioconductor installation
#  BiocManager::valid()

## ----load_package-------------------------------------------------------------
library(doubletrouble)

## ----data1--------------------------------------------------------------------
# Load list of DIAMOND tabular output
data(yeast_seq)
head(yeast_seq)

## ----ex_data------------------------------------------------------------------
# Load annotation list processed with syntenet::process_input()
data(yeast_annot)
head(yeast_annot)

## ----process_input------------------------------------------------------------
library(syntenet)

# Process input data
pdata <- process_input(yeast_seq, yeast_annot)

# Inspect the output
names(pdata)
pdata$seq
pdata$annotation

## ----run_diamond_intraspecies-------------------------------------------------
data(diamond_intra)

# Run DIAMOND in sensitive mode for S. cerevisiae only
if(diamond_is_installed()) {
    diamond_intra <- run_diamond(
        seq = pdata$seq["Scerevisiae"],
        compare = "intraspecies", 
        outdir = file.path(tempdir(), "diamond_intra"),
        ... = "--sensitive"
    )
}

# Inspect output
names(diamond_intra)
head(diamond_intra$Scerevisiae_Scerevisiae)

## ----binary_classification----------------------------------------------------
# Binary classification
c_binary <- classify_gene_pairs(
    blast_list = diamond_intra,
    annotation = pdata$annotation,
    binary = TRUE
)

# Inspecting the output
names(c_binary)
head(c_binary$Scerevisiae)

# How many pairs are there for each duplication mode?
table(c_binary$Scerevisiae$type)

## ----expanded_classification--------------------------------------------------
# Expanded classification
c_expanded <- classify_gene_pairs(
    blast_list = diamond_intra,
    annotation = pdata$annotation,
    binary = FALSE
)

# Inspecting the output
names(c_expanded)
head(c_expanded$Scerevisiae)

# How many pairs are there for each duplication mode?
table(c_expanded$Scerevisiae$type)

## ----blast_interspecies-------------------------------------------------------
data(diamond_inter) # load pre-computed output in case DIAMOND is not installed

# Create data frame of comparisons to be made
comparisons <- data.frame(
    species = "Scerevisiae",
    outgroup = "Cglabrata"
)
comparisons

# Run DIAMOND for the comparison we specified
if(diamond_is_installed()) {
    diamond_inter <- run_diamond(
        seq = pdata$seq,
        compare = comparisons,
        outdir = file.path(tempdir(), "diamond_inter"),
        ... = "--sensitive"
    )
}

names(diamond_inter)
head(diamond_inter$Scerevisiae_Cglabrata)

## ----full_classification------------------------------------------------------
# Full classification
c_full <- classify_gene_pairs(
    blast_list = diamond_intra,
    annotation = pdata$annotation,
    binary = FALSE,
    blast_inter = diamond_inter
)

# Inspecting the output
names(c_full)
head(c_full$Scerevisiae)

# How many pairs are there for each duplication mode?
table(c_full$Scerevisiae$type)

## ----classify_genes-----------------------------------------------------------
# Classify genes into unique modes of duplication
c_genes <- classify_genes(c_full)

# Inspecting the output
names(c_genes)
head(c_genes$Scerevisiae)

# Number of genes per mode
table(c_genes$Scerevisiae$type)

## ----kaks_calculation---------------------------------------------------------
data(cds_scerevisiae)
head(cds_scerevisiae)

# Store DNAStringSet object in a list
cds_list <- list(Scerevisiae = cds_scerevisiae)

# Keep only top give gene pairs for demonstration purposes
gene_pairs <- list(Scerevisiae = c_binary$Scerevisiae[seq(1, 5, by = 1), ])

# Calculate Ka, Ks, and Ka/Ks
kaks <- pairs2kaks(gene_pairs, cds_list)

# Inspect the output
head(kaks)

## ----ks_eda-------------------------------------------------------------------
# Load data and inspect it
data(gmax_ks)
head(gmax_ks)

# Plot distribution
library(ggplot2)
ggplot(gmax_ks, aes(x = Ks)) +
    geom_histogram(color = "black", fill = "grey90", bins = 50) +
    theme_bw()

## ----find_ks_peaks------------------------------------------------------------
# Find 2 and 3 peaks and test which one has more support
peaks <- find_ks_peaks(gmax_ks$Ks, npeaks = c(2, 3), verbose = TRUE)
names(peaks)
str(peaks)

# Visualize Ks distribution
plot_ks_peaks(peaks)

## ----find_peaks_explicit------------------------------------------------------
# Find 2 peaks ignoring Ks values > 1
peaks <- find_ks_peaks(gmax_ks$Ks, npeaks = 2, max_ks = 1)
plot_ks_peaks(peaks)

## ----sizer--------------------------------------------------------------------
# Get numeric vector of Ks values <= 1
ks <- gmax_ks$Ks[gmax_ks$Ks <= 1]

# Get SiZer map
feature::SiZer(ks)

## ----split_by_peak------------------------------------------------------------
# Gene pairs without age-based classification
head(gmax_ks)

# Classify gene pairs by age group
pairs_age_group <- split_pairs_by_peak(gmax_ks, peaks)

# Inspecting the output
names(pairs_age_group)

# Take a look at the classified gene pairs
head(pairs_age_group$pairs)

# Visualize Ks distro with age boundaries
pairs_age_group$plot

## ----session_info-------------------------------------------------------------
sessioninfo::session_info()