## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5,
  out.width = "80%",
  fig.align = "center",
echo = TRUE, 
  crop = NULL # suppress "The magick package is required to crop" issue
)
library(BiocStyle)

## ----eval=FALSE---------------------------------------------------------------
# if (!requireNamespace("BiocManager", quietly = TRUE)) {
#   install.packages("BiocManager")
# }
# 
# BiocManager::install("peakCombiner")

## ----eval=TRUE----------------------------------------------------------------
library("peakCombiner")
library("ggplot2")

## ----eval=TRUE----------------------------------------------------------------
utils::data("syn_data_tibble")

## ----eval=TRUE----------------------------------------------------------------
data_prepared <- prepareInputRegions(
  data = syn_data_tibble,
  outputFormat = "tibble",
  showMessages = FALSE
)

centerExpandRegions(
  data = data_prepared,
  centerBy = "center_column",
  expandBy = NULL,
  genome = NA,
  trim_start = TRUE,
  outputFormat = "GenomicRanges",
  showMessages = FALSE
)

data_center_expand <- centerExpandRegions(
  data = data_prepared,
  centerBy = "center_column",
  expandBy = NULL,
  genome = NA,
  trim_start = TRUE,
  outputFormat = "tibble",
  showMessages = FALSE
)

data_filtered <- filterRegions(
  data = data_center_expand,
  includeByChromosomeName = NULL,
  excludeByBlacklist = NULL,
  includeAboveScoreCutoff = NULL,
  includeTopNScoring = NULL, 
  outputFormat = "tibble",
  showMessages = FALSE
)

consensus_peak <- combineRegions(
  data = data_filtered,
  foundInSamples = 2,
  combinedCenter = "nearest",
  annotateWithInputNames = FALSE,
  combinedSampleName = "my_new_sample_name",
  outputFormat = "tibble",
  showMessages = FALSE
)

consensus_final <- centerExpandRegions(
  data = consensus_peak,
  expandBy = 350,
  outputFormat = "tibble",
  showMessages = FALSE
)

consensus_final

## ----eval=FALSE---------------------------------------------------------------
# rtracklayer::export.bed(consensus_final, paste0(here::here(), "/lists/consensus_regions.bed"), format = "bed")

## ----eval=TRUE----------------------------------------------------------------
utils::data("syn_sample_sheet", package = "peakCombiner")
syn_sample_sheet

## ----eval=TRUE----------------------------------------------------------------
utils::data("syn_data_control01", package = "peakCombiner")
syn_data_control01

## ----eval=TRUE----------------------------------------------------------------
utils::data("syn_data_treatment01", package = "peakCombiner")
syn_data_treatment01

## ----eval=TRUE----------------------------------------------------------------
combined_input <- syn_data_control01 |>
  dplyr::mutate(sample_name = "control-rep1") |>
  rbind(syn_data_treatment01 |>
    dplyr::mutate(sample_name = "treatment-rep1"))

combined_input |>
  dplyr::group_by(sample_name) |>
  dplyr::summarize(num_regions = dplyr::n())

## ----eval=TRUE----------------------------------------------------------------
prepareInputRegions(
  data = combined_input,
  outputFormat = "tibble",
  startsAreBased = 1,
  showMessages = FALSE
)

## ----eval=TRUE----------------------------------------------------------------
prepareInputRegions(
  data = combined_input,
  outputFormat = "tibble",
  startsAreBased = 0,
  showMessages = FALSE
)

## ----eval=FALSE---------------------------------------------------------------
# file_names

## ----eval=TRUE----------------------------------------------------------------
utils::data("syn_sample_sheet", package = "peakCombiner")
sample_sheet <- syn_sample_sheet

sample_sheet

## ----eval = FALSE-------------------------------------------------------------
# prepareInputRegions(
#   data = sample_sheet,
#   startsAreBased = 0,
#   showMessages = FALSE
# )

## ----eval=TRUE----------------------------------------------------------------
utils::data(syn_control_rep1_narrowPeak)
syn_control_rep1_narrowPeak

## ----eval=TRUE----------------------------------------------------------------
utils::data(syn_treatment_rep1_narrowPeak)
syn_treatment_rep1_narrowPeak

## ----eval=TRUE----------------------------------------------------------------
control <- syn_control_rep1_narrowPeak |>
  dplyr::mutate(sample_name = "control-rep1")
control

## ----eval=TRUE----------------------------------------------------------------
treatment <- syn_treatment_rep1_narrowPeak |>
  dplyr::mutate(sample_name = "treatment-rep1")
treatment

## ----eval=TRUE----------------------------------------------------------------
combined_input <- control |>
  rbind(treatment)
combined_input

## ----eval=TRUE----------------------------------------------------------------
combined_input |>
  dplyr::group_by(sample_name) |>
  dplyr::count(name = "number_of_entries")

## ----eval=TRUE----------------------------------------------------------------
prepareInputRegions(
  data = combined_input,
  outputFormat = "tibble",
  startsAreBased = 1,
  showMessages = FALSE
)

## ----eval=TRUE----------------------------------------------------------------
utils::data("syn_data_granges")
syn_data_granges

## ----eval=TRUE----------------------------------------------------------------
GenomicRanges_data <- GenomicRanges::makeGRangesFromDataFrame(
  syn_data_granges,
  keep.extra.columns = TRUE
)
GenomicRanges_data

## ----eval=TRUE----------------------------------------------------------------
prepareInputRegions(
  data = GenomicRanges_data,
  outputFormat = "GenomicRanges",
  showMessages = FALSE
)

## ----eval=TRUE----------------------------------------------------------------
utils::data("syn_data_bed")
syn_data_bed |> 
  dplyr::arrange(sample_name)

## ----eval=TRUE----------------------------------------------------------------
syn_data_bed |>
  dplyr::group_by(sample_name) |>
  dplyr::summarize(num_regions = dplyr::n())

## ----eval=TRUE----------------------------------------------------------------
prepareInputRegions(
  data = syn_data_bed,
  outputFormat = "data.frame",
   startsAreBased = 0,
  showMessages = FALSE
)

## ----eval=TRUE----------------------------------------------------------------
centerExpandRegions(
  data = data_prepared,
  centerBy = "center_column",
  expandBy = NULL,
  outputFormat = "tibble",
  showMessage = FALSE
)

## ----eval=TRUE----------------------------------------------------------------
centerExpandRegions(
  data = data_prepared,
  centerBy = "center_column",
  expandBy = NULL,
  outputFormat = "tibble",
  showMessages = TRUE
)

## ----eval=TRUE----------------------------------------------------------------
centerExpandRegions(
  data = data_prepared,
  centerBy = "center_column",
  expandBy = c(500),
  outputFormat = "tibble",
  showMessages = FALSE
)

## ----eval=TRUE----------------------------------------------------------------
centerExpandRegions(
  data = data_prepared,
  centerBy = "center_column",
  expandBy = c(100, 1000),
  outputFormat = "tibble",
  showMessages = FALSE
)

## ----eval=TRUE----------------------------------------------------------------
backlist <- tibble::tibble(chrom = c("chr1"),
                           start = c(100),
                           end = c(1000))
backlist


## ----eval=TRUE----------------------------------------------------------------
data_filtered <- filterRegions(
  data = data_center_expand,
  excludeByBlacklist = backlist,
  outputFormat = "tibble",
  showMessages = FALSE
)

data_filtered

## ----eval=TRUE----------------------------------------------------------------
filterRegions(
  data = data_center_expand,
  includeByChromosomeName = c("chr1", "chr2", "chr4"),
  excludeByBlacklist = backlist,
  includeAboveScoreCutoff = 2.5,
  includeTopNScoring = 6,
  outputFormat = "tibble",
  showMessages = TRUE
)

## ----eval=TRUE----------------------------------------------------------------
input_chrom <-
  data_center_expand |>
  dplyr::select(chrom) |>
  unique()
input_chrom

## ----eval=TRUE----------------------------------------------------------------
include_chrom <- input_chrom |>
  dplyr::filter(grepl("^chr[0-9]$|^chr[1-2][0-9]$|^chr[XYM]", chrom)) |>
  dplyr::pull(chrom) |>
  unique() |>
  sort()

include_chrom

## ----eval=TRUE----------------------------------------------------------------
data_filtered_chr <- filterRegions(
  data = data_center_expand,
  includeByChromosomeName = include_chrom,
  excludeByBlacklist = NULL,
  includeAboveScoreCutoff = NULL,
  includeTopNScoring = NULL,
  outputFormat = "tibble",
  showMessages = FALSE
)
data_filtered_chr

## ----eval=TRUE----------------------------------------------------------------
data_center_expand |>
  dplyr::group_by(sample_name) |>
  dplyr::summarize(num_regions = dplyr::n())

## ----eval=TRUE----------------------------------------------------------------
data_filtered_chr |>
  dplyr::group_by(chrom) |>
  dplyr::summarize(num_regions = dplyr::n())

## ----download_blacklist_bed, eval=FALSE---------------------------------------
# # Download the blacklist BED file from ENCODE and save it as "blacklist_human.bed"
# download.file(
#   url = "https://www.encodeproject.org/files/ENCFF356LFX/@@download/ENCFF356LFX.bed.gz",
#   destfile = "blacklist_human.bed.gz",
#   mode = "wb"
# )
# 
# 
# # Import the BED file
# blacklist_granges <- readr::read_tsv("blacklist_human.bed.gz", col_names = c("chrom", "start", "end"))
# blacklist_granges

## ----apply_downloaded_blacklist_bed, eval=FALSE-------------------------------
# filterRegions(
#   data = data_center_expand,
#   excludeByBlacklist = blacklist_granges,
#   showMessages = FALSE)

## -----------------------------------------------------------------------------
# Load library (AnnotationHub would need to be added to DESCRIPTION under Suggests if used in the vignette)
library(AnnotationHub)

# Get annotation hub handle (will download index if called for the first time)
ah <- AnnotationHub::AnnotationHub()

# List ENCODE related data providers
unique(grep("ENCODE", ah$dataprovider, value = TRUE))

# Query for "blacklist"
dm <- AnnotationHub::query(ah, "blacklist")
dm

# get specific object (will be downloaded if called for the first time)
blacklist_ah <- dm[["AH107343"]]

# get info for a specific object
AnnotationHub::recordStatus(ah, "AH107343")
AnnotationHub::getInfoOnIds(ah, "AH107343")
mcols(dm)["AH107343",]
mcols(dm)["AH107343","description"]

## ----view_black_list_annotationhub, eval=FALSE--------------------------------
# blacklist_ah

## ----apply_annotationhub_blacklist, eval=FALSE--------------------------------
# filterRegions(
#   data = data_center_expand,
#   excludeByBlacklist = blacklist_ah,
#   showMessages = FALSE)

## ----eval=TRUE----------------------------------------------------------------
data_center_expand |>
  ggplot2::ggplot(ggplot2::aes(x = score)) +
  ggplot2::geom_histogram(binwidth = 10) +
  ggplot2::xlim(0,NA)

## ----eval=TRUE----------------------------------------------------------------
data_filtered_cutoff <- filterRegions(
  data = data_center_expand,
  includeAboveScoreCutoff = 35,
  outputFormat = "tibble",
  showMessages = FALSE
)

## ----eval=TRUE----------------------------------------------------------------
dim(data_center_expand)
dim(data_filtered_cutoff)

## ----eval=TRUE----------------------------------------------------------------
range(data_center_expand |>
  dplyr::pull(score))
range(data_filtered_cutoff |>
  dplyr::pull(score))

## ----eval=TRUE----------------------------------------------------------------
data_filtered_cutoff |>
  ggplot2::ggplot(ggplot2::aes(x = score)) +
  ggplot2::geom_histogram(binwidth = 10) +
  ggplot2::xlim(0, NA)

## ----eval=TRUE----------------------------------------------------------------
data_center_expand |>
  dplyr::group_by(sample_name) |>
  dplyr::summarize(num_regions = dplyr::n())

filterRegions(
  data = data_center_expand,
  includeTopNScoring = 8,
  outputFormat = "tibble",
  showMessages = FALSE
) |>
  dplyr::group_by(sample_name) |>
  dplyr::summarize(num_regions = dplyr::n())

## ----eval=TRUE----------------------------------------------------------------
combineRegions(
  data = data_filtered,
  foundInSamples = 2,
  combinedCenter = "nearest",
  annotateWithInputNames = FALSE,
  combinedSampleName = "my_new_sample_name",
  outputFormat = "tibble",
  showMessage = FALSE
)

## ----eval=TRUE----------------------------------------------------------------
consensus_peak_list <- combineRegions(
  data = data_filtered,
  foundInSamples = 2,
  combinedCenter = "nearest",
  annotateWithInputNames = TRUE,
  combinedSampleName = "consensus",
  outputFormat = "tibble",
  showMessages = TRUE
)
consensus_peak_list

## ----eval=TRUE----------------------------------------------------------------
combineRegions(
  data = data_filtered,
  foundInSamples = 2,
  outputFormat = "tibble",
  showMessages = FALSE,
  combinedSampleName = "foundInSamples_2_example"
)

## ----eval=TRUE----------------------------------------------------------------
combineRegions(
  data = data_filtered,
  foundInSamples = 1,
  outputFormat = "tibble",
  showMessages = FALSE,
  combinedSampleName = "foundInSamples_1_example"
)

## ----eval=TRUE----------------------------------------------------------------
combineRegions(
  data = data_filtered,
  combinedCenter = "strongest",
  outputFormat = "tibble",
  showMessages = FALSE
) |> dplyr::select("center", "score")

## ----eval=TRUE----------------------------------------------------------------
combineRegions(
  data = data_filtered,
  combinedCenter = "middle",
  outputFormat = "tibble",
  showMessages = FALSE
) |> dplyr::select("center", "score")

## ----eval=TRUE----------------------------------------------------------------
combineRegions(
  data = data_filtered,
  combinedCenter = "nearest",
  outputFormat = "tibble",
  showMessages = FALSE
) |> dplyr::select("center", "score")

## ----session, eval=TRUE-------------------------------------------------------
sessionInfo()