## ----setup, echo=FALSE-----------------------------------------------------
knitr::opts_chunk$set(collapse=TRUE, comment = "#>")
suppressPackageStartupMessages(library(universalmotif))
suppressPackageStartupMessages(library(Biostrings))
suppressMessages(suppressPackageStartupMessages(library(MotifDb)))
suppressMessages(suppressPackageStartupMessages(library(ggtree)))
data(ArabidopsisPromoters)
data(ArabidopsisMotif)

## --------------------------------------------------------------------------
library(universalmotif)

motif <- create_motif("CWWWWCC", nsites = 6)
sequences <- DNAStringSet(rep(c("CAAAACC", "CTTTTCC"), 3))
motif.k2 <- add_multifreq(motif, sequences, add.k = 2)

## Alternatively:
# motif.k2 <- create_motif(sequences, add.multifreq = 2)

motif.k2

## --------------------------------------------------------------------------
library(universalmotif)
data(ArabidopsisMotif)
data(ArabidopsisPromoters)

enrich_motifs(ArabidopsisMotif, ArabidopsisPromoters, shuffle.k = 3,
              threshold = 0.001, verbose = 0, progress = FALSE, RC = TRUE)

## --------------------------------------------------------------------------
library(universalmotif)
data(ArabidopsisMotif)
data(ArabidopsisPromoters)

enrich_motifs(ArabidopsisMotif, ArabidopsisPromoters, shuffle.k = 3,
              search.mode = "positional", verbose = 0, progress = FALSE,
              RC = TRUE)

## --------------------------------------------------------------------------
library(universalmotif)

m <- matrix(c(0.10,0.27,0.23,0.19,0.29,0.28,0.51,0.12,0.34,0.26,
              0.36,0.29,0.51,0.38,0.23,0.16,0.17,0.21,0.23,0.36,
              0.45,0.05,0.02,0.13,0.27,0.38,0.26,0.38,0.12,0.31,
              0.09,0.40,0.24,0.30,0.21,0.19,0.05,0.30,0.31,0.08),
            byrow = TRUE, nrow = 4)
motif <- create_motif(m, alphabet = "DNA", type = "PWM")
motif

## --------------------------------------------------------------------------
data(ArabidopsisPromoters)

res <- scan_sequences(motif, ArabidopsisPromoters, verbose = 0,
                      progress = FALSE, threshold = 0.8,
                      threshold.type = "logodds")
head(res)

## --------------------------------------------------------------------------
## The second match was CTCTTTATTC, with a score of 3.896 (max possible = 6.403)

library(Biostrings)

bkg <- colMeans(oligonucleotideFrequency(ArabidopsisPromoters, 1,
                                         as.prob = TRUE))
bkg

## --------------------------------------------------------------------------
hit.prob <- bkg["A"]^1 * bkg["C"]^3 * bkg["G"]^0 * bkg["T"]^6
hit.prob <- unname(hit.prob)
hit.prob

## --------------------------------------------------------------------------
res <- res[1:6, ]
pvals <- motif_pvalue(motif, res$score, progress = FALSE, bkg.probs = bkg)
res2 <- data.frame(motif=res$motif,match=res$match,pval=pvals)[order(pvals), ]
knitr::kable(res2, digits = 22, row.names = FALSE, format = "markdown")

## --------------------------------------------------------------------------
scores <- c(-6, -3, 0, 3, 6)
k <- c(2, 4, 6, 8, 10)
out <- data.frame(k = c(2, 4, 6, 8, 10),
                  score.minus6 = rep(0, 5),
                  score.minus3 = rep(0, 5),
                  score.0 = rep(0, 5),
                  score.3 = rep(0, 5),
                  score.6 = rep(0, 5))

for (i in seq_along(scores)) {
  for (j in seq_along(k)) {
    out[j, i + 1] <- motif_pvalue(motif, scores[i], k = k[j], progress = FALSE,
                                  bkg.probs = bkg)
  }
}

knitr::kable(out, format = "markdown", digits = 10)

## --------------------------------------------------------------------------
bkg <- c(A=0.25, C=0.25, G=0.25, T=0.25)
pvals <- c(0.1, 0.01, 0.001, 0.0001, 0.00001)
scores <- motif_pvalue(motif, pvalue = pvals, progress = FALSE,
                       bkg.probs = bkg, k = 10)

scores.approx6 <- motif_pvalue(motif, pvalue = pvals, progress = FALSE,
                               bkg.probs = bkg, k = 6)
scores.approx8 <- motif_pvalue(motif, pvalue = pvals, progress = FALSE,
                               bkg.probs = bkg, k = 8)

pvals.exact <- motif_pvalue(motif, score = scores, progress = FALSE,
                            bkg.probs = bkg, k = 10)

pvals.approx6 <- motif_pvalue(motif, score = scores, progress = FALSE,
                              bkg.probs = bkg, k = 6)
pvals.approx8 <- motif_pvalue(motif, score = scores, progress = FALSE,
                              bkg.probs = bkg, k = 8)

res <- data.frame(pvalue = pvals, score = scores,
                  pvalue.exact = pvals.exact,
                  pvalue.k6 = pvals.approx6,
                  pvalue.k8 = pvals.approx8,
                  score.k6 = scores.approx6,
                  score.k8 = scores.approx8)
knitr::kable(res, format = "markdown", digits = 22)

## --------------------------------------------------------------------------
bkg <- c(A=0.25, C=0.25, G=0.25, T=0.25)
scores <- -2:6
pvals <- motif_pvalue(motif, score = scores, progress = FALSE,
                      bkg.probs = bkg, k = 10)

scores.exact <- motif_pvalue(motif, pvalue = pvals, progress = FALSE,
                             bkg.probs = bkg, k = 10)

scores.approx6 <- motif_pvalue(motif, pvalue = pvals, progress = FALSE,
                               bkg.probs = bkg, k = 6)
scores.approx8 <- motif_pvalue(motif, pvalue = pvals, progress = FALSE,
                               bkg.probs = bkg, k = 8)

pvals.approx6 <- motif_pvalue(motif, score = scores, progress = FALSE,
                              bkg.probs = bkg, k = 6)
pvals.approx8 <- motif_pvalue(motif, score = scores, progress = FALSE,
                              bkg.probs = bkg, k = 8)

res <- data.frame(score = scores, pvalue = pvals,
                  pvalue.k6 = pvals.approx6,
                  pvalue.k8 = pvals.approx8,
                  score.exact = scores.exact,
                  score.k6 = scores.approx6,
                  score.k8 = scores.approx8)
knitr::kable(res, format = "markdown", digits = 22)

## --------------------------------------------------------------------------
library(universalmotif)
library(MotifDb)

motifs <- convert_motifs(MotifDb)
motifs <- filter_motifs(motifs, altname = c("M0003_1.02", "M0004_1.02"))
summarise_motifs(motifs)
try_all <- function(motifs, ...) {
  scores <- numeric(8)
  methods <- c("MPCC", "PCC", "MEUCL", "EUCL", "MSW", "SW", "MKL", "KL")
  for (i in 1:8) {
    scores[i] <- compare_motifs(motifs, method = methods[i],
                                progress = FALSE, ...)[1, 2]
  }
  res <- data.frame(method = c("MPCC", "PCC", "MEUCL", "EUCL",
                                "MSW", "SW", "MKL", "KL"),
                     score = scores,
                     max = c(1, NA, sqrt(2), NA, 2, NA, NA, NA),
                     type = c("similarity", "similarity", "distance",
                              "distance", "similarity", "similarity",
                              "distance", "distance"))
  knitr::kable(res, format = "markdown")
}
try_all(motifs)

## --------------------------------------------------------------------------
try_all(motifs, tryRC = FALSE)

## --------------------------------------------------------------------------
try_all(motifs, use.type = "ICM")

## --------------------------------------------------------------------------
try_all(motifs, normalise.scores = TRUE)

## --------------------------------------------------------------------------
view_motifs(motifs)

## --------------------------------------------------------------------------
view_motifs(motifs, min.overlap = 99)
try_all(motifs, min.overlap = 99)
try_all(motifs, min.overlap = 99, normalise.scores = TRUE)

## --------------------------------------------------------------------------
try_all(motifs, min.overlap = 1)
view_motifs(motifs, min.overlap = 1)

## --------------------------------------------------------------------------
motifs2 <- filter_motifs(MotifDb, altname = c("M0100_1.02", "M0104_1.02"))
view_motifs(motifs2, min.mean.ic = 0)
try_all(motifs2, min.mean.ic = 0)
view_motifs(motifs2, min.mean.ic = 0.7)
try_all(motifs2, min.mean.ic = 0.7)

## --------------------------------------------------------------------------
library(universalmotif)
library(MotifDb)

motifs <- convert_motifs(MotifDb)
motifs <- filter_motifs(motifs, organism = "Athaliana")[1:100]
tree <- motif_tree(motifs, layout = "daylight", linecol = "family",
                   progress = FALSE)

## Make some changes to the tree in regular ggplot2 fashion:
# tree <- tree + ...

tree

## --------------------------------------------------------------------------
library(universalmotif)
library(MotifDb)
library(ggtree)
library(ggplot2)

motifs <- convert_motifs(MotifDb)
motifs <- filter_motifs(motifs, organism = "Athaliana")
motifs <- motifs[sample(seq_along(motifs), 25)]

## Step 1: compare motifs

comparisons <- compare_motifs(motifs, progress = FALSE)

## Step 2: create a "dist" object

# The default metric, MPCC, is a similarity metric
comparisons <- 1 - comparisons

comparisons <- as.dist(comparisons)

# We also want to extract names from the dist object to match annotations
labels <- attr(comparisons, "Labels")

## Step 3: get the comparisons ready for tree-building

# The R package "ape" provides the necessary "as.phylo" function
comparisons <- ape::as.phylo(hclust(comparisons))

## Step 4: incorporate annotation data to colour tree lines

family <- sapply(motifs, function(x) x["family"])
family.unique <- unique(family)

# We need to create a list with an entry for each family; within each entry
# are the names of the motifs belonging to that family
family.annotations <- list()
for (i in seq_along(family.unique)) {
  family.annotations <- c(family.annotations,
                          list(labels[family %in% family.unique[i]]))
}
names(family.annotations) <- family.unique

# Now add the annotation data:
comparisons <- ggtree::groupOTU(comparisons, family.annotations)

## Step 5: draw the tree

tree <- ggtree(comparisons, aes(colour = group), layout = "rectangular") +
          theme(legend.position = "bottom", legend.title = element_blank())

## Step 6: add additional annotations

# If we wish, we can additional annotations such as tip labelling and size

# Tip labels:
tree <- tree + geom_tiplab()

# Tip size:
tipsize <- data.frame(label = labels,
                      icscore = sapply(motifs, function(x) x["icscore"]))

tree <- tree %<+% tipsize + geom_tippoint(aes(size = icscore))


## ----eval=FALSE------------------------------------------------------------
#  library(universalmotif)
#  
#  ## 1. Once per session: via `options`
#  
#  options(meme.bin = "/path/to/meme/bin/meme")
#  
#  run_meme(...)
#  
#  ## 2. Once per run: via `run_meme`
#  
#  run_meme(..., bin = "/path/to/meme/bin/meme")

## ----eval=FALSE------------------------------------------------------------
#  library(universalmotif)
#  data(ArabidopsisPromoters)
#  
#  ## 1. Read sequences from disk (in fasta format):
#  
#  library(Biostrings)
#  
#  # The following `read*` functions are available in Biostrings:
#  # DNA: readDNAStringSet
#  # DNA with quality scores: readQualityScaledDNAStringSet
#  # RNA: readRNAStringSet
#  # amino acid: readAAStringSet
#  # any: readBStringSet
#  
#  sequences <- readDNAStringSet("/path/to/sequences.fasta")
#  
#  run_meme(sequences, ...)
#  
#  ## 2. Extract from a `BSgenome` object:
#  
#  library(GenomicFeatures)
#  library(TxDb.Athaliana.BioMart.plantsmart28)
#  library(BSgenome.Athaliana.TAIR.TAIR9)
#  
#  # Let us retrieve the same promoter sequences from ArabidopsisPromoters:
#  gene.names <- names(ArabidopsisPromoters)
#  
#  # First get the transcript coordinates from the relevant `TxDb` object:
#  transcripts <- transcriptsBy(TxDb.Athaliana.BioMart.plantsmart28,
#                               by = "gene")[gene.names]
#  
#  # There are multiple transcripts per gene, we only care for the first one
#  # in each:
#  
#  transcripts <- lapply(transcripts, function(x) x[1])
#  transcripts <- unlist(GRangesList(transcripts))
#  
#  # Then the actual sequences:
#  
#  # Unfortunately this is a case where the chromosome names do not match
#  # between the two databases
#  
#  seqlevels(TxDb.Athaliana.BioMart.plantsmart28)
#  #> [1] "1"  "2"  "3"  "4"  "5"  "Mt" "Pt"
#  seqlevels(BSgenome.Athaliana.TAIR.TAIR9)
#  #> [1] "Chr1" "Chr2" "Chr3" "Chr4" "Chr5" "ChrM" "ChrC"
#  
#  # So we must first rename the chromosomes in `transcripts`:
#  seqlevels(transcripts) <- seqlevels(BSgenome.Athaliana.TAIR.TAIR9)
#  
#  # Finally we can extract the sequences
#  promoters <- getPromoterSeq(transcripts,
#                              BSgenome.Athaliana.TAIR.TAIR9,
#                              upstream = 0, downstream = 1000)
#  
#  run_meme(promoters, ...)

## ----eval=FALSE------------------------------------------------------------
#  run_meme(sequences, output = "/path/to/desired/output/folder")

## ----sessionInfo, echo=FALSE-----------------------------------------------
sessionInfo()