## ----setup, echo=FALSE----------------------------------------------------- knitr::opts_chunk$set(collapse=TRUE, comment = "#>") suppressPackageStartupMessages(library(universalmotif)) suppressPackageStartupMessages(library(Biostrings)) suppressMessages(suppressPackageStartupMessages(library(MotifDb))) suppressMessages(suppressPackageStartupMessages(library(ggtree))) data(ArabidopsisPromoters) data(ArabidopsisMotif) ## -------------------------------------------------------------------------- library(universalmotif) motif <- create_motif("CWWWWCC", nsites = 6) sequences <- DNAStringSet(rep(c("CAAAACC", "CTTTTCC"), 3)) motif.k2 <- add_multifreq(motif, sequences, add.k = 2) ## Alternatively: # motif.k2 <- create_motif(sequences, add.multifreq = 2) motif.k2 ## -------------------------------------------------------------------------- library(universalmotif) data(ArabidopsisMotif) data(ArabidopsisPromoters) enrich_motifs(ArabidopsisMotif, ArabidopsisPromoters, shuffle.k = 3, threshold = 0.001, verbose = 0, progress = FALSE, RC = TRUE) ## -------------------------------------------------------------------------- library(universalmotif) data(ArabidopsisMotif) data(ArabidopsisPromoters) enrich_motifs(ArabidopsisMotif, ArabidopsisPromoters, shuffle.k = 3, search.mode = "positional", verbose = 0, progress = FALSE, RC = TRUE) ## -------------------------------------------------------------------------- library(universalmotif) m <- matrix(c(0.10,0.27,0.23,0.19,0.29,0.28,0.51,0.12,0.34,0.26, 0.36,0.29,0.51,0.38,0.23,0.16,0.17,0.21,0.23,0.36, 0.45,0.05,0.02,0.13,0.27,0.38,0.26,0.38,0.12,0.31, 0.09,0.40,0.24,0.30,0.21,0.19,0.05,0.30,0.31,0.08), byrow = TRUE, nrow = 4) motif <- create_motif(m, alphabet = "DNA", type = "PWM") motif ## -------------------------------------------------------------------------- data(ArabidopsisPromoters) res <- scan_sequences(motif, ArabidopsisPromoters, verbose = 0, progress = FALSE, threshold = 0.8, threshold.type = "logodds") head(res) ## -------------------------------------------------------------------------- ## The second match was CTCTTTATTC, with a score of 3.896 (max possible = 6.403) library(Biostrings) bkg <- colMeans(oligonucleotideFrequency(ArabidopsisPromoters, 1, as.prob = TRUE)) bkg ## -------------------------------------------------------------------------- hit.prob <- bkg["A"]^1 * bkg["C"]^3 * bkg["G"]^0 * bkg["T"]^6 hit.prob <- unname(hit.prob) hit.prob ## -------------------------------------------------------------------------- res <- res[1:6, ] pvals <- motif_pvalue(motif, res$score, progress = FALSE, bkg.probs = bkg) res2 <- data.frame(motif=res$motif,match=res$match,pval=pvals)[order(pvals), ] knitr::kable(res2, digits = 22, row.names = FALSE, format = "markdown") ## -------------------------------------------------------------------------- scores <- c(-6, -3, 0, 3, 6) k <- c(2, 4, 6, 8, 10) out <- data.frame(k = c(2, 4, 6, 8, 10), score.minus6 = rep(0, 5), score.minus3 = rep(0, 5), score.0 = rep(0, 5), score.3 = rep(0, 5), score.6 = rep(0, 5)) for (i in seq_along(scores)) { for (j in seq_along(k)) { out[j, i + 1] <- motif_pvalue(motif, scores[i], k = k[j], progress = FALSE, bkg.probs = bkg) } } knitr::kable(out, format = "markdown", digits = 10) ## -------------------------------------------------------------------------- bkg <- c(A=0.25, C=0.25, G=0.25, T=0.25) pvals <- c(0.1, 0.01, 0.001, 0.0001, 0.00001) scores <- motif_pvalue(motif, pvalue = pvals, progress = FALSE, bkg.probs = bkg, k = 10) scores.approx6 <- motif_pvalue(motif, pvalue = pvals, progress = FALSE, bkg.probs = bkg, k = 6) scores.approx8 <- motif_pvalue(motif, pvalue = pvals, progress = FALSE, bkg.probs = bkg, k = 8) pvals.exact <- motif_pvalue(motif, score = scores, progress = FALSE, bkg.probs = bkg, k = 10) pvals.approx6 <- motif_pvalue(motif, score = scores, progress = FALSE, bkg.probs = bkg, k = 6) pvals.approx8 <- motif_pvalue(motif, score = scores, progress = FALSE, bkg.probs = bkg, k = 8) res <- data.frame(pvalue = pvals, score = scores, pvalue.exact = pvals.exact, pvalue.k6 = pvals.approx6, pvalue.k8 = pvals.approx8, score.k6 = scores.approx6, score.k8 = scores.approx8) knitr::kable(res, format = "markdown", digits = 22) ## -------------------------------------------------------------------------- bkg <- c(A=0.25, C=0.25, G=0.25, T=0.25) scores <- -2:6 pvals <- motif_pvalue(motif, score = scores, progress = FALSE, bkg.probs = bkg, k = 10) scores.exact <- motif_pvalue(motif, pvalue = pvals, progress = FALSE, bkg.probs = bkg, k = 10) scores.approx6 <- motif_pvalue(motif, pvalue = pvals, progress = FALSE, bkg.probs = bkg, k = 6) scores.approx8 <- motif_pvalue(motif, pvalue = pvals, progress = FALSE, bkg.probs = bkg, k = 8) pvals.approx6 <- motif_pvalue(motif, score = scores, progress = FALSE, bkg.probs = bkg, k = 6) pvals.approx8 <- motif_pvalue(motif, score = scores, progress = FALSE, bkg.probs = bkg, k = 8) res <- data.frame(score = scores, pvalue = pvals, pvalue.k6 = pvals.approx6, pvalue.k8 = pvals.approx8, score.exact = scores.exact, score.k6 = scores.approx6, score.k8 = scores.approx8) knitr::kable(res, format = "markdown", digits = 22) ## -------------------------------------------------------------------------- library(universalmotif) library(MotifDb) motifs <- convert_motifs(MotifDb) motifs <- filter_motifs(motifs, altname = c("M0003_1.02", "M0004_1.02")) summarise_motifs(motifs) try_all <- function(motifs, ...) { scores <- numeric(8) methods <- c("MPCC", "PCC", "MEUCL", "EUCL", "MSW", "SW", "MKL", "KL") for (i in 1:8) { scores[i] <- compare_motifs(motifs, method = methods[i], progress = FALSE, ...)[1, 2] } res <- data.frame(method = c("MPCC", "PCC", "MEUCL", "EUCL", "MSW", "SW", "MKL", "KL"), score = scores, max = c(1, NA, sqrt(2), NA, 2, NA, NA, NA), type = c("similarity", "similarity", "distance", "distance", "similarity", "similarity", "distance", "distance")) knitr::kable(res, format = "markdown") } try_all(motifs) ## -------------------------------------------------------------------------- try_all(motifs, tryRC = FALSE) ## -------------------------------------------------------------------------- try_all(motifs, use.type = "ICM") ## -------------------------------------------------------------------------- try_all(motifs, normalise.scores = TRUE) ## -------------------------------------------------------------------------- view_motifs(motifs) ## -------------------------------------------------------------------------- view_motifs(motifs, min.overlap = 99) try_all(motifs, min.overlap = 99) try_all(motifs, min.overlap = 99, normalise.scores = TRUE) ## -------------------------------------------------------------------------- try_all(motifs, min.overlap = 1) view_motifs(motifs, min.overlap = 1) ## -------------------------------------------------------------------------- motifs2 <- filter_motifs(MotifDb, altname = c("M0100_1.02", "M0104_1.02")) view_motifs(motifs2, min.mean.ic = 0) try_all(motifs2, min.mean.ic = 0) view_motifs(motifs2, min.mean.ic = 0.7) try_all(motifs2, min.mean.ic = 0.7) ## -------------------------------------------------------------------------- library(universalmotif) library(MotifDb) motifs <- convert_motifs(MotifDb) motifs <- filter_motifs(motifs, organism = "Athaliana")[1:100] tree <- motif_tree(motifs, layout = "daylight", linecol = "family", progress = FALSE) ## Make some changes to the tree in regular ggplot2 fashion: # tree <- tree + ... tree ## -------------------------------------------------------------------------- library(universalmotif) library(MotifDb) library(ggtree) library(ggplot2) motifs <- convert_motifs(MotifDb) motifs <- filter_motifs(motifs, organism = "Athaliana") motifs <- motifs[sample(seq_along(motifs), 25)] ## Step 1: compare motifs comparisons <- compare_motifs(motifs, progress = FALSE) ## Step 2: create a "dist" object # The default metric, MPCC, is a similarity metric comparisons <- 1 - comparisons comparisons <- as.dist(comparisons) # We also want to extract names from the dist object to match annotations labels <- attr(comparisons, "Labels") ## Step 3: get the comparisons ready for tree-building # The R package "ape" provides the necessary "as.phylo" function comparisons <- ape::as.phylo(hclust(comparisons)) ## Step 4: incorporate annotation data to colour tree lines family <- sapply(motifs, function(x) x["family"]) family.unique <- unique(family) # We need to create a list with an entry for each family; within each entry # are the names of the motifs belonging to that family family.annotations <- list() for (i in seq_along(family.unique)) { family.annotations <- c(family.annotations, list(labels[family %in% family.unique[i]])) } names(family.annotations) <- family.unique # Now add the annotation data: comparisons <- ggtree::groupOTU(comparisons, family.annotations) ## Step 5: draw the tree tree <- ggtree(comparisons, aes(colour = group), layout = "rectangular") + theme(legend.position = "bottom", legend.title = element_blank()) ## Step 6: add additional annotations # If we wish, we can additional annotations such as tip labelling and size # Tip labels: tree <- tree + geom_tiplab() # Tip size: tipsize <- data.frame(label = labels, icscore = sapply(motifs, function(x) x["icscore"])) tree <- tree %<+% tipsize + geom_tippoint(aes(size = icscore)) ## ----eval=FALSE------------------------------------------------------------ # library(universalmotif) # # ## 1. Once per session: via `options` # # options(meme.bin = "/path/to/meme/bin/meme") # # run_meme(...) # # ## 2. Once per run: via `run_meme` # # run_meme(..., bin = "/path/to/meme/bin/meme") ## ----eval=FALSE------------------------------------------------------------ # library(universalmotif) # data(ArabidopsisPromoters) # # ## 1. Read sequences from disk (in fasta format): # # library(Biostrings) # # # The following `read*` functions are available in Biostrings: # # DNA: readDNAStringSet # # DNA with quality scores: readQualityScaledDNAStringSet # # RNA: readRNAStringSet # # amino acid: readAAStringSet # # any: readBStringSet # # sequences <- readDNAStringSet("/path/to/sequences.fasta") # # run_meme(sequences, ...) # # ## 2. Extract from a `BSgenome` object: # # library(GenomicFeatures) # library(TxDb.Athaliana.BioMart.plantsmart28) # library(BSgenome.Athaliana.TAIR.TAIR9) # # # Let us retrieve the same promoter sequences from ArabidopsisPromoters: # gene.names <- names(ArabidopsisPromoters) # # # First get the transcript coordinates from the relevant `TxDb` object: # transcripts <- transcriptsBy(TxDb.Athaliana.BioMart.plantsmart28, # by = "gene")[gene.names] # # # There are multiple transcripts per gene, we only care for the first one # # in each: # # transcripts <- lapply(transcripts, function(x) x[1]) # transcripts <- unlist(GRangesList(transcripts)) # # # Then the actual sequences: # # # Unfortunately this is a case where the chromosome names do not match # # between the two databases # # seqlevels(TxDb.Athaliana.BioMart.plantsmart28) # #> [1] "1" "2" "3" "4" "5" "Mt" "Pt" # seqlevels(BSgenome.Athaliana.TAIR.TAIR9) # #> [1] "Chr1" "Chr2" "Chr3" "Chr4" "Chr5" "ChrM" "ChrC" # # # So we must first rename the chromosomes in `transcripts`: # seqlevels(transcripts) <- seqlevels(BSgenome.Athaliana.TAIR.TAIR9) # # # Finally we can extract the sequences # promoters <- getPromoterSeq(transcripts, # BSgenome.Athaliana.TAIR.TAIR9, # upstream = 0, downstream = 1000) # # run_meme(promoters, ...) ## ----eval=FALSE------------------------------------------------------------ # run_meme(sequences, output = "/path/to/desired/output/folder") ## ----sessionInfo, echo=FALSE----------------------------------------------- sessionInfo()