## -----------------------------------------------------------------------------
# Attach the package, locate the "extdata/ats" directory installed with it,
# and build a corpus from that directory using n-gram tokens with n = 5.
library(textreuse)
dir <- system.file("extdata/ats", package = "textreuse")
corpus <- TextReuseCorpus(
  dir = dir,
  tokenizer = tokenize_ngrams,
  n = 5,
  progress = FALSE
)

## -----------------------------------------------------------------------------
# Compare two individual documents, looked up in the corpus by name.
jaccard_similarity(
  corpus[["remember00palm"]],
  corpus[["remembermeorholy00palm"]]
)

## ----eval=FALSE---------------------------------------------------------------
# comparisons <- pairwise_compare(corpus, jaccard_similarity, progress = FALSE)
# comparisons[1:4, 1:4]

## ---- echo=FALSE--------------------------------------------------------------
# Apply the comparison function across the corpus, then print the
# top-left 3 x 3 corner of the result rounded to three decimal places.
comparisons <- pairwise_compare(
  corpus,
  jaccard_similarity,
  progress = FALSE
)
round(comparisons[1:3, 1:3], digits = 3)

## -----------------------------------------------------------------------------
# Convert the comparison result to candidate pairs and keep only the rows
# whose score is greater than 0.1.
candidates <- pairwise_candidates(comparisons)
candidates[candidates$score > 0.1, ]

## ----eval=FALSE---------------------------------------------------------------
# vignette("minhash", package = "textreuse")