## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>", warning = FALSE
)

## -----------------------------------------------------------------------------
################################################################################
# Example: Using tidytext with textmineR
################################################################################

library(tidytext)
library(textmineR)
library(dplyr)
library(tidyr)

# load documents in a data frame
docs <- textmineR::nih_sample 

# tokenize using tidytext's unnest_tokens
tidy_docs <- docs %>% 
  select(APPLICATION_ID, ABSTRACT_TEXT) %>% 
  unnest_tokens(output = word, 
                input = ABSTRACT_TEXT,
                stopwords = c(stopwords::stopwords("en"), 
                              stopwords::stopwords(source = "smart")),
                token = "ngrams",
                n_min = 1, n = 2) %>% 
  count(APPLICATION_ID, word) %>% 
  filter(n>1) #Filtering for words/bigrams per document, rather than per corpus

tidy_docs <- tidy_docs %>% # filter words that are just numbers
  filter(! stringr::str_detect(tidy_docs$word, "^[0-9]+$"))

# turn a tidy tbl into a sparse dgCMatrix for use in textmineR
d <- tidy_docs %>% 
  cast_sparse(APPLICATION_ID, word, n)


# create a topic model
m <- FitLdaModel(dtm = d, 
                 k = 20,
                 iterations = 200,
                 burnin = 175)


# below is equivalent to tidy_beta <- tidy(x = m, matrix = "beta")
tidy_beta <- data.frame(topic = as.integer(stringr::str_replace_all(rownames(m$phi), "t_", "")), 
                        m$phi, 
                        stringsAsFactors = FALSE) %>%
  gather(term, beta, -topic) %>% 
  tibble::as_tibble()

# below is equivalent to tidy_gamma <- tidy(x = m, matrix = "gamma")
tidy_gamma <- data.frame(document = rownames(m$theta),
                         m$theta,
                         stringsAsFactors = FALSE) %>%
  gather(topic, gamma, -document) %>%
  tibble::as_tibble()