## ----setup, include = FALSE---------------------------------------------------
# Chunk defaults for the whole document: collapse source and output into one
# block and prefix printed output with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

## ----gh-installation, eval = FALSE--------------------------------------------
#  install.packages("devtools")
#  devtools::install_github("ArthurSpirling/stylest2")

## ----echo=TRUE----------------------------------------------------------------
library(stylest2)
library(quanteda)

## ----echo=TRUE----------------------------------------------------------------
# Load the `novels` example dataset into the workspace.
data("novels")

## ----echo=FALSE---------------------------------------------------------------
# Render rows 1, 4 and 8 as a table to give a feel for the data.
knitr::kable(novels[c(1, 4, 8), ])

## ----echo=TRUE----------------------------------------------------------------
# Tokenize the raw texts with the tokenizer defaults, then build the
# document-feature matrix from those tokens.
novels_tok <- tokens(novels$text)
novels_dfm <- dfm(novels_tok)

# Print the distinct authors, then record each text's author as the
# "author" document variable of the dfm.
unique(novels$author)
docvars(novels_dfm, "author") <- novels$author


## ---- echo=TRUE---------------------------------------------------------------

# Re-tokenize with explicit preprocessing: drop punctuation, symbols, numbers
# and separators, and split hyphenated words into their parts.
# `TRUE` is spelled out in full because `T` is an ordinary variable that can
# be reassigned, which would silently flip every option below.
novels_tok <- tokens(
  novels$text,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_separators = TRUE,
  split_hyphens = TRUE
)
novels_dfm <- dfm(novels_tok)

# Print the distinct authors, then record each text's author as the
# "author" document variable of the rebuilt dfm.
unique(novels$author)
docvars(novels_dfm, "author") <- novels$author


## ----echo = TRUE--------------------------------------------------------------
# Fix the RNG state before vocabulary selection (presumably so the fold
# assignment is reproducible -- confirm against the package documentation).
set.seed(1234)

## ----echo=TRUE----------------------------------------------------------------
# Vocabulary selection with every tuning argument left at its default.
vocab_with_defaults <- stylest2_select_vocab(dfm = novels_dfm)

## ----echo=TRUE----------------------------------------------------------------
# Same selection with explicit smoothing, fold count and candidate cutoffs.
vocab_custom <- stylest2_select_vocab(
  dfm = novels_dfm,
  smoothing = 1,
  nfold = 10,
  cutoffs = c(50, 75, 99)
)

## ----echo=TRUE----------------------------------------------------------------
# Cutoff percentile that gave the best prediction rate
vocab_with_defaults$cutoff_pct_best

# Rate at which the speakers of held-out texts were predicted INCORRECTLY
# (presumably broken down by fold and candidate cutoff -- TODO confirm)
vocab_with_defaults$cv_missrate_results

# Settings the selection was run with:

# Candidate cutoff percentiles that were tested
vocab_with_defaults$cutoff_candidates

# Number of folds used
vocab_with_defaults$nfold

## ----echo=TRUE----------------------------------------------------------------
# Select the terms to model, using a cutoff of 90.
terms_90 <- stylest2_terms(
  dfm = novels_dfm,
  cutoff = 90
)

## ----echo=TRUE----------------------------------------------------------------
# Fit the model on the dfm, restricted to the selected terms.
mod <- stylest2_fit(
  dfm = novels_dfm,
  terms = terms_90
)

## ----echo = TRUE--------------------------------------------------------------
# Example per-term weights, built as a named numeric vector in one step
# (name = term, value = weight).
term_weights <- c(
  the = 0.1,
  and = 0.2,
  Floccinaucinihilipilification = 0.001
)

term_weights

## ----echo = TRUE--------------------------------------------------------------
# Refit the model on the same terms, this time supplying the term weights.
mod <- stylest2_fit(
  dfm = novels_dfm,
  terms = terms_90,
  term_weights = term_weights
)

## ----echo = TRUE--------------------------------------------------------------
# Basic prediction: run the fitted model over every document in the dfm.
predictions <- stylest2_predict(
  dfm = novels_dfm,
  model = mod
)

## ----echo = TRUE--------------------------------------------------------------
# Predict again, additionally requesting speaker odds and term influence;
# this overwrites the result of the basic call above.
predictions <- stylest2_predict(
  dfm = novels_dfm,
  model = mod,
  speaker_odds = TRUE,
  term_influence = TRUE
)

## ----echo = TRUE--------------------------------------------------------------
# Text of document 14 (an excerpt from Pride and Prejudice)
novels$text[14]

# `log_odds_mean` entry of the speaker odds for document 14
predictions$speaker_odds$log_odds_mean[14]

# Matching `log_odds_se` entry (presumably the standard error -- confirm)
predictions$speaker_odds$log_odds_se[14]

## ----echo = TRUE--------------------------------------------------------------
# A new passage to classify with the fitted model. The literal keeps its line
# breaks and indentation; the tokenizer splits on that whitespace.
na_text <- "No one who had ever seen Catherine Morland in her infancy would have supposed 
            her born to be an heroine. Her situation in life, the character of her father 
            and mother, her own person and disposition, were all equally against her. Her 
            father was a clergyman, without being neglected, or poor, and a very respectable 
            man, though his name was Richard—and he had never been handsome. He had a 
            considerable independence besides two good livings—and he was not in the least 
            addicted to locking up his daughters."

# Tokenize with the same preprocessing options that were used to build
# `novels_dfm`, so the new text's features are derived the same way as the
# features the model was fitted on.
na_text_tok <- tokens(
  na_text,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_separators = TRUE,
  split_hyphens = TRUE
)
na_text_dfm <- dfm(na_text_tok)

# Score the single-document dfm against the fitted model.
pred <- stylest2_predict(dfm = na_text_dfm, model = mod)

## ----echo = TRUE--------------------------------------------------------------
# Predicted label for the new passage.
pred$posterior$predicted

# `log_probs` entry of the posterior (presumably one log probability per
# candidate author -- confirm).
pred$posterior$log_probs

## ----echo = FALSE-------------------------------------------------------------
# Features sorted from highest to lowest mean influence; show the first few.
head(
  predictions$term_influence$features[
    order(predictions$term_influence$mean_influence, decreasing = TRUE)
  ]
)

## ----echo = FALSE-------------------------------------------------------------
# Same ordering; show the last few (lowest mean influence).
tail(
  predictions$term_influence$features[
    order(predictions$term_influence$mean_influence, decreasing = TRUE)
  ]
)