# R code extracted from the polmineR introductory vignette. Each
# `## ----` line is a chunk header carried over from the source document;
# chunks marked `eval = FALSE` are kept as commented-out code.

## ---- include = FALSE---------------------------------------------------------
# Blank out CORPUS_REGISTRY for this session before polmineR is loaded.
Sys.setenv(CORPUS_REGISTRY = "")

## ----loading_polmineR---------------------------------------------------------
library(polmineR)

## ----get_registry-------------------------------------------------------------
registry()

## ---- eval = FALSE------------------------------------------------------------
# Sys.getenv("CORPUS_REGISTRY")

## ----use_polmineR_data, message = FALSE, eval = TRUE--------------------------
use("polmineR")
use("RcppCWB", corpus = "REUTERS")

## ---- eval = TRUE, message = FALSE--------------------------------------------
corpus()

## ---- eval = FALSE, message = FALSE, results = 'hide'-------------------------
# options()[grep("polmineR", names(options()))]

## -----------------------------------------------------------------------------
options("polmineR.left" = 5)
options("polmineR.right" = 5)
options("polmineR.mc" = FALSE)

## ---- echo = FALSE, message = FALSE-------------------------------------------
options("polmineR.pagelength" = 3L)

## ---- eval = TRUE, render = knit_print----------------------------------------
k <- kwic("REUTERS", "oil")

## ---- eval = TRUE, render = knit_print----------------------------------------
k <- kwic("REUTERS", "oil", s_attributes = "places")

## ---- eval = TRUE, render = knit_print----------------------------------------
k <- kwic("REUTERS", "oil", s_attributes = c("id", "places"))

## ---- eval = TRUE, render = knit_print----------------------------------------
k <- kwic("REUTERS", '"oil" "price.*"')

## ---- eval = TRUE-------------------------------------------------------------
cnt <- count("REUTERS", "Kuwait")
cnt <- count("REUTERS", c("Kuwait", "USA", "Bahrain"))
cnt <- count(
  "REUTERS",
  c('"United" "States"', '"Saudi" "Arabia.*"'),
  cqp = TRUE
)

## ---- eval = TRUE, message = FALSE--------------------------------------------
oil <- dispersion(
  "REUTERS",
  query = "oil",
  s_attribute = "id",
  progress = FALSE
)

## -----------------------------------------------------------------------------
saudi_arabia <- dispersion(
  "REUTERS",
  query = '"Saudi" "Arabia.*"',
  s_attribute = "id",
  cqp = TRUE,
  progress = FALSE
)

## ---- eval = TRUE-------------------------------------------------------------
barplot(
  height = saudi_arabia[["count"]],
  names.arg = saudi_arabia[["id"]],
  las = 2
)

## ---- eval = TRUE, message = FALSE--------------------------------------------
oil <- cooccurrences("REUTERS", query = "oil")
sa <- cooccurrences(
  "REUTERS",
  query = '"Saudi" "Arabia.*"',
  left = 10,
  right = 10
)
top5 <- subset(oil, rank_ll <= 5)

## ---- eval = rmarkdown::pandoc_available(), render = knit_print---------------
top5

## ---- eval = TRUE-------------------------------------------------------------
as.data.frame(top5)

## ---- eval = TRUE, message = FALSE, results = 'hide'--------------------------
kuwait <- partition("REUTERS", places = "kuwait", regex = TRUE)

## ---- eval = TRUE-------------------------------------------------------------
kuwait

## ---- eval = TRUE, message = FALSE--------------------------------------------
saudi_arabia <- partition("REUTERS", places = "saudi-arabia", regex = TRUE)
s_attributes(saudi_arabia, "id")

## ---- eval = TRUE, message = FALSE--------------------------------------------
saudi_arabia <- partition("REUTERS", places = "saudi-arabia", regex = TRUE)
oil <- cooccurrences(
  saudi_arabia,
  "oil",
  p_attribute = "word",
  left = 10,
  right = 10
)

## ---- eval = TRUE-------------------------------------------------------------
df <- as.data.frame(oil)
df[1:5, c("word", "ll", "rank_ll")]

## ---- eval = TRUE-------------------------------------------------------------
q1 <- dispersion(
  saudi_arabia,
  query = "oil",
  s_attribute = "id",
  progress = FALSE
)
q2 <- dispersion(
  saudi_arabia,
  query = c("oil", "barrel"),
  s_attribute = "id",
  progress = FALSE
)

## ---- eval = TRUE, message = FALSE, render = knit_print-----------------------
saudi_arabia <- partition("REUTERS", places = "saudi-arabia", regex = TRUE)
saudi_arabia <- enrich(saudi_arabia, p_attribute = "word")

saudi_arabia_features <- features(saudi_arabia, "REUTERS", included = TRUE)
# 10.83 is the critical chi-squared value for p = 0.001 at one degree of
# freedom, so it is applied to the `chisquare` statistic itself; it is not a
# meaningful cut-off for the integer rank column `rank_chisquare`.
saudi_arabia_features_min <- subset(
  saudi_arabia_features,
  chisquare >= 10.83 & count_coi >= 5
)
saudi_arabia_features_min

## ---- eval = TRUE, message = FALSE--------------------------------------------
df <- as.data.frame(saudi_arabia_features_min)
df_min <- df[, c("word", "count_coi", "count_ref", "chisquare")]

## ---- eval = TRUE-------------------------------------------------------------
articles <- corpus("REUTERS") %>%
  partition_bundle(s_attribute = "id", progress = FALSE)
articles_count <- count(articles, p_attribute = "word")
tdm <- as.TermDocumentMatrix(articles_count, col = "count", verbose = FALSE)

class(tdm) # to see what it is
show(tdm)
m <- as.matrix(tdm) # turn it into an ordinary matrix
m[c("oil", "barrel"), ]

## ---- eval = TRUE, message = FALSE--------------------------------------------
P <- partition("REUTERS", id = "248")
H <- html(P, height = "250px")
H

## ---- eval = FALSE------------------------------------------------------------
# Sys.setenv(CORPUS_REGISTRY = "C:/PATH/TO/YOUR/REGISTRY")