# Chunk-delimited R script (knitr-style `## ----` chunk headers).
# Lines prefixed with "# " inside `eval = FALSE` chunks are shown-only code
# and are intentionally left commented out.

## ----setup, include = FALSE---------------------------------------------------
library(qlcData)

## ----eval = FALSE-------------------------------------------------------------
# # install devtools from CRAN
# install.packages("devtools")
# # install qlcData from github using devtools
# devtools::install_github("cysouw/qlcData", build_vignettes = TRUE)
# # load qlcData package
# library(qlcData)
# # access help files of the package
# help(qlcData)
# # access this vignette
# vignette("orthography_processing")

## -----------------------------------------------------------------------------
test <- "hállo hállо"

## ----eval = FALSE-------------------------------------------------------------
# write.profile(test)

## ----echo=FALSE, results='asis'-----------------------------------------------
# some example string
knitr::kable(write.profile(test))

## -----------------------------------------------------------------------------
# the difference between various "o" characters is mostly invisible on screen

# these are the same "o" characters, so this statement is true
"o" == "o"

# this is one Latin and one Cyrillic "o" character, so this statement is false
"o" == "о"

## -----------------------------------------------------------------------------
test <- c("this thing", "is", "a", "vector", "with", "many", "strings")

## ----eval = FALSE-------------------------------------------------------------
# write.profile(test)

## ----echo=FALSE, results='asis'-----------------------------------------------
# some example string
knitr::kable(write.profile(test))

## -----------------------------------------------------------------------------
tokenize(test)

## ----eval = FALSE-------------------------------------------------------------
# dir.create("~/Desktop/tokenize")
# setwd("~/Desktop/tokenize")
# tokenize(test, file.out = "test_profile.txt")

## ----echo = FALSE, results='asis'---------------------------------------------
# In-memory stand-in for the "test_profile.txt" file used in the shown-only
# chunks: the profile returned by tokenize(test), with two extra rows
# ("th" and "ng") appended; the second column of those rows is left empty.
test_profile.txt <- as.data.frame(
  rbind(
    as.matrix(tokenize(test)$profile),
    c("th", ""),
    c("ng", "")
  )
)
knitr::kable(test_profile.txt)

## ----eval = FALSE-------------------------------------------------------------
# tokenize(test, profile = "test_profile.txt")
#
# # with overwriting of the existing profile:
# # tokenize(test, profile = "test_profile.txt", file.out = "test_profile.txt")
#
# # note that you can abbreviate this in R:
# # tokenize_old(test, p = "test_profile.txt", f = "test_profile.txt")

## ----echo = FALSE-------------------------------------------------------------
# The data frame built above is passed as the profile instead of a file path.
tokenize(test, profile = test_profile.txt)

## ----eval = FALSE-------------------------------------------------------------
# tokenize(c("think", "thin", "both"), profile = "test_profile.txt")

## ----echo = FALSE-------------------------------------------------------------
tokenize(c("think", "thin", "both"), profile = test_profile.txt)

## ----echo = FALSE, results='asis'---------------------------------------------
# Profile given as a character matrix with columns Grapheme, Right and IPA.
# The second "c" row carries the pattern "[ie]" in its Right column.
Grapheme <- c("c", "c", "n", "s", "a", "i")
IPA <- c("k", "tʃ", "n", "s", "a", "i")
Right <- c("", "[ie]", "", "", "", "")
italian <- cbind(Grapheme, Right, IPA)
knitr::kable(italian)

## -----------------------------------------------------------------------------
tokenize(
  c("casa", "cina"),
  profile = italian,
  transliterate = "IPA",
  regex = TRUE
)$strings

## ----echo = FALSE, results='asis'---------------------------------------------
# Same profile with an added "e" row and a Class column: "i" and "e" are
# labelled "frontV", and the second "c" row names "frontV" in its Right column
# in place of the literal "[ie]" pattern used above.
Grapheme <- c("c", "c", "n", "s", "a", "i", "e")
IPA <- c("k", "tʃ", "n", "s", "a", "i", "e")
Right <- c("", "frontV", "", "", "", "", "")
Class <- c("", "", "", "", "", "frontV", "frontV")
italian <- cbind(Grapheme, Right, Class, IPA)
knitr::kable(italian)

## -----------------------------------------------------------------------------
tokenize(
  c("casa", "cina"),
  profile = italian,
  transliterate = "IPA",
  regex = TRUE
)$strings