## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)
library(ggplot2)

## ----setup--------------------------------------------------------------------
library(stressor)

## -----------------------------------------------------------------------------
set.seed(43421)
lm_data <- data_gen_lm(500, weight_vec = rep(1, 5), y_int = 0, resp_sd = 1)
head(lm_data)

## ----echo = FALSE, warning=FALSE, fig.align='center', fig.height=5, fig.width = 5----
# Data Verification
simulation <- function(n, eps, weight_mat, label, test_size = 100, 
                       seed = 43421) {
  pred_accuracy <- matrix(0, nrow = length(eps), ncol = length(n))
  conv_mat <- matrix(0, nrow = length(eps), ncol = length(n))
  set.seed(seed)
  for (i in seq_len(ncol(pred_accuracy))) {
    for (j in seq_len(nrow(pred_accuracy))) {
      temp <- data_gen_lm(n[i] + test_size, weight_mat = weight_mat, 
                          resp_sd = eps[j])
      test <- sample(nrow(temp), test_size)
      data <- temp[-test,]
      test <- temp[test, ]
      test_y <- test[, 1]
      test <- test[, -1]
      obj <- lm(Y ~ ., data = data)
      pred <- predict(obj, test)
      pred_accuracy[j, i] <- sqrt(sum((test_y - pred)^2) / test_size)
    }
  }
  pred_accuracy <- as.data.frame(pred_accuracy)
  colnames(pred_accuracy) <- label
  sim_list <- list(pred_accuracy, conv_mat)
  return(sim_list)
}
eps <- seq(from = .1, to = 1, by = .1)
n <- c(100, 300, 500, 700, 900, 1000) # add in 5000
lab <- c("n = 100", "n = 300", "n = 500", "n = 700",
         "n = 900", "n = 1000")
weight_vec <- c(1, 3, 4, 5, 7)
lm_accuracy_res <- simulation(n, eps, weight_vec, lab)
lm_accuracy <- lm_accuracy_res[[1]]

eps2 <- rep(seq(.1, 1, .1), 6)
lm_results <- as.data.frame(eps2)
rmse <- c(lm_accuracy$`n = 100`, lm_accuracy$`n = 300`,
          lm_accuracy$`n = 500`, lm_accuracy$`n = 700`,
          lm_accuracy$`n = 900`, lm_accuracy$`n = 1000`)
lm_results$rmse <- rmse
lm_results$groups <- c(rep("n = 100", 10), rep("n = 300", 10),
                       rep("n = 500", 10), rep("n = 700", 10),
                       rep("n = 900", 10), rep("n = 1000", 10))
lm_results$groups <- factor(lm_results$groups, levels = lab)

ggplot(lm_results, aes(x = eps2, y = rmse)) +
  geom_point() +
  geom_line(aes(x = eps2, y = eps2), color = "red") +
  scale_x_continuous(name = "eps", breaks = seq(0, 1, by = .2), 
                     limits = c(0.0, 1.05)) +
  scale_y_continuous(breaks = seq(0, 1.2, by = .2), limits = c(0, 1.2)) +
  facet_wrap(~ groups, nrow = 2) +
  ggtitle("Linear Model Validation") +
  theme(axis.title=element_text(size=14,face="bold"),
        axis.text.x = element_text(angle = 45))

## ----eval=FALSE---------------------------------------------------------------
#  create_virtualenv()

## ----eval = FALSE-------------------------------------------------------------
#  set.seed(43421)
#  lm_data <- data_gen_lm(1000)
#  # Split the data into a 80/20 split
#  indices <- split_data_prob(lm_data, .8)
#  train <- lm_data[indices, ]
#  test <- lm_data[!indices, ]
#  # Tune the models
#  mlm_lm <- mlm_regressor(Y ~ ., lm_data, sort_v = 'RMSE', seed = 43421)

## ----echo=FALSE---------------------------------------------------------------
pred <- readRDS(file = "pred_lm.rds")
cv <- readRDS(file = "mlm_lm_cv.rds")
mlm_vignette <- list(pred_accuracy = pred, mlm_lm_cv = cv)
mlm_score <- as.data.frame(readRDS("mlm_test.rds"))
top_RMSE <- min(mlm_score$rmse)
name_RMSE <- mlm_vignette$pred_accuracy$Model[which(mlm_score$rmse == top_RMSE)]

## ----eval = FALSE-------------------------------------------------------------
#  mlm_lm$pred_accuracy

## ----echo = FALSE-------------------------------------------------------------
mlm_vignette$pred_accuracy

## ----eval = FALSE-------------------------------------------------------------
#  pred_lm <- predict(mlm_lm, test)
#  score(test$Y, pred_lm)

## ----echo = FALSE-------------------------------------------------------------
mlm_score

## -----------------------------------------------------------------------------
test_index <- split_data_prob(lm_data, .2)
test <- lm_data[test_index, ]
train <- lm_data[!test_index, ]
lm_test <- lm(Y ~ ., train)
lm_pred <- predict(lm_test, newdata = test)
lm_score <- score(test$Y, lm_pred)
lm_score

## ----eval = FALSE-------------------------------------------------------------
#  mlm_cv <- cv(mlm_lm, lm_data, n_folds = 10)

## -----------------------------------------------------------------------------
lm_cv <- cv(lm_test, lm_data, n_folds = 10)

## ----echo = FALSE-------------------------------------------------------------
mlm_cv <- mlm_vignette$mlm_lm_cv

## -----------------------------------------------------------------------------
score(lm_data$Y, mlm_cv)
score(lm_data$Y, lm_cv)

## ----eval=FALSE---------------------------------------------------------------
#  data(boston)
#  mlm_boston <- mlm_regressor(cmedv ~ ., boston)
#  mlm_boston$pred_accuracy

## ----echo=FALSE---------------------------------------------------------------
data(boston)
mlm_boston_pred_accuracy <- readRDS(file = "pred.rds")
mlm_boston_pred_accuracy

## ----eval=FALSE---------------------------------------------------------------
#  mlm_boston_cv <- cv(mlm_boston, boston, n_folds = 10)
#  mlm_boston_score <- score(boston$cmedv, mlm_boston_cv)
#  mlm_boston_score

## ----echo=FALSE---------------------------------------------------------------
cv_data <- readRDS(file = "cv.rds")
mlm_boston_score <- score(boston$cmedv, cv_data)
mlm_boston_score

## ----eval=FALSE---------------------------------------------------------------
#  mlm_boston_clust_cv <- cv(mlm_boston, boston, n_folds = 10, k_mult = 5)
#  mlm_boston_clust_score <- score(boston$cmedv, mlm_boston_clust_cv)
#  mlm_boston_clust_score

## ----echo=FALSE---------------------------------------------------------------
clus_data <- readRDS(file = "cluster.rds")
boston_clust_score <- score(boston$cmedv, clus_data)
boston_clust_score