## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( echo = TRUE, #eval = FALSE, #collapse = TRUE, comment = "#>" ) ## ----loading data------------------------------------------------------------- library(lionfish) library(data.table) data(ChemicalManufacturingProcess) data <- data.table(ChemicalManufacturingProcess) col_names <- names(data) mp_cols <- grep("^ManufacturingProcess", col_names, value = TRUE) bm_cols <- grep("^BiologicalMaterial", col_names, value = TRUE) new_mp_names <- sub("^ManufacturingProcess", "ManPr", mp_cols) new_bm_names <- sub("^BiologicalMaterial", "BioMat", bm_cols) setnames(data, old = mp_cols, new = new_mp_names) setnames(data, old = bm_cols, new = new_bm_names) ## ----removing incomplete entries---------------------------------------------- data <- data[rowSums(is.na(data))<3] data <- data[, lapply(.SD, function(x) if (sum(is.na(x)) <= 3) x else NULL)] data <- data[-c(99,153,154,158,159,160)] data <- data[, lapply(.SD, function(x) if (sum(is.na(x)) <= 0) x else NULL)] ## ----------------------------------------------------------------------------- min_yield <- min(data$Yield, na.rm = TRUE) max_yield <- max(data$Yield, na.rm = TRUE) intervals <- seq(min_yield, max_yield, length.out = 11) yield_values <- findInterval(data$Yield, intervals, rightmost.closed = TRUE) yield_labels <- paste0("Yield ", head(intervals, -1), "-", tail(intervals, -1)) ## ----eval = FALSE------------------------------------------------------------- # library(ggplot2) # library(viridis) # # yield_values_factor <- factor(yield_values, labels = yield_labels) # # ggplot(data, aes(x = Yield, fill = yield_values_factor)) + # geom_histogram(binwidth = (max_yield - min_yield) / 30, color = "black") + # scale_fill_viridis(option = "plasma", discrete = TRUE) + # theme_minimal() + # labs(title = "Histogram of the process yield", # x = "Yield", # y = "Count", # fill = "Yield Intervals") ## ----------------------------------------------------------------------------- data <- data[, lapply(.SD, function(x) (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE))] data_wo_y <- copy(data) data_wo_y[, Yield := NULL] ## ----------------------------------------------------------------------------- set.seed(42) library(tourr) display_size <- 5 # Adjust to fit your display ## ----------------------------------------------------------------------------- if (check_venv()){ init_env(env_name = "r-lionfish", virtual_env = "virtual_env") } else if (check_conda_env()){ init_env(env_name = "r-lionfish", virtual_env = "anaconda") } ## ----results="hide", message=FALSE-------------------------------------------- lda_tour_history_1d <- save_history(data_wo_y, tour_path = guided_tour(lda_pp(yield_values),d=1)) lda_tour_history_2d <- save_history(data_wo_y, tour_path = guided_tour(lda_pp(yield_values),d=2)) half_range <- max(sqrt(rowSums(data_wo_y^2))) feature_names <- colnames(data_wo_y) obj1 <- list(type="1d_tour", obj=lda_tour_history_1d) obj2 <- list(type="2d_tour", obj=lda_tour_history_2d) ## ----------------------------------------------------------------------------- if (interactive()){ interactive_tour(data=data_wo_y, plot_objects=list(obj2), feature_names=feature_names, half_range=half_range/2, n_plot_cols=2, preselection=yield_values, preselection_names=yield_labels, display_size=8, color_scale="plasma") } ## ----results = 'hide', warning=FALSE, message=FALSE--------------------------- if (requireNamespace("randomForest")) { set.seed(42) library(randomForest) } ## ----------------------------------------------------------------------------- if (requireNamespace("randomForest")) { rf <- randomForest(Yield~.,data) importance_df <- data.frame(rf$importance) importance_df <- as.data.table(rf$importance, keep.rownames = "Feature") sorted_importance <- importance_df[order(-IncNodePurity)] top_10_features <- sorted_importance[1:10, Feature] print(top_10_features) } ## ----results = 'hide', warning=FALSE, message=FALSE--------------------------- if (requireNamespace("randomForest")) { data_rf_sel <- data[, ..top_10_features] grand_tour_history_1d <- save_history(data_rf_sel, tour_path = guided_tour(lda_pp(yield_values),d=1)) lda_tour_history_2d <- save_history(data_rf_sel, tour_path = guided_tour(lda_pp(yield_values),d=2)) half_range <- max(sqrt(rowSums(data_rf_sel^2))) feature_names <- colnames(data_rf_sel) obj1 <- list(type="1d_tour", obj=grand_tour_history_1d) obj2 <- list(type="2d_tour", obj=lda_tour_history_2d) } ## ----------------------------------------------------------------------------- if (interactive()){ interactive_tour(data=data_rf_sel, plot_objects=list(obj1,obj2), feature_names=feature_names, half_range=half_range/2, n_plot_cols=2, preselection=yield_values, preselection_names=yield_labels, n_subsets=11, display_size=8, color_scale = "plasma") } ## ----------------------------------------------------------------------------- intervals <- seq(min_yield, max_yield, length.out = 4) new_yield_labels <- paste0("Yield ", head(intervals, -1), "-", tail(intervals, -1)) new_min_yield <- min(data$Yield, na.rm = TRUE) new_max_yield <- max(data$Yield, na.rm = TRUE) new_intervals <- seq(new_min_yield, new_max_yield, length.out = 4) new_yield_values <- findInterval(data$Yield, new_intervals, rightmost.closed = TRUE) ## ----results = 'hide', warning=FALSE, message=FALSE--------------------------- if (requireNamespace("randomForest")) { grand_tour_history_1d <- save_history(data_rf_sel, tour_path = guided_tour(lda_pp(new_yield_values),d=1)) lda_tour_history_2d <- save_history(data_rf_sel, tour_path = guided_tour(lda_pp(new_yield_values),d=2)) half_range <- max(sqrt(rowSums(data_rf_sel^2))) feature_names <- colnames(data_rf_sel) obj1 <- list(type="1d_tour", obj=grand_tour_history_1d) obj2 <- list(type="2d_tour", obj=lda_tour_history_2d) } ## ----------------------------------------------------------------------------- if (interactive()){ interactive_tour(data=data_rf_sel, plot_objects=list(obj1,obj2), feature_names=feature_names, half_range=half_range/2, n_plot_cols=2, preselection=new_yield_values, preselection_names=new_yield_labels, n_subsets=3, display_size=8, color_scale = "plasma") }