## ----setup, include = FALSE---------------------------------------------------
not_cran <- identical(Sys.getenv("NOT_CRAN"), "true")

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  echo = TRUE,
  eval = not_cran,
  warning = FALSE,
  message = FALSE,
  out.width = "100%",
  fig.align = "center"
)
library(knitr)
library(dataSDA)
library(RSDA)
library(HistDAWass)

has_symbolicDA <- requireNamespace("symbolicDA", quietly = TRUE)
has_MAINT <- requireNamespace("MAINT.Data", quietly = TRUE)
has_e1071 <- requireNamespace("e1071", quietly = TRUE)
has_ggInterval <- requireNamespace("ggInterval", quietly = TRUE) && not_cran

## ----dataset-summary, echo = FALSE--------------------------------------------
# # Use installed package data dir; fall back to source tree when building locally
# data_dir <- system.file("data", package = "dataSDA")
# if (!nzchar(data_dir) || length(list.files(data_dir, pattern = "[.]rda$")) == 0)
#   data_dir <- file.path("..", "data")
# 
# data_files <- list.files(data_dir, pattern = "[.]rda$")
# data_names <- sub("[.]rda$", "", data_files)
# 
# classify_type <- function(name) {
#   if (grepl("[.]its$", name))      return("Interval Time Series (.its)")
#   if (grepl("[.]int[.]mm$", name)) return("Interval (.int.mm)")
#   if (grepl("[.]int$", name))      return("Interval (.int)")
#   if (grepl("[.]iGAP$", name))     return("Interval (.iGAP)")
#   if (grepl("[.]hist$", name))     return("Histogram (.hist)")
#   if (grepl("[.]distr$", name))    return("Distributional (.distr)")
#   if (grepl("[.]mix$", name))      return("Mixed (.mix)")
#   if (grepl("[.]modal$", name))    return("Modal (.modal)")
#   "Other"
# }
# 
# types <- sapply(data_names, classify_type, USE.NAMES = FALSE)
# type_tbl <- as.data.frame(table(Type = types), stringsAsFactors = FALSE)
# type_tbl <- type_tbl[order(-type_tbl$Freq), ]
# names(type_tbl)[2] <- "Datasets"
# kable(type_tbl, row.names = FALSE,
#       caption = "Dataset counts by type")

## -----------------------------------------------------------------------------
# data(mushroom.int)
# head(mushroom.int, 3)
# class(mushroom.int)

## -----------------------------------------------------------------------------
# data(abalone.int)
# head(abalone.int, 3)
# class(abalone.int)

## -----------------------------------------------------------------------------
# data(abalone.iGAP)
# head(abalone.iGAP, 3)
# class(abalone.iGAP)

## -----------------------------------------------------------------------------
# int_detect_format(mushroom.int)
# int_detect_format(abalone.int)
# int_detect_format(abalone.iGAP)

## -----------------------------------------------------------------------------
# int_list_conversions()

## -----------------------------------------------------------------------------
# # RSDA to MM
# mushroom.MM <- int_convert_format(mushroom.int, to = "MM")
# head(mushroom.MM, 3)

## -----------------------------------------------------------------------------
# # iGAP to MM
# abalone.MM <- int_convert_format(abalone.iGAP, to = "MM")
# head(abalone.MM, 3)

## -----------------------------------------------------------------------------
# # iGAP to RSDA
# data(face.iGAP)
# face.RSDA <- int_convert_format(face.iGAP, to = "RSDA")
# head(face.RSDA, 3)

## -----------------------------------------------------------------------------
# # RSDA to MM
# mushroom.MM <- RSDA_to_MM(mushroom.int, RSDA = TRUE)
# head(mushroom.MM, 3)

## -----------------------------------------------------------------------------
# # MM to iGAP
# mushroom.iGAP <- MM_to_iGAP(mushroom.MM)
# head(mushroom.iGAP, 3)

## -----------------------------------------------------------------------------
# # iGAP to MM
# data(face.iGAP)
# face.MM <- iGAP_to_MM(face.iGAP, location = 1:6)
# head(face.MM, 3)

## -----------------------------------------------------------------------------
# # MM to RSDA
# face.RSDA <- MM_to_RSDA(face.MM)
# head(face.RSDA, 3)
# class(face.RSDA)

## -----------------------------------------------------------------------------
# # iGAP to RSDA (direct, one-step)
# abalone.RSDA <- iGAP_to_RSDA(abalone.iGAP, location = 1:7)
# head(abalone.RSDA, 3)
# class(abalone.RSDA)

## -----------------------------------------------------------------------------
# # RSDA to iGAP
# mushroom.iGAP2 <- RSDA_to_iGAP(mushroom.int)
# head(mushroom.iGAP2, 3)

## -----------------------------------------------------------------------------
# data(mushroom.int.mm)
# head(mushroom.int.mm, 3)

## -----------------------------------------------------------------------------
# mushroom_set <- set_variable_format(data = mushroom.int.mm, location = 8,
#                                     var = "Species")
# head(mushroom_set, 3)

## -----------------------------------------------------------------------------
# mushroom_tmp <- RSDA_format(data = mushroom_set,
#                             sym_type1 = c("I", "I", "I", "S"),
#                             location = c(25, 27, 29, 31),
#                             sym_type2 = c("S"),
#                             var = c("Species"))
# head(mushroom_tmp, 3)

## -----------------------------------------------------------------------------
# mushroom_clean <- clean_colnames(data = mushroom_tmp)
# head(mushroom_clean, 3)

## -----------------------------------------------------------------------------
# write_symbolic_csv(mushroom_clean, file = "mushroom_interval.csv")
# mushroom_int <- read_symbolic_csv(file = "mushroom_interval.csv")
# head(mushroom_int, 3)
# class(mushroom_int)

## ----include=FALSE------------------------------------------------------------
# file.remove("mushroom_interval.csv")

## -----------------------------------------------------------------------------
# BLOOD[1:3, 1:2]

## -----------------------------------------------------------------------------
# A1 <- c(50, 60, 70, 80, 90, 100, 110, 120)
# B1 <- c(0.00, 0.02, 0.08, 0.32, 0.62, 0.86, 0.92, 1.00)
# A2 <- c(50, 60, 70, 80, 90, 100, 110, 120)
# B2 <- c(0.00, 0.05, 0.12, 0.42, 0.68, 0.88, 0.94, 1.00)
# A3 <- c(50, 60, 70, 80, 90, 100, 110, 120)
# B3 <- c(0.00, 0.03, 0.24, 0.36, 0.75, 0.85, 0.98, 1.00)
# 
# ListOfWeight <- list(
#   distributionH(A1, B1),
#   distributionH(A2, B2),
#   distributionH(A3, B3)
# )
# 
# Weight <- methods::new("MatH",
#                        nrows = 3, ncols = 1, ListOfDist = ListOfWeight,
#                        names.rows = c("20s", "30s", "40s"),
#                        names.cols = c("weight"), by.row = FALSE)
# Weight

## -----------------------------------------------------------------------------
# data(mushroom.int)
# var_name <- c("Stipe.Length", "Stipe.Thickness")
# int_mean(mushroom.int, var_name, method = c("CM", "FV", "EJD"))

## -----------------------------------------------------------------------------
# data(mushroom.int)
# var_name <- c("Pileus.Cap.Width", "Stipe.Length")
# method <- c("CM", "VM", "QM", "SE", "FV", "EJD", "GQ", "SPT")
# 
# mean_mat <- int_mean(mushroom.int, var_name, method)
# mean_mat
# 
# var_mat <- int_var(mushroom.int, var_name, method)
# var_mat

## ----fig.width=7, fig.height=8------------------------------------------------
# cols <- c("#4E79A7", "#F28E2B")
# par(mfrow = c(2, 1), mar = c(5, 4, 3, 6), las = 2, xpd = TRUE)
# 
# # --- Mean across eight methods ---
# bp <- barplot(t(mean_mat), beside = TRUE, col = cols,
#               main = "Interval Mean by Method (mushroom.int)",
#               ylab = "Mean",
#               ylim = c(0, max(mean_mat) * 1.25))
# legend("topright", inset = c(-0.18, 0),
#        legend = colnames(mean_mat), fill = cols, bty = "n", cex = 0.85)
# 
# # --- Variance across eight methods ---
# bp <- barplot(t(var_mat), beside = TRUE, col = cols,
#               main = "Interval Variance by Method (mushroom.int)",
#               ylab = "Variance",
#               ylim = c(0, max(var_mat) * 1.25))
# legend("topright", inset = c(-0.18, 0),
#        legend = colnames(var_mat), fill = cols, bty = "n", cex = 0.85)

## -----------------------------------------------------------------------------
# cov_list <- int_cov(mushroom.int, "Pileus.Cap.Width", "Stipe.Length", method)
# cor_list <- int_cor(mushroom.int, "Pileus.Cap.Width", "Stipe.Length", method)
# 
# # Collect scalar values into named vectors for display and plotting
# cov_vec <- sapply(cov_list, function(x) x[1, 1])
# cor_vec <- sapply(cor_list, function(x) x[1, 1])
# 
# data.frame(Method = names(cov_vec), Covariance = round(cov_vec, 4),
#            Correlation = round(cor_vec, 4), row.names = NULL)

## ----fig.width=7, fig.height=8------------------------------------------------
# par(mfrow = c(2, 1), mar = c(5, 4, 3, 1), las = 2)
# 
# # --- Covariance across eight methods ---
# bar_cols <- c("#4E79A7", "#59A14F", "#F28E2B", "#E15759",
#               "#76B7B2", "#EDC948", "#B07AA1", "#FF9DA7")
# bp <- barplot(cov_vec, col = bar_cols, border = NA,
#               main = "Cov(Pileus.Cap.Width, Stipe.Length) by Method",
#               ylab = "Covariance",
#               ylim = c(0, max(cov_vec) * 1.25))
# text(bp, cov_vec, labels = round(cov_vec, 2), pos = 3, cex = 0.8)
# 
# # --- Correlation across eight methods ---
# bp <- barplot(cor_vec, col = bar_cols, border = NA,
#               main = "Cor(Pileus.Cap.Width, Stipe.Length) by Method",
#               ylab = "Correlation",
#               ylim = c(0, 1.15))
# text(bp, cor_vec, labels = round(cor_vec, 2), pos = 3, cex = 0.8)
# abline(h = 1, lty = 2, col = "grey50")

## -----------------------------------------------------------------------------
# data(mushroom.int)
# 
# # Width = upper - lower
# head(int_width(mushroom.int, "Stipe.Length"))
# 
# # Radius = width / 2
# head(int_radius(mushroom.int, "Stipe.Length"))
# 
# # Center = (lower + upper) / 2
# head(int_center(mushroom.int, "Stipe.Length"))
# 
# # Midrange
# head(int_midrange(mushroom.int, "Stipe.Length"))

## -----------------------------------------------------------------------------
# # Overlap between two interval variables
# head(int_overlap(mushroom.int, "Stipe.Length", "Stipe.Thickness"))
# 
# # Containment: proportion of var_name2 contained within var_name1
# head(int_containment(mushroom.int, "Stipe.Length", "Stipe.Thickness"))

## -----------------------------------------------------------------------------
# data(mushroom.int)
# 
# # Median (default method = "CM")
# int_median(mushroom.int, "Stipe.Length")
# 
# # Quantiles
# int_quantile(mushroom.int, "Stipe.Length", probs = c(0.25, 0.5, 0.75))
# 
# # Compare median across methods
# int_median(mushroom.int, "Stipe.Length", method = c("CM", "FV"))

## -----------------------------------------------------------------------------
# # Range (max - min)
# int_range(mushroom.int, "Stipe.Length")
# 
# # Interquartile range (Q3 - Q1)
# int_iqr(mushroom.int, "Stipe.Length")
# 
# # Median absolute deviation
# int_mad(mushroom.int, "Stipe.Length")
# 
# # Mode (histogram-based estimation)
# int_mode(mushroom.int, "Stipe.Length")

## -----------------------------------------------------------------------------
# data(mushroom.int)
# 
# # Compare standard mean vs trimmed mean (10% trim)
# int_mean(mushroom.int, "Stipe.Length", method = "CM")
# int_trimmed_mean(mushroom.int, "Stipe.Length", trim = 0.1, method = "CM")
# 
# # Winsorized mean: extreme values are replaced (not removed)
# int_winsorized_mean(mushroom.int, "Stipe.Length", trim = 0.1, method = "CM")

## -----------------------------------------------------------------------------
# int_var(mushroom.int, "Stipe.Length", method = "CM")
# int_trimmed_var(mushroom.int, "Stipe.Length", trim = 0.1, method = "CM")
# int_winsorized_var(mushroom.int, "Stipe.Length", trim = 0.1, method = "CM")

## -----------------------------------------------------------------------------
# data(mushroom.int)
# 
# # Skewness: asymmetry of the distribution
# int_skewness(mushroom.int, "Stipe.Length", method = "CM")
# 
# # Kurtosis: tail heaviness
# int_kurtosis(mushroom.int, "Stipe.Length", method = "CM")
# 
# # Symmetry coefficient
# int_symmetry(mushroom.int, "Stipe.Length", method = "CM")
# 
# # Tailedness (related to kurtosis)
# int_tailedness(mushroom.int, "Stipe.Length", method = "CM")

## -----------------------------------------------------------------------------
# data(mushroom.int)
# 
# int_jaccard(mushroom.int, "Stipe.Length", "Stipe.Thickness")
# int_dice(mushroom.int, "Stipe.Length", "Stipe.Thickness")
# int_cosine(mushroom.int, "Stipe.Length", "Stipe.Thickness")
# int_overlap_coefficient(mushroom.int, "Stipe.Length", "Stipe.Thickness")

## -----------------------------------------------------------------------------
# int_tanimoto(mushroom.int, "Stipe.Length", "Stipe.Thickness")

## -----------------------------------------------------------------------------
# int_similarity_matrix(mushroom.int, method = "jaccard")

## -----------------------------------------------------------------------------
# data(mushroom.int)
# 
# # Shannon entropy (higher = more uncertainty)
# int_entropy(mushroom.int, "Stipe.Length", method = "CM")
# 
# # Coefficient of variation (SD / mean)
# int_cv(mushroom.int, "Stipe.Length", method = "CM")
# 
# # Dispersion index
# int_dispersion(mushroom.int, "Stipe.Length", method = "CM")

## -----------------------------------------------------------------------------
# # Imprecision: based on interval widths
# int_imprecision(mushroom.int, "Stipe.Length")
# 
# # Granularity: variability in interval sizes
# int_granularity(mushroom.int, "Stipe.Length")
# 
# # Uniformity: inverse of granularity (higher = more uniform)
# int_uniformity(mushroom.int, "Stipe.Length")
# 
# # Normalized information content (between 0 and 1)
# int_information_content(mushroom.int, "Stipe.Length", method = "CM")

## -----------------------------------------------------------------------------
# data(car.int)
# car_num <- car.int[, 2:5]
# head(car_num, 3)

## -----------------------------------------------------------------------------
# # Euclidean distance between observations
# int_dist(car_num, method = "euclidean")

## -----------------------------------------------------------------------------
# # Return as a full matrix
# dm <- int_dist_matrix(car_num, method = "hausdorff")
# dm[1:5, 1:5]

## -----------------------------------------------------------------------------
# int_pairwise_dist(car_num, "Price", "Max_Velocity", method = "euclidean")

## -----------------------------------------------------------------------------
# all_dists <- int_dist_all(car_num)
# names(all_dists)

## -----------------------------------------------------------------------------
# all_methods <- c("BG", "BD", "B", "L2W")
# var_names <- c("Cholesterol", "Hemoglobin")
# 
# # Compute mean for each variable and method
# mean_mat <- sapply(all_methods, function(m) {
#   sapply(var_names, function(v) hist_mean(BLOOD, v, method = m))
# })
# rownames(mean_mat) <- var_names
# mean_mat
# 
# # Compute variance for each variable and method
# var_mat <- sapply(all_methods, function(m) {
#   sapply(var_names, function(v) hist_var(BLOOD, v, method = m))
# })
# rownames(var_mat) <- var_names
# var_mat

## ----fig.width=7, fig.height=8------------------------------------------------
# bar_cols <- c("#4E79A7", "#59A14F", "#F28E2B", "#E15759")
# par(mfrow = c(2, 2), mar = c(4, 5, 3, 1), las = 1)
# 
# # --- Mean: Cholesterol ---
# bp <- barplot(mean_mat["Cholesterol", ], col = bar_cols, border = NA,
#               main = "Mean of Cholesterol", ylab = "Mean",
#               ylim = c(0, max(mean_mat["Cholesterol", ]) * 1.15))
# text(bp, mean_mat["Cholesterol", ],
#      labels = round(mean_mat["Cholesterol", ], 2), pos = 3, cex = 0.8)
# 
# # --- Mean: Hemoglobin ---
# bp <- barplot(mean_mat["Hemoglobin", ], col = bar_cols, border = NA,
#               main = "Mean of Hemoglobin", ylab = "Mean",
#               ylim = c(0, max(mean_mat["Hemoglobin", ]) * 1.15))
# text(bp, mean_mat["Hemoglobin", ],
#      labels = round(mean_mat["Hemoglobin", ], 2), pos = 3, cex = 0.8)
# 
# # --- Variance: Cholesterol ---
# bp <- barplot(var_mat["Cholesterol", ], col = bar_cols, border = NA,
#               main = "Variance of Cholesterol", ylab = "Variance",
#               ylim = c(0, max(var_mat["Cholesterol", ]) * 1.25))
# text(bp, var_mat["Cholesterol", ],
#      labels = round(var_mat["Cholesterol", ], 1), pos = 3, cex = 0.8)
# 
# # --- Variance: Hemoglobin ---
# bp <- barplot(var_mat["Hemoglobin", ], col = bar_cols, border = NA,
#               main = "Variance of Hemoglobin", ylab = "Variance",
#               ylim = c(0, max(var_mat["Hemoglobin", ]) * 1.25))
# text(bp, var_mat["Hemoglobin", ],
#      labels = round(var_mat["Hemoglobin", ], 4), pos = 3, cex = 0.8)

## -----------------------------------------------------------------------------
# cov_vec <- sapply(all_methods, function(m)
#   hist_cov(BLOOD, "Cholesterol", "Hemoglobin", method = m))
# cor_vec <- sapply(all_methods, function(m)
#   hist_cor(BLOOD, "Cholesterol", "Hemoglobin", method = m))
# 
# data.frame(Method = all_methods,
#            Covariance = round(cov_vec, 4),
#            Correlation = round(cor_vec, 4),
#            row.names = NULL)

## ----fig.width=7, fig.height=4------------------------------------------------
# par(mfrow = c(1, 2), mar = c(4, 5, 3, 1), las = 1)
# 
# # --- Covariance ---
# bp <- barplot(cov_vec, col = bar_cols, border = NA,
#               main = "Cov(Cholesterol, Hemoglobin)",
#               ylab = "Covariance",
#               ylim = c(min(cov_vec) * 1.35, 0))
# text(bp, cov_vec, labels = round(cov_vec, 2), pos = 1, cex = 0.8)
# 
# # --- Correlation ---
# bp <- barplot(cor_vec, col = bar_cols, border = NA,
#               main = "Cor(Cholesterol, Hemoglobin)",
#               ylab = "Correlation",
#               ylim = c(min(cor_vec) * 1.4, 0))
# text(bp, cor_vec, labels = round(cor_vec, 2), pos = 1, cex = 0.8)
# abline(h = -1, lty = 2, col = "grey50")

## ----eda-aggregate------------------------------------------------------------
# set.seed(42)
# iris_int <- aggregate_to_symbolic(
#   iris,
#   type        = "int",
#   group_by    = "kmeans",
#   stratify_var = "Species",
#   K           = 10
# )
# iris_int

## ----eda-gginterval-setup, eval = has_ggInterval, message = FALSE-------------
# library(ggInterval)
# library(ggplot2)
# 
# # Keep only interval columns (drop the 'sample' label column).
# # Fix zero-width intervals from singleton clusters.
# iris_int_num <- iris_int[, sapply(iris_int, inherits, "symbolic_interval")]
# for (v in colnames(iris_int_num)) {
#   cv <- unclass(iris_int_num[[v]])
#   w  <- Im(cv) - Re(cv)
#   fix <- which(w == 0)
#   if (length(fix) > 0) {
#     cv[fix] <- complex(real = Re(cv[fix]) - 1e-6, imaginary = Im(cv[fix]) + 1e-6)
#     class(cv) <- c("symbolic_interval", "vctrs_vctr")
#     iris_int_num[[v]] <- cv
#   }
# }

## ----eda-indeximage, eval = has_ggInterval, fig.width = 8, fig.height = 5-----
# ggInterval_indexImage(iris_int_num, plotAll = TRUE)

## ----eda-pca, eval = has_ggInterval, fig.width = 6, fig.height = 5------------
# ggInterval_PCA(iris_int_num)

## ----eda-radar, eval = has_ggInterval, fig.width = 7, fig.height = 7----------
# data(environment.mix)
# env_int <- environment.mix[, 5:17]
# ggInterval_radarplot(env_int, plotPartial = c(4, 6),
#                      showLegend = FALSE, addText = FALSE)

## ----eda-ts, fig.width = 12, fig.height = 4-----------------------------------
# library(ggplot2)
# data(irish_wind.its)
# wind_sub <- irish_wind.its[1:12, ]
# 
# # Reshape to long format
# stations <- c("BIR", "DUB", "KIL", "SHA", "VAL")
# wind_long <- do.call(rbind, lapply(stations, function(st) {
#   data.frame(
#     month_num = seq_len(12),
#     Station   = st,
#     lower     = wind_sub[[paste0(st, "_l")]],
#     upper     = wind_sub[[paste0(st, "_u")]],
#     mid       = (wind_sub[[paste0(st, "_l")]] + wind_sub[[paste0(st, "_u")]]) / 2
#   )
# }))
# wind_long$Station <- factor(wind_long$Station, levels = stations)
# 
# # Dodge bars for each station within each month
# n_st  <- length(stations)
# bar_w <- 0.6 / n_st
# wind_long$st_idx <- as.numeric(wind_long$Station)
# wind_long$x <- wind_long$month_num +
#   (wind_long$st_idx - (n_st + 1) / 2) * bar_w
# 
# ggplot(wind_long) +
#   geom_rect(aes(xmin = x - bar_w / 2, xmax = x + bar_w / 2,
#                 ymin = lower, ymax = upper, fill = Station),
#             alpha = 0.4, color = NA) +
#   geom_line(aes(x = x, y = mid, color = Station, group = Station),
#             linewidth = 0.5) +
#   geom_point(aes(x = x, y = mid, color = Station), size = 1) +
#   scale_x_continuous(breaks = 1:12, labels = month.abb) +
#   labs(title = "Irish Wind Speed Intervals (1961)",
#        x = "Month", y = "Wind Speed (knots)") +
#   theme_grey(base_size = 12)

## ----clust-int-helpers, include = FALSE---------------------------------------
# # Helper: extract interval-only columns as symbolic_tbl
# .get_interval_cols <- function(x) {
#   int_cols <- sapply(x, function(col) inherits(col, "symbolic_interval"))
#   if (sum(int_cols) == 0) return(x)
#   out <- x[, int_cols, drop = FALSE]
#   class(out) <- c("symbolic_tbl", class(out))
#   out
# }
# 
# # Helper: convert symbolic_tbl to 3D array [n, p, 2] for symbolicDA
# .to_3d_array <- function(x) {
#   n <- nrow(x); p <- ncol(x)
#   arr <- array(0, dim = c(n, p, 2))
#   for (j in seq_len(p)) {
#     cv <- unclass(x[[j]])
#     arr[, j, 1] <- Re(cv)
#     arr[, j, 2] <- Im(cv)
#   }
#   arr
# }
# 
# # Helper: compute clustering quality (1 - WSS/TSS) from distance matrix
# .clust_quality <- function(d, cl) {
#   d <- as.matrix(d)
#   n <- nrow(d)
#   TSS <- sum(d^2) / (2 * n)
#   WSS <- 0
#   for (k in unique(cl)) {
#     idx <- which(cl == k)
#     nk <- length(idx)
#     if (nk > 1) WSS <- WSS + sum(d[idx, idx]^2) / (2 * nk)
#   }
#   1 - WSS / TSS
# }
# 
# # Helper: find optimal k via n-adaptive elbow method with 2-step lookahead
# .find_optimal_k <- function(qualities, n) {
#   ks <- as.integer(names(qualities))
#   qs <- qualities
#   valid <- !is.na(qs)
#   if (sum(valid) < 2) return(ks[which(valid)[1]])
#   valid_ks <- ks[valid]; valid_qs <- qs[valid]
#   gains <- diff(valid_qs)
#   max_gain <- max(gains, na.rm = TRUE)
#   if (max_gain <= 0) return(valid_ks[1])
#   threshold <- max_gain / (1 + n / 100)
#   for (i in seq_along(gains)) {
#     if (gains[i] < threshold) {
#       look <- (i + 1):min(i + 2, length(gains))
#       look <- look[look >= i + 1 & look <= length(gains)]
#       ahead <- gains[look]
#       if (length(ahead) > 0 && any(!is.na(ahead) & ahead >= threshold)) next
#       return(valid_ks[i])
#     }
#   }
#   valid_ks[length(valid_ks)]
# }

## ----clust-int, eval = has_symbolicDA && not_cran, results = "hide"-----------
# library(symbolicDA)
# 
# set.seed(123)
# 
# datasets_clust_int <- list(
#   list(name = "face.iGAP",              data = "face.iGAP"),
#   list(name = "prostate.int",           data = "prostate.int"),
#   list(name = "nycflights.int",         data = "nycflights.int"),
#   list(name = "china_temp.int",         data = "china_temp.int"),
#   list(name = "lisbon_air_quality.int", data = "lisbon_air_quality.int")
# )
# 
# clust_int_results <- do.call(rbind, lapply(datasets_clust_int, function(ds) {
#   tryCatch({
#     data(list = ds$data)
#     x <- get(ds$data)
#     if (!inherits(x, "symbolic_tbl")) {
#       x <- tryCatch(int_convert_format(x, to = "RSDA"), error = function(e) x)
#       for (i in seq_along(x)) {
#         if (is.complex(x[[i]]) && !inherits(x[[i]], "symbolic_interval"))
#           class(x[[i]]) <- c("symbolic_interval", "vctrs_vctr")
#       }
#       if (!inherits(x, "symbolic_tbl"))
#         class(x) <- c("symbolic_tbl", class(x))
#     }
#     x_int <- .get_interval_cols(x)
#     n <- nrow(x_int); p <- ncol(x_int)
#     k_max <- min(n - 1, 10, max(3, floor(n / 5)))
# 
#     d <- int_dist_matrix(x_int, method = "hausdorff")
#     so <- simple2SO(.to_3d_array(x_int))
# 
#     km_qs <- dc_qs <- sc_qs <- setNames(rep(NA_real_, k_max - 1),
#                                          as.character(2:k_max))
#     for (k in 2:k_max) {
#       set.seed(123)
#       km_qs[as.character(k)] <- tryCatch({
#         res <- sym.kmeans(x_int, k = k)
#         1 - res$tot.withinss / res$totss
#       }, error = function(e) NA)
# 
#       set.seed(123)
#       dc_qs[as.character(k)] <- tryCatch({
#         cl <- DClust(d, cl = k, iter = 100)
#         .clust_quality(d, cl)
#       }, error = function(e) NA)
# 
#       set.seed(123)
#       sc_qs[as.character(k)] <- tryCatch({
#         cl <- SClust(so, cl = k, iter = 100)
#         .clust_quality(d, cl)
#       }, error = function(e) NA)
#     }
# 
#     km_k <- .find_optimal_k(km_qs, n); km_q <- km_qs[as.character(km_k)]
#     dc_k <- .find_optimal_k(dc_qs, n); dc_q <- dc_qs[as.character(dc_k)]
#     sc_k <- .find_optimal_k(sc_qs, n); sc_q <- sc_qs[as.character(sc_k)]
# 
#     data.frame(Dataset = ds$name, n = n, p = p,
#                sym.kmeans = sprintf("%.4f (%d)", km_q, km_k),
#                DClust     = sprintf("%.4f (%d)", dc_q, dc_k),
#                SClust     = sprintf("%.4f (%d)", sc_q, sc_k))
#   }, error = function(e) NULL)
# }))

## ----clust-int-table, eval = has_symbolicDA && not_cran-----------------------
# kable(clust_int_results, row.names = FALSE,
#       caption = "Table 4: Interval clustering quality (1 - WSS/TSS) with optimal k in parentheses")

## ----clust-hist-helpers, include = FALSE--------------------------------------
# # Helper: parse a dataSDA histogram string into a HistDAWass distributionH
# # Format: "{[lo, hi), prob; [lo, hi], prob; ...}"
# .parse_hist_to_distH <- function(s) {
#   s <- trimws(sub("^\\{", "", sub("\\}$", "", s)))
#   bins <- trimws(strsplit(s, ";")[[1]])
#   xs <- numeric(0)
#   ps <- numeric(0)
#   for (b in bins) {
#     b_clean <- gsub("\\[|\\]|\\(|\\)", "", b)  # strip brackets
#     parts <- as.numeric(trimws(strsplit(b_clean, ",")[[1]]))
#     lo <- parts[1]; hi <- parts[2]; p <- parts[3]
#     if (length(xs) == 0) xs <- lo
#     xs <- c(xs, hi)
#     ps <- c(ps, p)
#   }
#   cp <- c(0, cumsum(ps))
#   cp[length(cp)] <- 1  # ensure exact 1
#   distributionH(xs, cp)
# }
# 
# # Helper: convert a dataSDA histogram data frame to a HistDAWass MatH object
# # Keeps histogram columns with >50% non-NA, then drops incomplete rows
# .dataSDA_hist_to_MatH <- function(df) {
#   df <- as.data.frame(df)
#   hist_cols <- names(df)[sapply(df, is.character)]
#   hist_cols <- hist_cols[sapply(hist_cols, function(cn)
#     any(grepl("^\\{\\[", na.omit(df[[cn]]))))]
#   # Keep only columns where >50% of values are non-NA (handles conditional vars)
#   hist_cols <- hist_cols[sapply(hist_cols, function(cn)
#     mean(!is.na(df[[cn]])) > 0.5)]
#   # Drop rows with remaining NAs
#   complete <- complete.cases(df[, hist_cols, drop = FALSE])
#   df <- df[complete, , drop = FALSE]
#   n <- nrow(df); p <- length(hist_cols)
#   dists <- vector("list", n * p)
#   for (j in seq_along(hist_cols)) {
#     for (i in seq_len(n)) {
#       dists[[(j - 1) * n + i]] <- .parse_hist_to_distH(df[[hist_cols[j]]][i])
#     }
#   }
#   rn <- if (!is.null(rownames(df))) rownames(df) else paste0("I", seq_len(n))
#   methods::new("MatH",
#     nrows = n, ncols = p,
#     ListOfDist = dists,
#     names.rows = rn,
#     names.cols = hist_cols,
#     by.row = FALSE)
# }

## ----clust-hist, results = "hide"---------------------------------------------
# set.seed(123)
# 
# datasets_clust_hist <- list(
#   list(name = "age_pyramids.hist"),
#   list(name = "ozone.hist"),
#   list(name = "china_climate_season.hist"),
#   list(name = "french_agriculture.hist"),
#   list(name = "flights_detail.hist")
# )
# 
# clust_hist_results <- do.call(rbind, lapply(datasets_clust_hist, function(ds) {
#   tryCatch({
#     data(list = ds$name, package = "dataSDA")
#     raw <- get(ds$name)
#     x <- .dataSDA_hist_to_MatH(raw)
#     n <- nrow(x@M); p <- ncol(x@M)
#     k_max <- min(n - 1, 10, max(3, floor(n / 5)))
# 
#     # Precompute Wasserstein distance matrix and hclust tree (shared across k)
#     dm <- WH_MAT_DIST(x)
#     set.seed(123)
#     hc <- WH_hclust(x, simplify = TRUE)
# 
#     km_qs <- fc_qs <- hc_qs <- setNames(rep(NA_real_, k_max - 1),
#                                          as.character(2:k_max))
#     for (k in 2:k_max) {
#       set.seed(123)
#       km_qs[as.character(k)] <- tryCatch({
#         res <- WH_kmeans(x, k = k)
#         res$quality
#       }, error = function(e) NA)
# 
#       set.seed(123)
#       fc_qs[as.character(k)] <- tryCatch({
#         res <- WH_fcmeans(x, k = k)
#         res$quality
#       }, error = function(e) NA)
# 
#       set.seed(123)
#       hc_qs[as.character(k)] <- tryCatch({
#         cl <- cutree(hc, k = k)
#         .clust_quality(dm, cl)
#       }, error = function(e) NA)
#     }
# 
#     km_k <- .find_optimal_k(km_qs, n); km_q <- km_qs[as.character(km_k)]
#     fc_k <- .find_optimal_k(fc_qs, n); fc_q <- fc_qs[as.character(fc_k)]
#     hc_k <- .find_optimal_k(hc_qs, n); hc_q <- hc_qs[as.character(hc_k)]
# 
#     data.frame(Dataset = ds$name, n = n, p = p,
#                WH_kmeans  = sprintf("%.4f (%d)", km_q, km_k),
#                WH_fcmeans = sprintf("%.4f (%d)", fc_q, fc_k),
#                WH_hclust  = sprintf("%.4f (%d)", hc_q, hc_k))
#   }, error = function(e) NULL)
# }))

## ----clust-hist-table---------------------------------------------------------
# kable(clust_hist_results, row.names = FALSE,
#       caption = "Table 5: Histogram clustering quality (1 - WSS/TSS) with optimal k in parentheses")

## ----class-helpers, include = FALSE-------------------------------------------
# # Helper: extract class labels from symbolic_set or character/factor column
# .get_class_labels <- function(x, col) {
#   cls <- x[[col]]
#   if (inherits(cls, "symbolic_set")) {
#     factor(vapply(cls, function(v) paste(v, collapse = ","), character(1)))
#   } else {
#     factor(cls)
#   }
# }
# 
# # Helper: build IData from interval columns of a symbolic_tbl
# .build_IData <- function(x) {
#   int_cols <- sapply(x, function(col) inherits(col, "symbolic_interval"))
#   df <- data.frame(row.names = seq_len(nrow(x)))
#   for (v in names(x)[int_cols]) {
#     cv <- unclass(x[[v]])
#     df[[paste0(v, "_l")]] <- Re(cv)
#     df[[paste0(v, "_u")]] <- Im(cv)
#   }
#   MAINT.Data::IData(df)
# }

## ----class-int, eval = has_MAINT && has_e1071 && not_cran, results = "hide"----
# library(MAINT.Data)
# library(e1071)
# 
# datasets_class <- list(
#   list(name = "cars.int",        data = "cars.int",
#        class_col = "class",
#        class_desc = "class: Utilitarian(7), Berlina(8), Sportive(8), Luxury(4)"),
#   list(name = "china_temp.int",  data = "china_temp.int",
#        class_col = "GeoReg",
#        class_desc = "GeoReg: 6 regions"),
#   list(name = "mushroom.int",    data = "mushroom.int",
#        class_col = "Edibility",
#        class_desc = "Edibility: T(4), U(2), Y(17)"),
#   list(name = "ohtemp.int",      data = "ohtemp.int",
#        class_col = "STATE",
#        class_desc = "STATE: 10 groups"),
#   list(name = "wine.int",        data = "wine.int",
#        class_col = "class",
#        class_desc = "class: 1(21), 2(12)")
# )
# 
# class_results <- do.call(rbind, lapply(datasets_class, function(ds) {
#   tryCatch({
#     data(list = ds$data)
#     x <- get(ds$data)
#     grp <- .get_class_labels(x, ds$class_col)
# 
#     idata <- .build_IData(x)
# 
#     int_cols <- sapply(x, function(col) inherits(col, "symbolic_interval"))
#     svm_df <- data.frame(row.names = seq_len(nrow(x)))
#     for (v in names(x)[int_cols]) {
#       cv <- unclass(x[[v]])
#       svm_df[[paste0(v, "_l")]] <- Re(cv)
#       svm_df[[paste0(v, "_u")]] <- Im(cv)
#     }
# 
#     set.seed(123)
#     lda_acc <- tryCatch({
#       res <- MAINT.Data::lda(idata, grouping = grp)
#       pred <- predict(res, idata)
#       mean(pred$class == grp)
#     }, error = function(e) NA)
# 
#     set.seed(123)
#     qda_acc <- tryCatch({
#       res <- MAINT.Data::qda(idata, grouping = grp)
#       pred <- predict(res, idata)
#       mean(pred$class == grp)
#     }, error = function(e) NA)
# 
#     set.seed(123)
#     svm_acc <- tryCatch({
#       svm_df$class <- grp
#       res <- svm(class ~ ., data = svm_df, kernel = "radial")
#       pred <- predict(res, svm_df)
#       mean(pred == grp)
#     }, error = function(e) NA)
# 
#     data.frame(Dataset = ds$name, Response = ds$class_desc,
#                LDA = lda_acc, QDA = qda_acc, SVM = svm_acc)
#   }, error = function(e) NULL)
# }))

## ----class-int-table, eval = has_MAINT && has_e1071 && not_cran---------------
# kable(class_results, digits = 4, row.names = FALSE,
#       caption = "Table 6: Classification accuracy (resubstitution)")

## ----reg-int, results = "hide"------------------------------------------------
# datasets_reg <- list(
#   list(name = "abalone.iGAP",      data = "abalone.iGAP",
#        response = "Length",           n_x = 6),
#   list(name = "cardiological.int",  data = "cardiological.int",
#        response = "pulse",            n_x = 4),
#   list(name = "nycflights.int",     data = "nycflights.int",
#        response = "distance",         n_x = 3),
#   list(name = "oils.int",           data = "oils.int",
#        response = "specific_gravity", n_x = 3),
#   list(name = "prostate.int",       data = "prostate.int",
#        response = "lpsa",             n_x = 8)
# )
# 
# reg_results <- do.call(rbind, lapply(datasets_reg, function(ds) {
#   tryCatch({
#     data(list = ds$data)
#     x <- get(ds$data)
# 
#     if (!inherits(x, "symbolic_tbl")) {
#       x2 <- tryCatch(int_convert_format(x, to = "RSDA"), error = function(e) NULL)
#       if (!is.null(x2)) {
#         x <- x2
#         for (i in seq_along(x)) {
#           if (is.complex(x[[i]]) && !inherits(x[[i]], "symbolic_interval"))
#             class(x[[i]]) <- c("symbolic_interval", "vctrs_vctr")
#         }
#         if (!inherits(x, "symbolic_tbl"))
#           class(x) <- c("symbolic_tbl", class(x))
#       } else {
#         cn <- colnames(x)
#         l_cols <- grep("_l$", cn, value = TRUE)
#         vars <- sub("_l$", "", l_cols)
#         out <- data.frame(row.names = seq_len(nrow(x)))
#         for (v in vars) {
#           lv <- x[[paste0(v, "_l")]]; uv <- x[[paste0(v, "_u")]]
#           si <- complex(real = lv, imaginary = uv)
#           class(si) <- c("symbolic_interval", "vctrs_vctr")
#           out[[v]] <- si
#         }
#         class(out) <- c("symbolic_tbl", class(out))
#         x <- out
#       }
#     }
#     x_int <- .get_interval_cols(x)
# 
#     fml <- as.formula(paste(ds$response, "~ ."))
#     nc <- data.frame(row.names = seq_len(nrow(x_int)))
#     for (v in names(x_int)) {
#       cv <- unclass(x_int[[v]])
#       nc[[v]] <- (Re(cv) + Im(cv)) / 2
#     }
#     actual <- nc[[ds$response]]
#     resp_idx <- which(names(x_int) == ds$response)
#     .r2 <- function(a, p) 1 - sum((a - p)^2) / sum((a - mean(a))^2)
# 
#     set.seed(123)
#     lm_r2 <- tryCatch({
#       res <- sym.lm(fml, sym.data = x_int, method = "cm")
#       summary(res)$r.squared
#     }, error = function(e) NA)
# 
#     set.seed(123)
#     glm_r2 <- tryCatch({
#       res <- sym.glm(sym.data = x_int, response = resp_idx, method = "cm")
#       pred <- as.numeric(predict(res, newx = as.matrix(nc[, -resp_idx]),
#                                  s = "lambda.min"))
#       .r2(actual, pred)
#     }, error = function(e) NA)
# 
#     set.seed(123)
#     rf_r2 <- tryCatch({
#       res <- sym.rf(fml, sym.data = x_int, method = "cm")
#       tail(res$rsq, 1)
#     }, error = function(e) NA)
# 
#     set.seed(123)
#     rt_r2 <- tryCatch({
#       res <- sym.rt(fml, sym.data = x_int, method = "cm")
#       .r2(actual, predict(res))
#     }, error = function(e) NA)
# 
#     set.seed(123)
#     nnet_r2 <- tryCatch({
#       res <- sym.nnet(fml, sym.data = x_int, method = "cm")
#       pred_sc <- as.numeric(res$net.result[[1]])
#       pred <- pred_sc * res$data_c_sds[resp_idx] + res$data_c_means[resp_idx]
#       .r2(actual, pred)
#     }, error = function(e) NA)
# 
#     data.frame(Dataset = ds$name, Response = ds$response, p = ds$n_x,
#                sym.lm = lm_r2, sym.glm = glm_r2, sym.rf = rf_r2,
#                sym.rt = rt_r2, sym.nnet = nnet_r2)
#   }, error = function(e) NULL)
# }))

## ----reg-int-table------------------------------------------------------------
# kable(reg_results, digits = 4, row.names = FALSE,
#       caption = "Table 7: Regression R-squared")