SCE: Stepwise Clustered Ensemble

Overview

The SCE (Stepwise Clustered Ensemble) package provides implementation of Stepwise Clustered Ensemble (SCE) and Stepwise Cluster Analysis (SCA) methods for multivariate data analysis. These methods are particularly useful for handling complex, high-dimensional datasets and building robust predictive models.

The package supports proper S3 object-oriented programming, providing dedicated output classes with associated methods for print, summary, predict, importance, and evaluate.

Installation

Install SCE from CRAN:

install.packages("SCE")

Or install the development version from GitHub:

# install.packages("devtools")
devtools::install_github("loong2020/Stepwise-Clustered-Ensemble")

Core Functions

Main Modeling Functions

Prediction and Evaluation

Feature Selection and Importance

S3 Classes and Methods

The package provides S3 classes for both SCE and SCA models with convenient methods:

SCE Class Methods

SCA Class Methods

Quick Start with S3 Methods

# Build models
sce_model <- SCE(Training_data = data, X = predictors, Y = predictants, ...)
sca_model <- SCA(Training_data = data, X = predictors, Y = predictants, ...)

# Use S3 methods
print(sce_model)           # Display model info
summary(sce_model)         # Detailed summary
predictions <- predict(sce_model, newdata)  # Make predictions
imp_ranking <- importance(sce_model)  # Calculate variable importance
evaluation <- evaluate(sce_model, Testing_data, Training_data, Predictant)  # Evaluate model

# Check available methods
methods(class = "SCE")
methods(class = "SCA")

Available Datasets

The package includes several datasets for demonstration and testing:

Streamflow Datasets

Air Quality Datasets

Usage Examples

First, load the required packages and data:

# Load required packages
library(SCE)
library(parallel)

SCA (Single tree) Analysis

# Load the example datasets
data(Streamflow_training_10var)
data(Streamflow_testing_10var)

# Define predictors and predictants
Predictors <- c("Prcp", "SRad", "Tmax", "Tmin", "VP", "smlt", "swvl1", "swvl2", "swvl3", "swvl4")
Predictants <- c("Flow")

# Perform SCA
set.seed(123)
model <- SCA(alpha = 0.05, 
            Training_data = Streamflow_training_10var, 
            X = Predictors, 
            Y = Predictants, 
            Nmin = 5, 
            resolution = 100)

# Use S3 methods
print(model)
summary(model)

# Calculate variable importance
Imp_ranking <- importance(model)
print(Imp_ranking)

# Make predictions
prediction <- predict(model, Streamflow_testing_10var)

# Evaluate performance
performance <- evaluate(
  object = model,
  Testing_data = Streamflow_testing_10var,
  Predictant = Predictants
)
print(performance)

Importance_ranking_sorted <- Imp_ranking[order(-Imp_ranking$Relative_Importance), ]
barplot(
  Importance_ranking_sorted$Relative_Importance,
  names.arg = Importance_ranking_sorted$Predictor,
  las = 2, # vertical labels
  col = "skyblue",
  main = "Variable Importance (SCA)",
  ylab = "Importance",
  xlab = "Predictor"
)

SCE (Tree ensemble) Analysis

# Build SCE model
set.seed(123)
Ensemble <- SCE(Training_data = Streamflow_training_10var,
               X = Predictors,
               Y = Predictants,
               mfeature = round(0.5 * length(Predictors)),
               Nmin = 5,
               Ntree = 40,
               alpha = 0.05,
               resolution = 100)

# Use S3 methods
print(Ensemble)
summary(Ensemble)

# Make predictions
predictions <- predict(Ensemble, Streamflow_testing_10var)
cat("Prediction components:", names(predictions), "\n")
cat("Testing predictions dimensions:", dim(predictions$Testing), "\n")

# Calculate variable importance
Imp_ranking <- importance(Ensemble)

# Evaluate model performance
evaluation <- evaluate(
  object = Ensemble,
  Testing_data = Streamflow_testing_10var,
  Training_data = Streamflow_training_10var,
  Predictant = Predictants,
  digits = 3
)
print(evaluation)

Importance_ranking_sorted <- Imp_ranking[order(-Imp_ranking$Relative_Importance), ]
barplot(
  Importance_ranking_sorted$Relative_Importance,
  names.arg = Importance_ranking_sorted$Predictor,
  las = 2, # vertical labels
  col = "skyblue",
  main = "Variable Importance (SCE)",
  ylab = "Importance",
  xlab = "Predictor"
)

Multiple Predictants Case

# Define predictors and multiple predictants
# Load the example datasets
data(Air_quality_training)
data(Air_quality_testing)

Predictors <- c("SO2", "NO2", "CO", "O3", "TEMP", "PRES", "DEWP", "RAIN", "WSPM")
Predictants <- c("PM2.5", "PM10")

# Build and evaluate model
set.seed(123)
Ensemble <- SCE(Training_data = Air_quality_training,
               X = Predictors,
               Y = Predictants,
               mfeature = round(0.5 * length(Predictors)),
               Nmin = 5,
               Ntree = 40,
               alpha = 0.05,
               resolution = 100)

# Use S3 methods
print(Ensemble)
summary(Ensemble)

# Make predictions
predictions <- predict(Ensemble, Air_quality_testing)

# Calculate variable importance
Imp_ranking <- importance(Ensemble)

# Evaluate model performance
evaluation <- evaluate(
  object = Ensemble,
  Testing_data = Air_quality_testing,
  Training_data = Air_quality_training,
  Predictant = Predictants
)
print(evaluation)

Importance_ranking_sorted <- Imp_ranking[order(-Imp_ranking$Relative_Importance), ]
barplot(
  Importance_ranking_sorted$Relative_Importance,
  names.arg = Importance_ranking_sorted$Predictor,
  las = 2, # vertical labels
  col = "skyblue",
  main = "Variable Importance (SCE)",
  ylab = "Importance",
  xlab = "Predictor"
)

Recursive Feature Elimination

# Load the example datasets
data(Streamflow_training_22var)
data(Streamflow_testing_22var)

# Define predictors and predictants
Predictors <- c(
  "Precipitation", "Radiation", "Tmax", "Tmin", "VP",
  "Precipitation_2Mon", "Radiation_2Mon", "Tmax_2Mon", "Tmin_2Mon", "VP_2Mon",
  "PNA", "Nino3.4", "IPO", "PDO",
  "PNA_lag1", "Nino3.4_lag1", "IPO_lag1", "PDO_lag1",
  "PNA_lag2", "Nino3.4_lag2", "IPO_lag2", "PDO_lag2"
)
Predictants <- c("Flow")

# Perform RFE
set.seed(1)
result <- RFE_SCE(
  Training_data = Streamflow_training_22var,
  Testing_data = Streamflow_testing_22var,
  Predictors = Predictors,
  Predictant = Predictants,
  Nmin = 5,
  Ntree = 48,
  alpha = 0.05,
  resolution = 1000,
  step = 3  # Number of predictors to remove at each iteration
)

# Plot RFE results
Plot_RFE(result)

Documentation

Full documentation is available through the R help system:

# Core functions
?SCE
?SCA

# S3 methods
?predict.SCE
?predict.SCA
?importance.SCE
?importance.SCA
?evaluate.SCE
?evaluate.SCA
?print.SCE
?print.SCA
?summary.SCE
?summary.SCA

# Traditional functions (for advanced users)
?Model_simulation
?SCA_tree_predict
?SCA_Model_evaluation
?SCE_Model_evaluation
?RFE_SCE
?Plot_RFE
?Wilks_importance
?SCA_importance

License

This package is licensed under the GPL-3 License.

Authors