The SCE package provides implementations of the Stepwise Clustered Ensemble (SCE) and Stepwise Cluster Analysis (SCA) methods for multivariate data analysis. These methods are particularly useful for handling complex, high-dimensional datasets and building robust predictive models.
The package supports proper S3 object-oriented programming, providing
dedicated output classes with associated methods for `print`, `summary`,
`predict`, `importance`, and `evaluate`.
Install SCE from CRAN:
install.packages("SCE")
Or install the development version from GitHub:
# devtools is required for installing from GitHub; uncomment to install it first.
# install.packages("devtools")
devtools::install_github("loong2020/Stepwise-Clustered-Ensemble")
- `SCE()`: Build a Stepwise Clustered Ensemble model
- `SCA()`: Build a Stepwise Cluster Analysis model (single tree)
- `Model_simulation()`: Perform SCE model prediction
- `SCA_tree_predict()`: Perform SCA model prediction
- `SCE_Model_evaluation()`: Evaluate SCE model performance
- `SCA_Model_evaluation()`: Evaluate SCA model performance
- `RFE_SCE()`: Recursive Feature Elimination for SCE
- `Wilks_importance()`: Calculate variable importance for SCE using Wilks' lambda
- `SCA_importance()`: Calculate variable importance for a single SCA tree

The package provides S3 classes for both SCE and SCA models with convenient methods:
Methods for SCE models:
- `print()`: Display model information and performance metrics
- `summary()`: Detailed model summary with statistics
- `predict()`: Make predictions on new data (returns Training, Validation, and Testing predictions)
- `importance()`: Calculate variable importance using Wilks' lambda
- `evaluate()`: Evaluate model performance (training, validation, and testing)

Methods for SCA models:
- `print()`: Display tree structure and variable information
- `summary()`: Detailed tree summary with statistics
- `predict()`: Make predictions on new data
- `importance()`: Calculate variable importance
- `evaluate()`: Evaluate model performance (testing only)

# Build models
# Schematic usage: `data`, `predictors`, `predictants`, `newdata`,
# `Testing_data`, `Training_data`, `Predictant` and `...` are placeholders
# to be replaced with your own objects and arguments.
sce_model <- SCE(Training_data = data, X = predictors, Y = predictants, ...)
sca_model <- SCA(Training_data = data, X = predictors, Y = predictants, ...)

# Use S3 methods
print(sce_model) # Display model info
summary(sce_model) # Detailed summary
predictions <- predict(sce_model, newdata) # Make predictions
imp_ranking <- importance(sce_model) # Calculate variable importance
evaluation <- evaluate(sce_model, Testing_data, Training_data, Predictant) # Evaluate model

# Check available methods
methods(class = "SCE")
methods(class = "SCA")
The package includes several datasets for demonstration and testing:
- `Streamflow_training_10var`, `Streamflow_testing_10var`
- `Streamflow_training_22var`, `Streamflow_testing_22var`
- `Air_quality_training`, `Air_quality_testing`
First, load the required packages and data:
# Load required packages
library(SCE)
library(parallel)

# Load the example datasets
data(Streamflow_training_10var)
data(Streamflow_testing_10var)

# Define predictors and predictants (column names in the streamflow datasets)
Predictors <- c(
  "Prcp", "SRad", "Tmax", "Tmin", "VP",
  "smlt", "swvl1", "swvl2", "swvl3", "swvl4"
)
Predictants <- c("Flow")
# Perform SCA
# Fix the RNG seed so the example is reproducible.
set.seed(123)
model <- SCA(
  alpha = 0.05,
  Training_data = Streamflow_training_10var,
  X = Predictors,
  Y = Predictants,
  Nmin = 5,
  resolution = 100
)

# Use S3 methods
print(model)
summary(model)

# Calculate variable importance
Imp_ranking <- importance(model)
print(Imp_ranking)

# Make predictions
prediction <- predict(model, Streamflow_testing_10var)

# Evaluate performance (no Training_data is passed for a single SCA tree)
performance <- evaluate(
  object = model,
  Testing_data = Streamflow_testing_10var,
  Predictant = Predictants
)
print(performance)
# Sort predictors by decreasing relative importance, then plot them.
# Expects `Imp_ranking` (from importance(model)) to have the columns
# `Relative_Importance` and `Predictor`.
Importance_ranking_sorted <- Imp_ranking[order(-Imp_ranking$Relative_Importance), ]
barplot(
  Importance_ranking_sorted$Relative_Importance,
  names.arg = Importance_ranking_sorted$Predictor,
  las = 2, # vertical labels
  col = "skyblue",
  main = "Variable Importance (SCA)",
  ylab = "Importance",
  xlab = "Predictor"
)
# Build SCE model
# Fix the RNG seed so the example is reproducible.
set.seed(123)
Ensemble <- SCE(
  Training_data = Streamflow_training_10var,
  X = Predictors,
  Y = Predictants,
  mfeature = round(0.5 * length(Predictors)), # half the predictors, rounded
  Nmin = 5,
  Ntree = 40,
  alpha = 0.05,
  resolution = 100
)

# Use S3 methods
print(Ensemble)
summary(Ensemble)

# Make predictions
predictions <- predict(Ensemble, Streamflow_testing_10var)
cat("Prediction components:", names(predictions), "\n")
cat("Testing predictions dimensions:", dim(predictions$Testing), "\n")

# Calculate variable importance
Imp_ranking <- importance(Ensemble)

# Evaluate model performance
evaluation <- evaluate(
  object = Ensemble,
  Testing_data = Streamflow_testing_10var,
  Training_data = Streamflow_training_10var,
  Predictant = Predictants,
  digits = 3
)
print(evaluation)
# Sort predictors by decreasing relative importance, then plot them.
# Expects `Imp_ranking` (from importance(Ensemble)) to have the columns
# `Relative_Importance` and `Predictor`.
Importance_ranking_sorted <- Imp_ranking[order(-Imp_ranking$Relative_Importance), ]
barplot(
  Importance_ranking_sorted$Relative_Importance,
  names.arg = Importance_ranking_sorted$Predictor,
  las = 2, # vertical labels
  col = "skyblue",
  main = "Variable Importance (SCE)",
  ylab = "Importance",
  xlab = "Predictor"
)
# Load the example datasets
data(Air_quality_training)
data(Air_quality_testing)

# Define predictors and multiple predictants
Predictors <- c("SO2", "NO2", "CO", "O3", "TEMP", "PRES", "DEWP", "RAIN", "WSPM")
Predictants <- c("PM2.5", "PM10")

# Build and evaluate model
# Fix the RNG seed so the example is reproducible.
set.seed(123)
Ensemble <- SCE(
  Training_data = Air_quality_training,
  X = Predictors,
  Y = Predictants,
  mfeature = round(0.5 * length(Predictors)), # half the predictors, rounded
  Nmin = 5,
  Ntree = 40,
  alpha = 0.05,
  resolution = 100
)

# Use S3 methods
print(Ensemble)
summary(Ensemble)

# Make predictions
predictions <- predict(Ensemble, Air_quality_testing)

# Calculate variable importance
Imp_ranking <- importance(Ensemble)

# Evaluate model performance
evaluation <- evaluate(
  object = Ensemble,
  Testing_data = Air_quality_testing,
  Training_data = Air_quality_training,
  Predictant = Predictants
)
print(evaluation)
# Sort predictors by decreasing relative importance, then plot them.
# Expects `Imp_ranking` (from importance(Ensemble)) to have the columns
# `Relative_Importance` and `Predictor`.
Importance_ranking_sorted <- Imp_ranking[order(-Imp_ranking$Relative_Importance), ]
barplot(
  Importance_ranking_sorted$Relative_Importance,
  names.arg = Importance_ranking_sorted$Predictor,
  las = 2, # vertical labels
  col = "skyblue",
  main = "Variable Importance (SCE)",
  ylab = "Importance",
  xlab = "Predictor"
)
# Load the example datasets
data(Streamflow_training_22var)
data(Streamflow_testing_22var)

# Define predictors and predictants
Predictors <- c(
  "Precipitation", "Radiation", "Tmax", "Tmin", "VP",
  "Precipitation_2Mon", "Radiation_2Mon", "Tmax_2Mon", "Tmin_2Mon", "VP_2Mon",
  "PNA", "Nino3.4", "IPO", "PDO",
  "PNA_lag1", "Nino3.4_lag1", "IPO_lag1", "PDO_lag1",
  "PNA_lag2", "Nino3.4_lag2", "IPO_lag2", "PDO_lag2"
)
Predictants <- c("Flow")

# Perform RFE
# Fix the RNG seed so the example is reproducible.
set.seed(1)
result <- RFE_SCE(
  Training_data = Streamflow_training_22var,
  Testing_data = Streamflow_testing_22var,
  Predictors = Predictors,
  Predictant = Predictants,
  Nmin = 5,
  Ntree = 48,
  alpha = 0.05,
  resolution = 1000,
  step = 3 # Number of predictors to remove at each iteration
)

# Plot RFE results
Plot_RFE(result)
Full documentation is available through the R help system:
# Each `?topic` line opens the help page for that topic; run them one at a time.

# Core functions
?SCE
?SCA

# S3 methods
?predict.SCE
?predict.SCA
?importance.SCE
?importance.SCA
?evaluate.SCE
?evaluate.SCA
?print.SCE
?print.SCA
?summary.SCE
?summary.SCA

# Traditional functions (for advanced users)
?Model_simulation
?SCA_tree_predict
?SCA_Model_evaluation
?SCE_Model_evaluation
?RFE_SCE
?Plot_RFE
?Wilks_importance
?SCA_importance
This package is licensed under the GPL-3 License.