Ensembling Custom Python Models in TurboML
TurboML allows you to create custom ensemble models using Python classes, leveraging the flexibility of Python while benefiting from TurboML's performance and scalability. In this notebook, we'll walk through how to create a custom ensemble model using TurboML's PythonEnsembleModel interface.
Imports and Utilities
We'll start by importing the necessary libraries and modules.
from river import datasets
import pandas as pd
import turboml as tb
import turboml.common.pytypes as types
import turboml.common.pymodel as model
import logging
from typing import List
import matplotlib.pyplot as plt
Prepare an Evaluation Dataset
We use the standard Credit Card Fraud dataset that ships with River to evaluate our models.
# Instantiate River's CreditCard dataset object.
dataset = datasets.CreditCard()
# Bare expression: displayed only if it is the last expression of its notebook cell.
dataset
# Take the first (sample, score) pair from the dataset to inspect its shape.
sample, score = next(iter(dataset))
# Bare expression: displayed only if it is the last expression of its notebook cell.
sample
# Walk the dataset once, collecting feature dicts and wrapping each label
# in a one-key record so it becomes a "score" column.
sample_inputs, sample_labels = [], []
for sample, score in dataset:
    sample_labels.append({"score": score})
    sample_inputs.append(sample)

# One DataFrame row per collected record.
df_features = pd.DataFrame(sample_inputs)
df_labels = pd.DataFrame(sample_labels)

# The 'Time' column is not used as a feature, so remove it.
df_features = df_features.drop(columns=["Time"])
Load Datasets into TurboML
We'll load the features and labels into TurboML PandasDataset objects.
# Keyword arguments common to both uploads. reset_index() below turns the
# default row index into an "index" column, which is used as the key field.
shared_dataset_options = {"key_field": "index", "upload": True}

features = tb.PandasDataset(
    dataset_name="cc_feats_ensemble",
    dataframe=df_features.reset_index(),
    **shared_dataset_options,
)
labels = tb.PandasDataset(
    dataset_name="cc_labels_ensemble",
    dataframe=df_labels.reset_index(),
    **shared_dataset_options,
)
Isolate features
# Every column left in df_features is passed as a numerical field.
numerical_cols = list(df_features.columns)
input_features = features.get_input_fields(numerical_fields=numerical_cols)
# "score" is the single column of the labels DataFrame.
label = labels.get_label_field(label_field="score")
Structure of Ensemble Models
A custom ensemble model in TurboML must implement three instance methods:
- init_imports: Import any external modules used in the class.
- learn_one: Receive labeled data for the model to learn from.
- predict_one: Receive input features for prediction and output the result.

Here's the general structure:
class CustomEnsembleModel:
    """Skeleton of a custom ensemble class.

    The constructor stores the base models; ``init_imports``, ``learn_one``
    and ``predict_one`` are empty placeholders to be filled in.
    """

    # Base models are annotated with ``model.Model`` (from
    # ``turboml.common.pymodel``), the same type the worked example in this
    # notebook uses for its base models.
    def __init__(self, base_models: List[model.Model]):
        """Store the base models this ensemble wraps.

        Args:
            base_models: The models to combine; must not be empty.

        Raises:
            ValueError: If ``base_models`` is empty (or otherwise falsy).
        """
        # Ensure at least one base model is provided
        if not base_models:
            raise ValueError("PythonEnsembleModel requires at least one base model.")
        self.base_models = base_models

    def init_imports(self):
        """
        Import any external symbols/modules used in this class
        """
        pass

    def learn_one(self, input: types.InputData):
        """
        Receives labelled data for the model to learn from
        """
        pass

    def predict_one(self, input: types.InputData, output: types.OutputData):
        """
        Receives input features for a prediction, must pass output to the
        output object
        """
        pass
Example - Creating a Custom Ensemble Model
We'll create a custom ensemble model that averages the predictions of its base models.
class MyEnsembleModel:
    """Ensemble that trains every base model and averages their scores."""

    def __init__(self, base_models: List[model.Model]):
        """Store the base models and create a logger for this module.

        Args:
            base_models: The models to combine; must not be empty.

        Raises:
            ValueError: If ``base_models`` is empty (or otherwise falsy).
        """
        if not base_models:
            raise ValueError("PythonEnsembleModel requires at least one base model.")
        self.base_models = base_models
        self.logger = logging.getLogger(__name__)

    def init_imports(self):
        """Intentionally empty: nothing is imported here."""
        pass

    def learn_one(self, input: types.InputData):
        """Pass one labelled sample to every base model, in order.

        Any exception is logged and swallowed. If a base model raises, the
        models after it do not see this sample.

        Args:
            input: The labelled sample forwarded to each base model.
        """
        try:
            # Named ``base_model`` so the loop does not shadow the ``model``
            # module alias used in the constructor annotation.
            for base_model in self.base_models:
                base_model.learn_one(input)
        except Exception as e:
            # Lazy %-formatting; logger.exception also records the traceback.
            self.logger.exception("Exception in learn_one: %s", e)

    def predict_one(self, input: types.InputData, output: types.OutputData):
        """Write the mean of the base models' scores to ``output``.

        Each base model's ``predict_one(input)`` result is read through its
        ``score()`` method. Any exception is logged and swallowed, in which
        case ``output.set_score`` is never called.

        Args:
            input: The sample to score.
            output: Receives the averaged score via ``set_score``.
        """
        try:
            total_score = 0.0
            for base_model in self.base_models:
                model_output = base_model.predict_one(input)
                total_score += model_output.score()
            # The constructor rejects an empty list, so the divisor is >= 1.
            average_score = total_score / len(self.base_models)
            output.set_score(average_score)
        except Exception as e:
            self.logger.exception("Exception in predict_one: %s", e)
Set Up the Virtual Environment
We'll set up a virtual environment and add our custom ensemble class to it. Since our class requires arguments in the constructor, we'll disable validation when adding it.
# Name under which the virtual environment is created and later referenced.
venv_name = "my_ensemble_venv"
# Second argument to setup_venv; presumably the packages to install (confirm against the TurboML docs).
venv_packages = ["river"]
venv = tb.setup_venv(venv_name, venv_packages)

# The class takes constructor arguments, so it is registered with
# model validation switched off.
venv.add_python_class(MyEnsembleModel, do_validate_as_model=False)
Create Base Models
We'll use TurboML's built-in models as base models for our ensemble.
# Two built-in TurboML classifiers, each configured for two classes.
model1, model2 = (
    tb.HoeffdingTreeClassifier(n_classes=2),
    tb.AMFClassifier(n_classes=2),
)

# Wrap them in a PythonEnsembleModel that refers to the custom class by name
# and to the virtual environment it was added to.
ensemble_model = tb.PythonEnsembleModel(
    class_name="MyEnsembleModel",
    module_name="",
    venv_name=venv_name,
    base_models=[model1, model2],
)
Deploy the Ensemble Model
We'll deploy the ensemble model, providing the input features and labels.
# Deploy the ensemble under the name "ensemble_model", passing the input
# fields and the label field selected earlier in the notebook.
deployed_ensemble_model = ensemble_model.deploy(
name="ensemble_model", input=input_features, labels=label
)
Evaluate the Ensemble Model
We'll add a metric to evaluate the model and plot the results.
# Register the metric on the deployed model, then fetch its evaluations.
metric_name = "WindowedRMSE"
deployed_ensemble_model.add_metric(metric_name)
model_rmse_scores = deployed_ensemble_model.get_evaluation(metric_name)

# Draw the metric values in the order they were returned.
plt.figure(figsize=(10, 6))
rmse_values = [entry.metric for entry in model_rmse_scores]
plt.plot(rmse_values, label="Ensemble Model RMSE")
plt.title("Ensemble Model Evaluation")
plt.xlabel("Time Steps")
plt.ylabel("RMSE")
plt.legend()
plt.show()