Python Model: Batch Example
In this example we emulate batch training of custom models defined using TurboML's `Python` model.
import turboml as tb
import pandas as pd
import numpy as np
Model Definition
Here we define `MyBatchModel` with buffers to store the input features and labels until we reach our buffer limit. Then, the model can be trained all at once on the buffered samples.
We use Scikit-Learn's `Perceptron` for this task.
from sklearn.linear_model import Perceptron
import turboml.common.pytypes as types
class MyBatchModel:
    """Perceptron wrapper that learns from buffered mini-batches.

    Each sample passed to ``learn_one`` is appended to ``X_buffer`` (features)
    and ``y_buffer`` (labels). As soon as ``batch_size`` samples have been
    collected, the whole buffer is handed to ``Perceptron.partial_fit`` in one
    call and the buffers are emptied again.
    """

    def __init__(self):
        # Buffers holding the samples seen since the last training step.
        self.X_buffer = []
        self.y_buffer = []
        # Number of buffered samples that triggers a training step.
        self.batch_size = 64
        # Flipped to True after the first successful partial_fit call.
        self.trained = False
        self.model = Perceptron()

    def init_imports(self):
        """Import the third-party names this model relies on.

        NOTE(review): presumably invoked by the TurboML runtime before the
        other methods are used -- confirm against the platform documentation.
        """
        import numpy as np
        from sklearn.linear_model import Perceptron

    def learn_one(self, input: types.InputData):
        """Buffer one labelled sample and train once the buffer is full.

        Args:
            input: Sample whose ``numeric`` attribute is stored as the feature
                row and whose ``label`` attribute is stored as the target.
        """
        self.X_buffer.append(input.numeric)
        self.y_buffer.append(input.label)

        # Not enough samples yet: keep buffering.
        if len(self.X_buffer) < self.batch_size:
            return

        feature_matrix = np.array(self.X_buffer)
        target_vector = np.array(self.y_buffer)
        # The label set (0 and 1) is passed explicitly on every call.
        self.model = self.model.partial_fit(
            feature_matrix, target_vector, classes=[0, 1]
        )

        # Start a fresh batch and remember that the model is now usable.
        self.X_buffer = []
        self.y_buffer = []
        self.trained = True

    def predict_one(self, input: types.InputData, output: types.OutputData):
        """Write a prediction for one sample into ``output``.

        Until the first batch has been fitted, a score of ``0.0`` is written
        instead of a predicted class.

        Args:
            input: Sample whose ``numeric`` attribute is the feature row.
            output: Receiver of the result, via ``set_predicted_class`` or
                ``set_score``.
        """
        if not self.trained:
            output.set_score(0.0)
            return

        # predict() is given a single-row 2-D array, hence reshape(1, -1).
        single_row = np.array(input.numeric).reshape(1, -1)
        predicted_class = self.model.predict(single_row)[0]
        output.set_predicted_class(predicted_class)
Now, we define a custom virtual environment with the correct list of dependencies, which the model will be using, and link our model to this venv.
# Set up the virtual environment named "my_batch_python_venv", listing the
# packages the model imports (scikit-learn and a numpy release below 2).
venv = tb.setup_venv("my_batch_python_venv", ["scikit-learn", "numpy<2"])
# Link the model class to this environment; the deployment step later refers
# to the class by its name together with the environment's name.
venv.add_python_class(MyBatchModel)
Model Deployment
Once the virtual environment is ready, we prepare the dataset to be used in this task and deploy the model with its features and labels.
# Model handle that points at the custom class (by name) and the environment
# it was registered in.
batch_model = tb.Python(class_name=MyBatchModel.__name__, venv_name=venv.name)

# Load the raw CSV files. reset_index() turns the default row index into an
# "index" column, which is used as the key field of both datasets below.
transactions_df = pd.read_csv("data/transactions.csv").reset_index()
labels_df = pd.read_csv("data/labels.csv").reset_index()

# Wrap the feature dataframe as a TurboML dataset keyed on "index".
# NOTE(review): upload=True presumably pushes the data to the platform -- confirm.
transactions = tb.PandasDataset(
    dataset_name="transactions_batch_python",
    key_field="index",
    dataframe=transactions_df,
    upload=True,
)
# Same for the labels; both datasets use the same key field name.
labels = tb.PandasDataset(
    dataset_name="labels_batch_python",
    key_field="index",
    dataframe=labels_df,
    upload=True,
)

# Columns of the transactions dataset passed to the model as numeric inputs.
numerical_fields = [
    "transactionAmount",
    "localHour",
    "isProxyIP",
    "digitalItemCount",
    "physicalItemCount",
]
features = transactions.get_input_fields(numerical_fields=numerical_fields)
# The "is_fraud" column of the labels dataset is the training target.
label = labels.get_label_field(label_field="is_fraud")

# Deploy the model under the name "batch_model" with its features and labels.
deployed_batch_model = batch_model.deploy("batch_model", input=features, labels=label)
Evaluation
import matplotlib.pyplot as plt
# Register the "WindowedRMSE" metric on the deployed model before querying it.
deployed_batch_model.add_metric("WindowedRMSE")
# NOTE(review): the variable names below say "auc", but the evaluations
# fetched here are for the WindowedRMSE metric.
model_auc_scores = deployed_batch_model.get_evaluation("WindowedRMSE")
# Plot the `metric` attribute of every evaluation, in the order returned.
plt.plot([model_auc_score.metric for model_auc_score in model_auc_scores])