ONNX tutorial with Scikit-Learn
Install necessary libraries
import pandas as pd
import turboml as tb
!pip install onnx==1.14.1 scikit-learn skl2onnx
Scikit-Learn - Standard Model Training
The following code blocks implement a standard scikit-learn training workflow. This part is completely independent of TurboML.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx.helpers.onnx_helper import select_model_inputs_outputs
import matplotlib.pyplot as plt
# Read the two raw tables. reset_index() materialises the row position as an
# explicit "index" column, which is the join key used below.
transactions = pd.read_csv("data/transactions.csv")
transactions = transactions.reset_index()
labels = pd.read_csv("data/labels.csv")
labels = labels.reset_index()

# Right join on "index": every row of `labels` is kept in the result.
joined_df = transactions.merge(labels, on="index", how="right")
joined_df
# Columns of the joined table that are fed to the model as features.
numerical_fields = [
    "transactionAmount",
    "localHour",
    "isProxyIP",
    "digitalItemCount",
    "physicalItemCount",
]

# Target column, and the feature matrix restricted to the fields above.
y = joined_df["is_fraud"]
X = joined_df.drop(columns="is_fraud")[numerical_fields]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Train a random forest with its default settings (fit() returns the fitted
# estimator), then score it on the held-out split.
clf = RandomForestClassifier().fit(X_train, y_train)

y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)
Export model to ONNX format
The procedure for exporting a model to ONNX format depends on the framework. Tutorials for different frameworks can be found at https://github.com/onnx/tutorials#converting-to-onnx-format
# Declare a single float input named "float_input": the first dimension is
# left as None, the second has one column per training feature.
n_features = X_train.shape[1]
initial_type = [("float_input", FloatTensorType([None, n_features]))]

# Turn off the "zipmap" converter option for this classifier type
# (see the skl2onnx converter-options documentation for its effect).
conversion_options = {type(clf): {"zipmap": False}}
onx = convert_sklearn(clf, initial_types=initial_type, options=conversion_options)

# Restrict the exported graph to the "probabilities" output only.
onx = select_model_inputs_outputs(onx, outputs=["probabilities"])
Create an ONNX model with TurboML
Now that we've converted the model to ONNX format, we can deploy it with TurboML.
# Register the transactions DataFrame as a TurboML dataset keyed on "index".
# If creating/uploading it fails, fall back to attaching to the dataset by
# name only (presumably it already exists from an earlier run; TODO confirm).
# `except Exception` keeps this best-effort fallback but, unlike a bare
# `except:`, lets KeyboardInterrupt and SystemExit propagate.
try:
    transactions = tb.PandasDataset(
        dataset_name="transactions_onnx_sklearn",
        key_field="index",
        dataframe=transactions,
        upload=True,
    )
except Exception:
    transactions = tb.PandasDataset(dataset_name="transactions_onnx_sklearn")

# Same upload-or-attach pattern for the labels table.
try:
    labels = tb.PandasDataset(
        dataset_name="labels_onnx_sklearn",
        key_field="index",
        dataframe=labels,
        upload=True,
    )
except Exception:
    labels = tb.PandasDataset(dataset_name="labels_onnx_sklearn")
# Describe which dataset columns are the model inputs and which is the label.
features = transactions.get_input_fields(numerical_fields=numerical_fields)
label = labels.get_label_field(label_field="is_fraud")

# Store the serialized ONNX model under the name "randomforest", then
# reference it by that same name when building the TurboML ONNX model.
serialized_onnx = onx.SerializeToString()
tb.set_onnx_model("randomforest", serialized_onnx)
onnx_model = tb.ONNX(model_save_name="randomforest")
deployed_model = onnx_model.deploy("onnx_model", input=features, labels=label)

# Attach the WindowedAUC metric, fetch its evaluations, and plot the
# `metric` attribute of each returned entry.
deployed_model.add_metric("WindowedAUC")
model_auc_scores = deployed_model.get_evaluation("WindowedAUC")
auc_values = [score.metric for score in model_auc_scores]
plt.plot(auc_values)