Pre-Deployment ML
Performance Improvements

Performance Improvements

Open In Colab (opens in a new tab)

In this notebook, we'll cover some examples of how model performance can be improved. The techniques covered are

  • Sampling for imbalanced learning
  • Bagging
  • Boosting
  • Continuous Model Selection using Bandits
import turboml as tb
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
# Register (or reuse) the fraud-detection feature and label streams on the
# TurboML platform; load_if_exists=True makes repeated runs idempotent.
transactions = tb.datasets.FraudDetectionDatasetFeatures().to_online(
    id="transactions", load_if_exists=True
)
labels = tb.datasets.FraudDetectionDatasetLabels().to_online(
    id="transaction_labels", load_if_exists=True
)
# Input schema shared by every model in this notebook: two numeric columns
# and three categorical columns from the transactions stream.
numerical_fields = [
    "transactionAmount",
    "localHour",
]
categorical_fields = [
    "digitalItemCount",
    "physicalItemCount",
    "isProxyIP",
]
features = transactions.get_model_inputs(
    numerical_fields=numerical_fields, categorical_fields=categorical_fields
)
# Supervision signal for online training: the binary is_fraud column.
label = labels.get_model_labels(label_field="is_fraud")

Now that we have our setup ready, let's first see the performance of a base HoeffdingTreeClassifier model.

# Baseline: a single Hoeffding tree trained online on the raw (skewed) stream.
htc_model = tb.HoeffdingTreeClassifier(n_classes=2)
deployed_model = htc_model.deploy("htc_classifier", input=features, labels=label)
labels_df = labels.preview_df
outputs = deployed_model.get_outputs()
len(outputs)
# One row per streamed prediction: the record key plus the predicted class.
records = [
    {labels.key_field: str(out["record"].key), "class": out["record"].predicted_class}
    for out in outputs
]
output_df = pd.DataFrame(records)
# Align predictions with ground truth on the transaction key before scoring.
joined_df = output_df.merge(labels_df, how="inner", on="transactionID")
true_labels = joined_df["is_fraud"]
real_outputs = joined_df["class"]
joined_df
roc_auc_score(true_labels, real_outputs)

Not bad. But can we improve it further? We haven't yet used the fact that the dataset is highly skewed.

Sampling for Imbalanced Learning

# Wrap the base tree in a random under-sampler so training sees a rebalanced
# (50/50) class distribution instead of the heavily skewed raw stream.
sampler_model = tb.RandomSampler(
    n_classes=2, desired_dist=[0.5, 0.5], sampling_method="under", base_model=htc_model
)
deployed_model = sampler_model.deploy(
    "undersampler_model", input=features, labels=label
)
outputs = deployed_model.get_outputs()
len(outputs)
# Collect (key, predicted class) pairs from the streamed outputs.
records = [
    {labels.key_field: str(out["record"].key), "class": out["record"].predicted_class}
    for out in outputs
]
output_df = pd.DataFrame(records)
# Join against the previewed labels to pair each prediction with its truth.
joined_df = output_df.merge(labels_df, how="inner", on="transactionID")
true_labels = joined_df["is_fraud"]
real_outputs = joined_df["class"]
joined_df
roc_auc_score(true_labels, real_outputs)

Bagging

# Bagging: an online leveraging-bagging ensemble built over the same base tree.
lbc_model = tb.LeveragingBaggingClassifier(n_classes=2, base_model=htc_model)
deployed_model = lbc_model.deploy("lbc_classifier", input=features, labels=label)
outputs = deployed_model.get_outputs()
len(outputs)
# Materialize the streamed predictions as a key/class DataFrame.
records = [
    {labels.key_field: str(out["record"].key), "class": out["record"].predicted_class}
    for out in outputs
]
output_df = pd.DataFrame(records)
# Inner-join with ground truth so only scored transactions are evaluated.
joined_df = output_df.merge(labels_df, how="inner", on="transactionID")
true_labels = joined_df["is_fraud"]
real_outputs = joined_df["class"]
joined_df
roc_auc_score(true_labels, real_outputs)

Boosting

# Boosting: an online AdaBoost ensemble over the same Hoeffding-tree base model.
abc_model = tb.AdaBoostClassifier(n_classes=2, base_model=htc_model)
deployed_model = abc_model.deploy("abc_classifier", input=features, labels=label)
outputs = deployed_model.get_outputs()
len(outputs)
# Turn the streamed outputs into rows of (transaction key, predicted class).
records = [
    {labels.key_field: str(out["record"].key), "class": out["record"].predicted_class}
    for out in outputs
]
output_df = pd.DataFrame(records)
# Pair predictions with their labels via the shared transaction key.
joined_df = output_df.merge(labels_df, how="inner", on="transactionID")
true_labels = joined_df["is_fraud"]
real_outputs = joined_df["class"]
joined_df
roc_auc_score(true_labels, real_outputs)

Continuous Model Selection with Bandits

# Continuous model selection: a bandit chooses among the three models above
# as the stream evolves, serving predictions from the current best arm.
bandit_model = tb.BanditModelSelection(base_models=[htc_model, lbc_model, abc_model])
deployed_model = bandit_model.deploy(
    "demo_classifier_bandit", input=features, labels=label
)
outputs = deployed_model.get_outputs()
len(outputs)
# Same evaluation recipe as before: key + predicted class per output record.
records = [
    {labels.key_field: str(out["record"].key), "class": out["record"].predicted_class}
    for out in outputs
]
output_df = pd.DataFrame(records)
# Join on the transaction key so each prediction meets its ground truth.
joined_df = output_df.merge(labels_df, how="inner", on="transactionID")
true_labels = joined_df["is_fraud"]
real_outputs = joined_df["class"]
joined_df
roc_auc_score(true_labels, real_outputs)