Feature Engineering - Python UDFs
import turboml as tb
# Slice the first 100 rows of the fraud-detection feature dataset and register
# them as an online dataset under the id "udf_transactions".
# NOTE(review): load_if_exists=True presumably reuses a dataset already
# registered under that id instead of failing — confirm against the TurboML docs.
transactions = tb.datasets.FraudDetectionDatasetFeatures()[:100].to_online(
    id="udf_transactions",
    load_if_exists=True,
)
Simple User Defined Function
To create a user defined function (UDF), first write the function, together with the imports it uses, as the contents of a separate Python file (supplied below as a string); the function should process its input and return a value. The example below is a simple function that takes a value and returns its sine.
# Source text of the UDF module; it is handed to create_udf_features() as
# function_file_contents. The text must itself be valid Python, so the body of
# myfunction is indented inside the string.
myfunction_contents = """
import numpy as np


def myfunction(x):
    return np.sin(x)
"""
User Defined Functions - Multiple Input Example
The user defined function above is very simple. We can also create a more complicated function that takes multiple inputs and performs, for example, string processing.
# Source text of a two-argument UDF: returns 1 when the two strings are equal
# ignoring case, otherwise 0. The function body is indented inside the string
# so that the text parses as valid Python.
my_complex_function_contents = """
def my_complex_function(x, y):
    if x.lower() == y.lower():
        return 1
    else:
        return 0
"""
Rich User Defined Functions
%pip install psycopg_pool psycopg['binary'] psycopg2-binary
# Source text of a class-based ("rich") UDF. The class opens a PostgreSQL
# connection pool once in __init__ and, for every call to func(), looks up
# "model_length" in the r2dt_models table by id, returning 0 when no row
# matches. The class body is indented inside the string so that the text
# parses as valid Python.
my_rich_function_contents = """
from turboml.common.feature_engineering import TurboMLScalarFunction
from psycopg_pool import ConnectionPool


class PostgresLookup(TurboMLScalarFunction):
    def __init__(self, user, password, host, port, dbname):
        conninfo = (
            f"user={user} password={password} host={host} port={port} dbname={dbname}"
        )
        self.connPool = ConnectionPool(conninfo=conninfo)

    def func(self, index: str):
        with self.connPool.connection() as risingwaveConn:
            with risingwaveConn.cursor() as cur:
                query = 'SELECT "model_length" FROM r2dt_models WHERE id = %s'
                cur.execute(query, (index,))
                result = cur.fetchone()
        return result[0] if result else 0
"""
We can create a rich UDF and materialize it.
# Define the class-based UDF feature "lookup_feature" from the PostgresLookup
# source, then materialize it.
# The initializer argument lists follow PostgresLookup.__init__'s parameter
# order: user, password, host, port, dbname. Dev and prod use the same values.
# NOTE(review): presumably the "index" column is what gets passed to
# PostgresLookup.func — confirm against the TurboML docs.
transactions.feature_engineering.create_rich_udf_features(
    class_name="PostgresLookup",
    class_file_contents=my_rich_function_contents,
    libraries=["psycopg_pool", "psycopg[binary]", "psycopg2-binary"],
    function_name="lookup",
    argument_names=["index"],
    new_feature_name="lookup_feature",
    dev_initializer_arguments=[
        "reader",
        "NWDMCE5xdipIjRrp",
        "hh-pgsql-public.ebi.ac.uk",
        "5432",
        "pfmegrnargs",
    ],
    prod_initializer_arguments=[
        "reader",
        "NWDMCE5xdipIjRrp",
        "hh-pgsql-public.ebi.ac.uk",
        "5432",
        "pfmegrnargs",
    ],
)
transactions.feature_engineering.materialize_features(["lookup_feature"])
Feature Engineering using User Defined Functions (UDF)
Make sure the specified libraries are pip-installable and are named accordingly. For example, if the UDF uses a sklearn function, the library to install should be "scikit-learn" (not "sklearn").
# Define the feature "sine_of_amount": the UDF myfunction (source held in
# myfunction_contents, which needs numpy) applied to the transactionAmount column.
transactions.feature_engineering.create_udf_features(
    function_name="myfunction",
    function_file_contents=myfunction_contents,
    libraries=["numpy"],
    argument_names=["transactionAmount"],
    new_feature_name="sine_of_amount",
)
# Define the feature "transaction_location_overlap": my_complex_function
# applied to the ipCountryCode and paymentBillingCountryCode columns.
# No extra libraries are listed because the UDF source has no imports.
transactions.feature_engineering.create_udf_features(
    function_name="my_complex_function",
    function_file_contents=my_complex_function_contents,
    libraries=[],
    argument_names=["ipCountryCode", "paymentBillingCountryCode"],
    new_feature_name="transaction_location_overlap",
)
# Inspect the locally defined features.
# NOTE(review): presumably this includes features not yet materialized — confirm.
transactions.feature_engineering.get_local_features()
# Materialize the two UDF features defined above.
transactions.feature_engineering.materialize_features(
["sine_of_amount", "transaction_location_overlap"]
)
# Inspect the materialized features.
transactions.feature_engineering.get_materialized_features()