TF-IDF embedding example using gRPC Client
This example demonstrates how to use our gRPC API client to generate TF-IDF embeddings.
import turboml as tb
!pip install nltk grpcio
Start the gRPC server for TF-IDF embeddings from the Jupyter notebook
import pandas as pd
from utils.tfidf_grpc_server import serve
import threading
def run_server_in_background(url):
    """Run the TF-IDF gRPC server bound to ``url``.

    Thin wrapper around ``serve`` from ``utils.tfidf_grpc_server`` so the
    call can be handed to a ``threading.Thread`` as its target.

    Args:
        url: Address string passed straight through to ``serve``.
    """
    # Hand control to the server entry point.
    serve(url)
# Launch the gRPC server on its own thread so the rest of the notebook can
# keep executing. The thread is a daemon, so it will not keep the interpreter
# alive once the main program exits.
url = "0.0.0.0:50047"
server_thread = threading.Thread(
    target=run_server_in_background,
    kwargs={"url": url},
    daemon=True,
)
server_thread.start()
print("gRPC server is running in the background...")
Load text dataset
import re
import urllib.request
# Download the sample corpus; the context manager closes the HTTP response.
# decode() is called without arguments, i.e. with Python's default UTF-8.
with urllib.request.urlopen(
    "https://raw.githubusercontent.com/TurboML-Inc/colab-notebooks/refs/heads/main/data/tfidf_test_data.txt"
) as file:
    text = file.read().decode()

# Split into sentences at any whitespace character that directly follows "."
# or "?". The two negative lookbehinds veto the split after abbreviation-like
# text: <word char>.<word char><any char> (e.g. "e.g.") and
# <uppercase><lowercase>. (e.g. "Dr."). Each piece is then stripped and
# empty pieces are discarded.
sentences = [
    stripped
    for stripped in map(
        str.strip,
        re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", text),
    )
    if stripped
]
# One all-zero label per sentence.
# NOTE(review): presumably placeholder labels, since this example is about
# embeddings rather than classification -- confirm.
labels = [0] * len(sentences)
text_dict_test = {"text": sentences}
label_dict_test = {"labels": labels}

# reset_index() turns the default RangeIndex into an explicit "index" column,
# which is used below as the dataset key field. A single call per frame is
# enough: the frame already has a fresh RangeIndex afterwards, so a second
# reset_index(drop=True) pass would only produce an identical copy.
text_df_test = pd.DataFrame(text_dict_test).reset_index()
label_df_test = pd.DataFrame(label_dict_test).reset_index()

# NOTE(review): the train and test datasets are both built from the *_test
# frames, so prediction runs on the same rows that were used for learning --
# presumably intentional for this demo; confirm.
text_train = tb.LocalDataset.from_pd(df=text_df_test, key_field="index")
labels_train = tb.LocalDataset.from_pd(df=label_df_test, key_field="index")
text_test = tb.LocalDataset.from_pd(df=text_df_test, key_field="index")
labels_test = tb.LocalDataset.from_pd(df=label_df_test, key_field="index")

# Model inputs use the "text" column as a textual field; labels come from
# the "labels" column.
textual_fields = ["text"]
features_train = text_train.get_model_inputs(textual_fields=textual_fields)
targets_train = labels_train.get_model_labels(label_field="labels")
features_test = text_test.get_model_inputs(textual_fields=textual_fields)
targets_test = labels_test.get_model_labels(label_field="labels")
Use the TurboML client to send requests to the gRPC server
# Create a TurboML gRPC client. The server_url literal is the same address
# string that the background server thread was started with.
# NOTE(review): the units of connection_timeout / max_request_time are not
# visible here -- confirm against the GRPCClient documentation.
grpc_model = tb.GRPCClient(
server_url="0.0.0.0:50047",
connection_timeout=10000,
max_request_time=10000,
max_retries=1,
)
# learn() returns the object that predict() is then called on.
model_trained = grpc_model.learn(features_train, targets_train)
outputs_test = model_trained.predict(features_test)
# Bare expression: shown as the cell output when run in a notebook.
outputs_test