ML pipeline
# Get the latest lib from Rapidcanvas
# !pip install --extra-index-url=https://us-central1-python.pkg.dev/rapidcanvas-361003/pypi/simple utils==0.12dev0
from utils.rc.client.requests import Requests
from utils.rc.client.auth import AuthClient
from utils.rc.dtos.env import Env
from utils.rc.dtos.env import EnvType
from utils.rc.dtos.project import Project
from utils.rc.dtos.dataset import Dataset
from utils.rc.dtos.recipe import Recipe
from utils.rc.dtos.transform import Transform
from utils.rc.dtos.artifact import Artifact
from utils.rc.dtos.dataSource import DataSource
from utils.rc.dtos.dataSource import DataSourceType
from utils.rc.dtos.dataSource import RedisStorageConfig
from utils.rc.dtos.prediction_service import PredictionService
from utils.dtos.rc_prediction_service import RCPredictionService
from utils.rc.dtos.template_v2 import TemplateV2, TemplateTransformV2
import pandas as pd
import logging
from utils.utils.log_util import LogUtil
# Configure logging through LogUtil: "LEVEL:message" format, INFO level.
LogUtil.set_basic_config(
    format='%(levelname)s:%(message)s',
    level=logging.INFO,
)

# Optional overrides, left disabled (custom API host / explicit credentials):
#   Requests.setRootHost("https://test.dev.rapidcanvas.net/api/")
#   AuthClient.setToken(email='', password='')
# NOTE(review): called without arguments; presumably credentials are resolved
# from the surrounding environment -- confirm against AuthClient.
AuthClient.setToken()

# Create the execution environment; its id is passed to Project.create later.
env = Env.createEnv(
    name="ml_pipeline_env",
    description="ml_pipeline_env",
    envType=EnvType.LARGE,
    requirements="",
)
Creating Project
# Create an empty project attached to the environment built earlier (env.id).
project = Project.create(
    name="Example ML Pipeline",
    description="Testing python lib",
    createEmpty=True,
    envId=env.id,
)

# Bare expression: displays the new project's id when run as a notebook cell.
project.id
Uploading the data
# Add the Titanic CSV (path relative to the working directory) to the project
# as a dataset; the returned handle is the input of the recipes further down.
titanic_dataset = project.addDataset(
    dataset_name="titanic_dataset",
    dataset_description="titanic_dataset",
    dataset_file_path="data/titanic.csv",
)
Building Model
In this step, we train a model after some data preparation and cleaning (one-hot encoding and filling missing values). You can see the details in the TrainTheModel notebook (inside the transforms folder).
# Model name handed to the training notebook; the prediction steps reuse it.
model_name = "rf_model"

# Recipe that takes the uploaded dataset as input and holds the training step.
recipe = project.addRecipe([titanic_dataset], name="build")

# Describe the training notebook as a custom template and publish it.
template = TemplateV2(
    name="TrainTheModel",
    description="TrainTheModel",
    project_id=project.id,
    source="CUSTOM",
    status="ACTIVE",
    tags=["ML"],
)
template_transform = TemplateTransformV2(
    type="python",
    params={"notebookName": "TrainTheModel.ipynb"},
)
template.base_transforms = [template_transform]
template.publish("transforms/TrainTheModel.ipynb")

# Build a transform from the published template and set its variables.
transform = Transform()
transform.templateId = template.id
transform.name = "transform_1"
transform.variables = dict(
    inputDataset="titanic_dataset",
    target="Survived",
    modelName=model_name,
)

# Attach the transform to the recipe and execute it.
recipe.add_transform(transform)
recipe.run()
To make sure your model was created in the backend, you can check with the command below.
# List the models reported by the prediction service.
all_models = PredictionService.get_all_models()

# Bare expression: displays the listing when run as a notebook cell.
all_models

# Fail loudly if the training recipe did not produce the expected model.
# An explicit raise is used instead of `assert` so the check is not stripped
# when Python runs with -O; the exception type stays AssertionError.
if model_name not in all_models:
    raise AssertionError(
        "model {!r} was not found in the models returned by "
        "PredictionService.get_all_models()".format(model_name)
    )
Predicting Model
Using the model for batch prediction
# Recipe that takes the uploaded dataset as input and holds the prediction step.
predict_recipe = project.addRecipe([titanic_dataset], name="predict")

# Describe the prediction notebook as a custom template and publish it.
template = TemplateV2(
    name="PredictMLModel",
    description="PredictMLModel",
    project_id=project.id,
    source="CUSTOM",
    status="ACTIVE",
    tags=["Number", "datatype-long"],
)
template_transform = TemplateTransformV2(
    type="python",
    params={"notebookName": "PredictMLModel.ipynb"},
)
template.base_transforms = [template_transform]
template.publish("transforms/PredictMLModel.ipynb")

# Build a transform from the template: feed it the dataset and the model name.
transform = Transform()
transform.templateId = template.id
transform.name = "transform"
transform.variables = dict(
    modelInput=titanic_dataset.name,
    modelName=model_name,
)

# Disabled alternative kept for reference:
# predict_recipe.prepareForLocal(transform, contextId="PredictMLModel")
predict_recipe.add_transform(transform)
predict_recipe.run()

# Pick the recipe's child dataset registered under the key 'output'.
output = predict_recipe.getChildrenDatasets()["output"]

# Bare expression: displays the predictions when run as a notebook cell.
output.getData()
Building Features
We are skipping this part for now, but if your use case needs a feature store, we can provide one as well.
# online_data_store = DataSource.createDataSource(
# "online-redis",
# DataSourceType.REDIS_STORAGE,
# {RedisStorageConfig.HOST: "10.41.1.3", RedisStorageConfig.PORT: "6379"}
# )
# recipe = project.addRecipe([titanic_dataset], name="feature_store_sync")
# template = TemplateV2(
# name="FeatureStoreSync", description="FeatureStoreSync", project_id=project.id, source="CUSTOM", status="ACTIVE", tags=["Number", "datatype-long"]
# )
# template_transform = TemplateTransformV2(type = "python", params=dict(notebookName="FeatureStoreSync.ipynb"))
# template.base_transforms = [template_transform]
# template.publish("transforms/FeatureStoreSync.ipynb")
# transform = Transform()
# transform.templateId = template.id
# transform.name = "transform_1"
# transform.variables = {
# "datasetName": titanic_dataset.name,
# "columns": "Name,Sex,Fare",
# "featureEntityName": "Passenger",
# "featureEntityColumn": "PassengerId",
# "dataSourceName": online_data_store.name
# }
# recipe.prepareForLocal(transform, "feature_store")
# recipe.add_transform(transform)
# recipe.run()
# output = recipe.getChildrenDatasets()['feature_sync_stats']
# output.getData()
Creating Predict Service
To expose the model as an API, you need to create a service; the code below does that for you.
# Name under which the prediction service is created.
service_name = "RandomForestModelService"

# Create a service for the model trained earlier in this file (model_name),
# using the service implementation at prediction_services/model.py.
# No environment id or data-source ids are supplied.
prediction_service = PredictionService.create_service(
    name=service_name,
    description="testing purposes",
    model_name=model_name,
    service_obj_path="prediction_services/model.py",
    env_id=None,
    data_source_ids=None,
)
You can use the curl command shown above, or the code below, to make real-time predictions.
# Disabled step kept for reference:
# PredictionService.refresh_service(prediction_service.name)

# Load the Titanic CSV again and turn its first rows (DataFrame.head() default)
# into a plain dict to use as the request payload.
new = pd.read_csv("data/titanic.csv")
t_json = new.head().to_dict()

# Send the payload to the prediction service created earlier.
PredictionService.predict_by_service(
    prediction_service.name,
    t_json,
)