ML pipeline

# Get the latest lib from Rapidcanvas
# !pip install --extra-index-url=https://us-central1-python.pkg.dev/rapidcanvas-361003/pypi/simple utils==0.12dev0

from utils.rc.client.requests import Requests
from utils.rc.client.auth import AuthClient

from utils.rc.dtos.env import Env
from utils.rc.dtos.env import EnvType
from utils.rc.dtos.project import Project
from utils.rc.dtos.dataset import Dataset
from utils.rc.dtos.recipe import Recipe
from utils.rc.dtos.transform import Transform
from utils.rc.dtos.artifact import Artifact
from utils.rc.dtos.dataSource import DataSource
from utils.rc.dtos.dataSource import DataSourceType
from utils.rc.dtos.dataSource import RedisStorageConfig
from utils.rc.dtos.prediction_service import PredictionService
from utils.dtos.rc_prediction_service import RCPredictionService

from utils.rc.dtos.template_v2 import TemplateV2, TemplateTransformV2

import pandas as pd
import logging
from utils.utils.log_util import LogUtil
# Configure logging: INFO level, compact "LEVEL:message" layout.
LogUtil.set_basic_config(
    level=logging.INFO,
    format='%(levelname)s:%(message)s',
)

# To point the client at a specific host or to pass credentials explicitly,
# uncomment and fill in the two lines below.
# Requests.setRootHost("https://test.dev.rapidcanvas.net/api/")
# AuthClient.setToken(email='', password='')
AuthClient.setToken()

# Request a LARGE environment with no extra requirements, then poll its status.
# NOTE(review): 900 is presumably a timeout in seconds -- confirm against the client docs.
env = Env.createEnv(
    name="ml_pipeline_env",
    description="ml_pipeline_env",
    envType=EnvType.LARGE,
    requirements="",
    async_flag=True,
)
env.pollStatus(900)

Creating Project

# Settings for the demo project; it is attached to the environment created
# above through its id.
project_settings = {
    "name": "Example ML Pipeline",
    "description": "Testing python lib",
    "createEmpty": True,
    "envId": env.id,
}
project = Project.create(**project_settings)

# Bare expression: in a notebook cell this displays the new project's id.
project.id

Uploading the data

# Add the local Titanic CSV to the project; the same string is used for both
# the dataset name and its description.
titanic_dataset = project.addDataset(
    dataset_file_path="data/titanic.csv",
    dataset_name="titanic_dataset",
    dataset_description="titanic_dataset",
)

Building Model

In this step, we train a model after some data preparation and cleaning (one-hot encoding and filling missing values). You can see the details in the TrainTheModel notebook (inside the transforms folder).

# Name under which the trained model is stored; later steps look it up by
# this same name.
model_name = "rf_model"

# Recipe "build" takes the Titanic dataset as its only input.
recipe = project.addRecipe([titanic_dataset], name="build")

# Describe a custom template backed by the TrainTheModel notebook and publish
# the notebook file from the local transforms/ folder.
template = TemplateV2(
    name="TrainTheModel",
    description="TrainTheModel",
    project_id=project.id,
    source="CUSTOM",
    status="ACTIVE",
    tags=["ML"],
)
template_transform = TemplateTransformV2(
    type="python",
    params={"notebookName": "TrainTheModel.ipynb"},
)
template.base_transforms = [template_transform]
template.publish("transforms/TrainTheModel.ipynb")

# Transform instance bound to the published template; `variables` carries the
# values handed to the notebook.
transform = Transform()
transform.templateId = template.id
transform.name = "transform_1"
transform.variables = dict(
    inputDataset="titanic_dataset",
    target="Survived",
    modelName=model_name,
)

# Attach the transform and execute the recipe.
recipe.add_transform(transform)
recipe.run()

To make sure your model was created in the backend, you can check it with the command below.

# Ask the prediction service for the models it currently knows about.
all_models = PredictionService.get_all_models()

# Bare expression: in a notebook cell this displays the returned collection.
all_models

# Verify that the model trained above is present. An explicit check is used
# instead of `assert` so that it still runs when Python is started with -O
# (which strips assert statements); the exception type is unchanged and the
# message now names the model that is missing.
if model_name not in all_models:
    raise AssertionError(
        f"model {model_name!r} was not found among the registered models"
    )

Predicting Model

Using the model for batch prediction

# Recipe "predict" also takes the Titanic dataset as its only input.
predict_recipe = project.addRecipe([titanic_dataset], name="predict")

# Describe a custom template backed by the PredictMLModel notebook and publish
# the notebook file from the local transforms/ folder.
template = TemplateV2(
    name="PredictMLModel",
    description="PredictMLModel",
    project_id=project.id,
    source="CUSTOM",
    status="ACTIVE",
    tags=["Number", "datatype-long"],
)
template_transform = TemplateTransformV2(
    type="python",
    params={"notebookName": "PredictMLModel.ipynb"},
)
template.base_transforms = [template_transform]
template.publish("transforms/PredictMLModel.ipynb")

# Transform instance bound to the published template; it names the dataset to
# score and the model to score it with.
transform = Transform()
transform.templateId = template.id
transform.name = "transform"
transform.variables = dict(
    modelInput=titanic_dataset.name,
    modelName=model_name,
)

# Optional local preparation step, left disabled:
# predict_recipe.prepareForLocal(transform, contextId="PredictMLModel")

# Attach the transform and execute the recipe.
predict_recipe.add_transform(transform)
predict_recipe.run()

# Pick the dataset stored under the key 'output' among the recipe's children
# and fetch its data (displayed when this is the last line of a notebook cell).
output = predict_recipe.getChildrenDatasets()['output']

output.getData()

Building Features

We are skipping this part for now, but if your use case needs a feature store, we can provide one as well.

# online_data_store = DataSource.createDataSource(
#     "online-redis",
#     DataSourceType.REDIS_STORAGE,
#     {RedisStorageConfig.HOST: "10.41.1.3", RedisStorageConfig.PORT: "6379"}
# )

# recipe = project.addRecipe([titanic_dataset], name="feature_store_sync")

# template = TemplateV2(
#     name="FeatureStoreSync", description="FeatureStoreSync", project_id=project.id, source="CUSTOM", status="ACTIVE", tags=["Number", "datatype-long"]
# )
# template_transform = TemplateTransformV2(type = "python", params=dict(notebookName="FeatureStoreSync.ipynb"))
# template.base_transforms = [template_transform]
# template.publish("transforms/FeatureStoreSync.ipynb")

# transform = Transform()
# transform.templateId = template.id
# transform.name = "transform_1"
# transform.variables = {
#     "datasetName": titanic_dataset.name,
#     "columns": "Name,Sex,Fare",
#     "featureEntityName": "Passenger",
#     "featureEntityColumn": "PassengerId",
#     "dataSourceName": online_data_store.name
# }

# recipe.prepareForLocal(transform, "feature_store")

# recipe.add_transform(transform)

# recipe.run()

# output = recipe.getChildrenDatasets()['feature_sync_stats']

# output.getData()

Creating Predict Service

To expose the model as an API, you need to create a service; the code below does it for you.

service_name = "RandomForestModelService"

# Arguments for the service that exposes the model. `model_name` refers to the
# model created earlier in this pipeline; no environment id and no data source
# ids are supplied.
service_options = {
    "name": service_name,
    "description": "testing purposes",
    "model_name": model_name,
    "service_obj_path": "prediction_services/model.py",
    "env_id": None,
    "data_source_ids": None,
}
prediction_service = PredictionService.create_service(**service_options)

You can use the curl command shown above, or the code below, to make real-time predictions.

# Optional refresh of the service, left disabled:
# PredictionService.refresh_service(prediction_service.name)

# Reload the Titanic CSV to build a small sample request.
new = pd.read_csv("data/titanic.csv")

# First five rows as a nested mapping {column: {row_index: value}}; these are
# the pandas defaults for head() and to_dict(), spelled out explicitly.
t_json = new.head(5).to_dict(orient="dict")

# Send the sample to the service created above.
PredictionService.predict_by_service(prediction_service.name, t_json)