Introduction


This tutorial explains the steps to train a logistic regression model, save the trained model using model lifecycle management, and run inference using the saved model.


Prerequisite

Create a folder "titanic" in the project space and upload the Titanic training and test data as train.csv and test.csv respectively.
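
If you prefer to stage the files from code instead of uploading them through the UI, one possible sketch is shown below. It assumes the Titanic CSVs are already available on local disk and that project_space_path returns a writable filesystem location (an assumption; this tutorial only uses it for reads).

import os
import shutil
from razor.api import project_space_path

# Hypothetical local copies of the Titanic data; adjust the paths to your environment
local_train = "/tmp/titanic/train.csv"
local_test = "/tmp/titanic/test.csv"

# Create the "titanic" folder in the project space and copy the files in as train.csv and test.csv
target_dir = project_space_path("titanic")
os.makedirs(target_dir, exist_ok=True)
shutil.copy(local_train, os.path.join(target_dir, "train.csv"))
shutil.copy(local_test, os.path.join(target_dir, "test.csv"))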


Solution


Create and execute training pipeline


1. Import the required packages from the SDK


from razor.marketplace.blocks.rzt.ML_Blocks import LogisticRegression
from razor.api import project_space_path
import razor.flow as rf

2. Define a custom block to read the training data from project space

import pandas as pd

@rf.block
class CsvReader:
    filename: str
    output: rf.SeriesOutput[pd.DataFrame]

    def run(self):
        file_path = project_space_path(self.filename)
        # Stream the CSV in chunks of 10 rows and emit each chunk downstream
        chunks = pd.read_csv(file_path, chunksize=10, nrows=None, delimiter=None)
        for df in chunks:
            self.output.put(df)
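
Passing chunksize to pandas.read_csv returns an iterator of DataFrames instead of a single DataFrame, which is why the block loops over the result and pushes each chunk to a SeriesOutput. A standalone illustration in plain pandas:

import io
import pandas as pd

csv_text = "a,b\n1,2\n3,4\n5,6\n"
# chunksize=2 yields DataFrames of at most 2 rows each
for chunk in pd.read_csv(io.StringIO(csv_text), chunksize=2):
    print(chunk.shape)  # (2, 2) then (1, 2)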

3. Define a custom block to remove rows with NaN values

@rf.block
class DfFilterNan():
    df_chunks: rf.SeriesInput[pd.DataFrame]
    output: rf.Output[pd.DataFrame]

    def run(self):
        concat_df = pd.DataFrame()
        for df in self.df_chunks:
            df.dropna(axis=0, inplace=True)
            concat_df = pd.concat([concat_df, df])
        self.output.put(concat_df)
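
dropna(axis=0) removes every row that contains at least one missing value, so the concatenated result is guaranteed to be NaN free. A quick standalone check:

import numpy as np
import pandas as pd

df = pd.DataFrame({"Age": [22.0, np.nan, 38.0], "Fare": [7.25, 71.28, np.nan]})
print(df.dropna(axis=0))  # only the first row survives; the other two each contain a NaN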

4. Define a custom block to convert categorical fields to numeric

@rf.block
class DfCategorical():
    columns: list
    df: pd.DataFrame
    output: rf.Output[pd.DataFrame]

    def run(self):
        for col in self.columns:
            # Convert the column to a pandas categorical type, then replace it with the integer category codes
            self.df[col] = self.df[col].astype('category')
            self.df[col] = self.df[col].cat.codes
        self.output.put(self.df)
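
astype('category') followed by cat.codes maps each distinct value of a column to an integer code (categories are ordered alphabetically by default), which is how string columns such as Sex, Cabin and Embarked become numeric. A small standalone illustration:

import pandas as pd

s = pd.Series(["male", "female", "male"]).astype("category")
print(list(s.cat.categories))  # ['female', 'male']
print(s.cat.codes.tolist())    # [1, 0, 1]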

5. Define a custom block to generate two NumPy arrays, one for the input features and one for the target, based on the column names provided

import numpy as np
@rf.block
class Get_data():
    x_columns: list
    y_column: list
    df: pd.DataFrame
    out_x: rf.Output[np.ndarray]
    out_y: rf.Output[np.ndarray]

    def run(self):
        if self.y_column is not None and len(self.y_column) != 0:
            x = self.df[self.x_columns].values
            # np.squeeze turns the (n, 1) target array into a flat (n,) vector
            y = np.squeeze(self.df[self.y_column].values)
            self.out_x.put(x)
            self.out_y.put(y)
        else:
            x = self.df[self.x_columns].values
            self.out_x.put(x)
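
Selecting a single column with a list, as in df[['Survived']], produces a 2-D array of shape (n, 1); np.squeeze drops the extra dimension so the target is the flat (n,) vector that scikit-learn style estimators expect. Standalone:

import numpy as np
import pandas as pd

df = pd.DataFrame({"Survived": [0, 1, 1]})
y = df[["Survived"]].values
print(y.shape)              # (3, 1)
print(np.squeeze(y).shape)  # (3,)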

6. Build and display the pipeline


atomic_csv_reader = CsvReader(filename="titanic/train.csv")

df_filter = DfFilterNan(df_chunks=atomic_csv_reader.output)

df_cat = DfCategorical(
    columns=["Sex", "Cabin", "Embarked"],
    df=df_filter.output
)

train_data = Get_data(
    x_columns=['PassengerId', 'Pclass', 'Sex', 'Age', 'SibSp', 'Fare', 'Cabin', 'Embarked'],
    y_column=['Survived'],
    df=df_cat.output
)

lr_model_train = LogisticRegression(
    operation="fit",
    x_data=train_data.out_x,
    y_data=train_data.out_y,
    path="lr_m1.sav",
    save=True
)

pipeline = rf.Pipeline(targets=[lr_model_train])
pipeline.show()


7. Run the pipeline

pipeline.execute()


Run inference and evaluate using the saved model


1. Define a block to save the predictions to project space

import pandas as pd
import numpy as np

@rf.block
class NumpyToCsv():
    numpy_array: np.ndarray
    output_path: str

    def run(self):
        pd.DataFrame(self.numpy_array, columns=['Predictions']).to_csv(project_space_path(self.output_path))

2. Create a pipeline to make predictions using the model trained and saved in the section above

lr_model_predict = LogisticRegression(
    operation="predict",
    x_data=train_data.out_x,
    attribute="classes_",
    path="lr_m1.sav",
    load=True
)

csv_writer = NumpyToCsv(
    output_path="lr_pred_1.csv",
    numpy_array=lr_model_predict.predictions
)

predict_pipeline = rf.Pipeline(targets=[csv_writer])

predict_pipeline.show()

3. Execute the pipeline

predict_pipeline.execute()


4. Evaluate the trained model

lr_model_conf = LogisticRegression(
    operation="evaluate",
    metric_function="confusion_matrix",
    test_x_data=train_data.out_x,
    test_y_data=train_data.out_y,
    path="lr_m1.sav"
)
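
As with the earlier steps, the evaluation block runs only when it is wrapped in a pipeline and executed. A minimal sketch following the same pattern used above (how the resulting confusion matrix is surfaced depends on the block's outputs, which are not shown in this tutorial):

eval_pipeline = rf.Pipeline(targets=[lr_model_conf])
eval_pipeline.show()
eval_pipeline.execute()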