# Source code for evp.evp (module header; originally scraped from rendered docs)

# SPDX-License-Identifier: MIT
# SPDX-FileCopyrightText: 2023 Felix Zailskas <felixzailskas@gmail.com>

import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight

from evp.predictors import (
    XGB,
    AdaBoost,
    Classifier,
    KNNClassifier,
    LightGBM,
    MerchantSizeByDPV,
    NaiveBayesClassifier,
    Predictors,
    RandomForest,
)
from logger import get_logger

log = get_logger()


SEED = 42


[docs] class EstimatedValuePredictor: lead_classifier: Classifier def __init__( self, data: pd.DataFrame, train_size=0.8, val_size=0.1, test_size=0.1, model_type: Predictors = Predictors.RandomForest, model_name: str = None, limit_classes: bool = False, selected_features: list = None, **model_args, ) -> None: self.df = data self.num_classes = 5 features = self.df.drop("MerchantSizeByDPV", axis=1) if selected_features is not None: features = features[selected_features] features = features.to_numpy() if limit_classes: self.num_classes = 3 self.df["new_labels"] = np.where( self.df["MerchantSizeByDPV"] == 0, 0, np.where(self.df["MerchantSizeByDPV"] == 4, 2, 1), ) self.df = self.df.drop("MerchantSizeByDPV", axis=1) self.df = self.df.rename(columns={"new_labels": "MerchantSizeByDPV"}) self.class_labels = self.df["MerchantSizeByDPV"].to_numpy() # split the data into training (80%), validation (10%), and testing (10%) sets self.X_train, X_temp, self.y_train, y_temp = train_test_split( features, self.class_labels, test_size=val_size + test_size, random_state=42 ) self.X_val, self.X_test, self.y_val, self.y_test = train_test_split( X_temp, y_temp, test_size=val_size / (val_size + test_size), random_state=42 ) self.model_type = model_type if model_type == Predictors.XGBoost: self.dtrain_xgb = xgb.DMatrix(self.X_train, label=self.y_train) self.dtest_xgb = xgb.DMatrix(self.X_test, label=self.y_test) # Class weights to tackle the class imbalance class_weights = class_weight.compute_class_weight( "balanced", classes=np.unique(self.y_train), y=self.y_train ) self.class_weight_dict = dict(zip(np.unique(self.y_train), class_weights)) match model_type: case Predictors.RandomForest: self.lead_classifier = RandomForest( model_name=model_name, class_weight=self.class_weight_dict, **model_args, ) case Predictors.XGBoost: self.lead_classifier = XGB( model_name=model_name, **model_args, ) case Predictors.NaiveBayes: self.lead_classifier = NaiveBayesClassifier( model_name=model_name, 
**model_args, ) case Predictors.KNN: self.lead_classifier = KNNClassifier( model_name=model_name, **model_args ) case Predictors.AdaBoost: self.lead_classifier = AdaBoost(model_name=model_name, **model_args) case Predictors.LightGBM: self.lead_classifier = LightGBM(model_name=model_name, **model_args) case default: log.error( f"Error: EVP initialized with unsupported model type {model_type}!" )
[docs] def train(self, epochs=1, batch_size=None) -> None: self.lead_classifier.train( self.X_train, self.y_train, self.X_test, self.y_test, epochs=epochs, batch_size=batch_size, )
[docs] def save_model(self) -> None: self.lead_classifier.save(num_classes=self.num_classes)
[docs] def predict(self, X) -> list[MerchantSizeByDPV]: # use the models to predict required values if ( self.lead_classifier.classification_report["epochs"] == "untrained" or self.lead_classifier.classification_report["weighted avg"]["f1-score"] == "untrained" ): log.error("Cannot make predictions with untrained model!") return [MerchantSizeByDPV.Invalid] if self.model_type == Predictors.XGBoost: merchant_size = self.lead_classifier.predict(self.dtest_xgb) else: merchant_size = self.lead_classifier.predict(X) return merchant_size