# SPDX-License-Identifier: MIT
# SPDX-FileCopyrightText: 2023 Felix Zailskas <felixzailskas@gmail.com>
from abc import ABC, abstractmethod
from enum import Enum
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from database import get_database
from logger import get_logger
log = get_logger()
class Predictors(Enum):
    """Human-readable names of the predictor models available in this module."""

    RandomForest = "Random Forest"
    XGBoost = "XGBoost"
    NaiveBayes = "Naive Bayes"
    KNN = "KNN Classifier"
    AdaBoost = "AdaBoost"
    LightGBM = "LightGBM"
class MerchantSizeByDPV(Enum):
    """Merchant size buckets (by DPV), ordered from invalid through XS..XL.

    The integer values double as the class indices used by the classifiers.
    """

    Invalid = -1
    XS = 0
    S = 1
    M = 2
    L = 3
    XL = 4
class Classifier(ABC):
    """Abstract base class for merchant-size classifiers.

    Concrete subclasses wrap a specific model library and must implement
    `_init_new_model`, `predict`, and `train`. This base provides shared
    persistence (`save`/`load`) through the project database and a default
    training loop suitable for sklearn-style estimators.
    """

    @abstractmethod
    def __init__(self, model_name: str = None, *args, **kwargs) -> None:
        # Sentinel values so an untrained model can still be saved and reported.
        self.epochs = "untrained"
        self.f1_test = "untrained"
        self.classification_report = {
            "epochs": self.epochs,
            "weighted avg": {"f1-score": self.f1_test},
        }

    @abstractmethod
    def _init_new_model(self):
        """Create a fresh, untrained underlying model in `self.model`."""
        pass

    @abstractmethod
    def predict(self, X) -> list[MerchantSizeByDPV]:
        """Predict merchant-size classes for the feature matrix X."""
        pass

    @abstractmethod
    def train(
        self, X_train, y_train, X_test, y_test, epochs=1, batch_size=None
    ) -> None:
        """Fit `self.model` and record the weighted F1 and classification report.

        This default implementation fits once via `self.model.fit`; `epochs`
        is only recorded in the report and `batch_size` is unused here.
        """
        log.info(f"Training {type(self).__name__} for {epochs} epochs")
        self.model.fit(X_train, y_train)
        y_pred = self.model.predict(X_test)
        f1_test = f1_score(y_test, y_pred, average="weighted")
        log.info(f"F1 Score on Testing Set: {f1_test:.4f}")
        log.info("Computing classification report")
        self.classification_report = classification_report(
            y_test, y_pred, output_dict=True
        )
        self.classification_report["epochs"] = epochs
        self.epochs = epochs
        self.f1_test = f1_test

    def save(self, num_classes: int = 5) -> None:
        """Persist the model and its classification report to the database."""
        model_type = type(self).__name__
        try:
            f1_string = f"{self.f1_test:.4f}"
        # Fixed: bare `except:` swallowed everything (including KeyboardInterrupt).
        # Only the format failure for the "untrained" sentinel string is expected.
        except (TypeError, ValueError):
            f1_string = self.f1_test
        model_name = f"{model_type.lower()}_epochs({self.epochs})_f1({f1_string})_numclasses({num_classes})_model.pkl"
        get_database().save_ml_model(self.model, model_name)
        get_database().save_classification_report(
            self.classification_report, model_name
        )

    def load(self, model_name: str) -> None:
        """Load a persisted model and report from the database, if present."""
        loaded_model = get_database().load_ml_model(model_name)
        loaded_classification_report = get_database().load_classification_report(
            model_name
        )
        if loaded_model is not None:
            self.model = loaded_model
        if loaded_classification_report is not None:
            self.classification_report = loaded_classification_report
            self.epochs = self.classification_report["epochs"]
            self.f1_test = self.classification_report["weighted avg"]["f1-score"]
class RandomForest(Classifier):
    """Random-forest classifier for merchant size prediction.

    If `model_name` is given, attempts to load a persisted model from the
    database and falls back to a fresh untrained model on failure.
    """

    def __init__(
        self,
        model_name: str = None,
        n_estimators=100,
        class_weight=None,
        random_state=42,
    ) -> None:
        super().__init__()
        self.random_state = random_state
        self.model = None
        if model_name is not None:
            self.load(model_name)
            if self.model is None:
                log.info(
                    f"Loading model '{model_name}' failed. Initializing new untrained model!"
                )
                self._init_new_model(
                    n_estimators=n_estimators, class_weight=class_weight
                )
        else:
            self._init_new_model(n_estimators=n_estimators, class_weight=class_weight)

    def _init_new_model(self, n_estimators=100, class_weight=None):
        """Create a fresh RandomForestClassifier using the stored random state."""
        self.model = RandomForestClassifier(
            n_estimators=n_estimators,
            class_weight=class_weight,
            random_state=self.random_state,
        )

    # Fixed: annotation now matches the abstract base's list[MerchantSizeByDPV]
    # (was a bare MerchantSizeByDPV); the estimator returns one label per row.
    def predict(self, X) -> list[MerchantSizeByDPV]:
        """Predict class labels for the feature matrix X."""
        return self.model.predict(X)

    def train(
        self, X_train, y_train, X_test, y_test, epochs=1, batch_size=None
    ) -> None:
        """Train via the shared sklearn-style loop in `Classifier.train`."""
        super().train(
            X_train, y_train, X_test, y_test, epochs=epochs, batch_size=batch_size
        )
class NaiveBayesClassifier(Classifier):
    """Bernoulli naive-Bayes classifier for merchant size prediction."""

    def __init__(self, model_name: str = None, random_state=42) -> None:
        super().__init__()
        self.random_state = random_state
        self.model = None
        if model_name is None:
            # No saved model requested: start from a fresh estimator.
            self._init_new_model()
            return
        self.load(model_name)
        if self.model is None:
            log.info(
                f"Loading model '{model_name}' failed. Initializing new untrained model!"
            )
            self._init_new_model()

    def _init_new_model(self):
        # BernoulliNB takes no random state; self.random_state is kept for
        # API symmetry with the other classifiers.
        self.model = BernoulliNB()

    def predict(self, X) -> list[MerchantSizeByDPV]:
        """Predict class labels for the feature matrix X."""
        return self.model.predict(X)

    def train(
        self, X_train, y_train, X_test, y_test, epochs=1, batch_size=None
    ) -> None:
        """Delegate to the shared sklearn-style training loop."""
        super().train(
            X_train, y_train, X_test, y_test, epochs=epochs, batch_size=batch_size
        )
class KNNClassifier(Classifier):
    """K-nearest-neighbors classifier for merchant size prediction."""

    def __init__(
        self,
        model_name: str = None,
        random_state=42,
        n_neighbors=10,
        weights="distance",
    ) -> None:
        super().__init__()
        self.random_state = random_state
        self.n_neighbors = n_neighbors
        self.weights = weights
        self.model = None
        if model_name is None:
            # No saved model requested: start from a fresh estimator.
            self._init_new_model()
            return
        self.load(model_name)
        if self.model is None:
            log.info(
                f"Loading model '{model_name}' failed. Initializing new untrained model!"
            )
            self._init_new_model()

    def _init_new_model(self):
        # Neighbor count and weighting come from the values stored in __init__.
        self.model = KNeighborsClassifier(
            n_neighbors=self.n_neighbors, weights=self.weights
        )

    def predict(self, X) -> list[MerchantSizeByDPV]:
        """Predict class labels for the feature matrix X."""
        return self.model.predict(X)

    def train(
        self, X_train, y_train, X_test, y_test, epochs=1, batch_size=None
    ) -> None:
        """Delegate to the shared sklearn-style training loop."""
        super().train(
            X_train, y_train, X_test, y_test, epochs=epochs, batch_size=batch_size
        )
class XGB(Classifier):
    """XGBoost multi-class classifier for merchant size prediction.

    Unlike the sklearn-based classifiers, the booster itself is only created
    in `train` (via `xgb.train`); `_init_new_model` just prepares the
    training parameters.
    """

    def __init__(
        self,
        model_name: str = None,
        num_rounds=2000,
        random_state=42,
    ) -> None:
        super().__init__()
        self.random_state = random_state
        self.model = None
        self.num_rounds = num_rounds
        if model_name is not None:
            self.load(model_name)
            if self.model is None:
                log.info(
                    f"Loading model '{model_name}' failed. Initializing new untrained model!"
                )
                # Fixed: was `self._init_new_model(num_rounds == num_rounds)` —
                # a comparison (always True) passed positionally instead of the
                # intended keyword argument.
                self._init_new_model(num_rounds=num_rounds)
        else:
            self._init_new_model(num_rounds=num_rounds)

    def _init_new_model(self, num_rounds=1000):
        # NOTE(review): `num_rounds` is unused here; the round count used by
        # `train` is `self.num_rounds` set in `__init__`. Parameter kept for
        # backward compatibility.
        self.params = {
            "objective": "multi:softmax",
            "num_class": 5,
            "max_depth": 3,
            "learning_rate": 0.1,
            "eval_metric": "mlogloss",
        }

    def predict(self, X) -> list[MerchantSizeByDPV]:
        """Predict class labels for X.

        NOTE(review): a trained booster's predict expects an `xgb.DMatrix` —
        confirm callers convert their features accordingly.
        """
        return self.model.predict(X)

    def train(
        self, X_train, y_train, X_test, y_test, epochs=1, batch_size=None
    ) -> None:
        """Train a booster for `self.num_rounds` rounds and record metrics.

        `epochs` and `batch_size` are not used by `xgb.train`; `epochs` is
        only recorded in the classification report.
        """
        log.info("Training XGBoost")
        dtrain = xgb.DMatrix(X_train, label=y_train)
        dtest = xgb.DMatrix(X_test, label=y_test)
        self.model = xgb.train(self.params, dtrain, self.num_rounds)
        # inference
        y_pred = self.model.predict(dtest)
        # metrics
        accuracy = accuracy_score(y_test, y_pred)
        # Fixed: accuracy was computed but never used; surface it in the log.
        log.info(f"Accuracy on Testing Set: {accuracy:.4f}")
        f1_test = f1_score(y_test, y_pred, average="weighted")
        log.info(f"F1 Score on Testing Set: {f1_test:.4f}")
        log.info("Computing classification report")
        self.classification_report = classification_report(
            y_test, y_pred, output_dict=True
        )
        self.classification_report["epochs"] = epochs
        self.epochs = epochs
        self.f1_test = f1_test
class AdaBoost(Classifier):
    """AdaBoost ensemble over depth-unlimited decision trees."""

    def __init__(
        self,
        model_name: str = None,
        n_estimators=100,
        class_weight=None,
        random_state=42,
    ) -> None:
        super().__init__()
        self.random_state = random_state
        self.model = None
        if model_name is None:
            # No saved model requested: start from a fresh estimator.
            self._init_new_model(n_estimators=n_estimators, class_weight=class_weight)
            return
        self.load(model_name)
        if self.model is None:
            log.info(
                f"Loading model '{model_name}' failed. Initializing new untrained model!"
            )
            self._init_new_model(n_estimators=n_estimators, class_weight=class_weight)

    def _init_new_model(self, n_estimators=100, class_weight=None):
        # The class weighting applies to the base decision-tree estimator,
        # not to the AdaBoost wrapper itself.
        base_estimator = DecisionTreeClassifier(
            max_depth=None, class_weight=class_weight
        )
        self.model = AdaBoostClassifier(
            estimator=base_estimator,
            n_estimators=n_estimators,
            random_state=self.random_state,
        )

    def predict(self, X) -> MerchantSizeByDPV:
        """Predict class labels for the feature matrix X."""
        return self.model.predict(X)

    def train(
        self, X_train, y_train, X_test, y_test, epochs=1, batch_size=None
    ) -> None:
        """Delegate to the shared sklearn-style training loop."""
        super().train(
            X_train, y_train, X_test, y_test, epochs=epochs, batch_size=batch_size
        )
class LightGBM(Classifier):
    """LightGBM gradient-boosting classifier for merchant size prediction."""

    def __init__(
        self,
        model_name: str = None,
        num_leaves=1000,
        random_state=42,
    ) -> None:
        super().__init__()
        self.random_state = random_state
        self.model = None
        self.num_leaves = num_leaves
        if model_name is not None:
            self.load(model_name)
            if self.model is None:
                log.info(
                    f"Loading model '{model_name}' failed. Initializing new untrained model!"
                )
                # Fixed: was `self._init_new_model(num_leaves == num_leaves)` —
                # a comparison (always True) passed to a parameter that was
                # never read. `_init_new_model` uses `self.num_leaves` directly.
                self._init_new_model()
        else:
            self._init_new_model()

    def _init_new_model(self, num_rounds=1000):
        # NOTE(review): `num_rounds` is unused and appears misnamed (copied
        # from XGB); parameter kept for backward compatibility.
        self.params_lgb = {
            "boosting_type": "gbdt",
            "objective": "multiclass",
            "metric": "multi_logloss",
            "num_class": 5,
            "num_leaves": self.num_leaves,
            "max_depth": -1,
            "learning_rate": 0.05,
            "feature_fraction": 0.9,
        }
        self.model = lgb.LGBMClassifier(**self.params_lgb)

    def predict(self, X) -> list[MerchantSizeByDPV]:
        """Predict class labels for the feature matrix X."""
        return self.model.predict(X)

    def train(
        self, X_train, y_train, X_test, y_test, epochs=1, batch_size=None
    ) -> None:
        """Fit the LGBMClassifier once and record metrics.

        `epochs` and `batch_size` are not used by `fit`; `epochs` is only
        recorded in the classification report.
        """
        log.info("Training LightGBM")
        self.model.fit(X_train, y_train)
        # inference
        y_pred = self.model.predict(X_test)
        # metrics
        accuracy = accuracy_score(y_test, y_pred)
        # Fixed: accuracy was computed but never used; surface it in the log.
        log.info(f"Accuracy on Testing Set: {accuracy:.4f}")
        f1_test = f1_score(y_test, y_pred, average="weighted")
        log.info(f"F1 Score on Testing Set: {f1_test:.4f}")
        log.info("Computing classification report")
        self.classification_report = classification_report(
            y_test, y_pred, output_dict=True
        )
        self.classification_report["epochs"] = epochs
        self.epochs = epochs
        self.f1_test = f1_test