Source code for database.leads.repository

# SPDX-License-Identifier: MIT
# SPDX-FileCopyrightText: 2023 Sophie Heasman <sophieheasmann@gmail.com>

from abc import ABC, abstractmethod
from datetime import datetime


class Repository(ABC):
    """
    Abstract data access layer for the leads database.

    Concrete subclasses provide the storage locations (the upper-case
    abstract properties) and implement every abstract method below. The base
    class only supplies the dataframe accessors, the path getters and two
    timestamp helpers built on ``DATETIME_FORMAT``.
    """

    # Format shared by the timestamp helpers at the bottom of the class.
    DATETIME_FORMAT = "%Y-%m-%d %H:%M:%S"

    # Database paths for dataframe and reviews have to be set
    @property
    @abstractmethod
    def DF_INPUT(self):
        """
        Define database path to load dataframe
        """
        pass

    @property
    @abstractmethod
    def DF_OUTPUT(self):
        """
        Define database path to store dataframe
        """
        pass

    @property
    @abstractmethod
    def DF_HISTORICAL_OUTPUT(self):
        """
        Define database path to store historical enriched dataframe (used for preprocessing input)
        """
        pass

    @property
    @abstractmethod
    def REVIEWS(self):
        """
        Define database path to store reviews
        """
        pass

    @property
    @abstractmethod
    def SNAPSHOTS(self):
        """
        Define database path to store snapshots
        """
        pass

    @property
    @abstractmethod
    def GPT_RESULTS(self):
        """
        Define database path to store GPT operations
        """
        pass

    def __init__(self):
        """
        Initialise the DAL.

        Resets the ``df`` attribute to ``None`` and then always calls
        ``_download()``; there is no switch to skip the download.
        NOTE(review): implementations of ``_download()`` are presumably
        expected to populate ``self.df`` - confirm against the subclasses.
        """
        self.df = None
        self._download()

    def get_dataframe(self):
        """
        Return the object currently stored in the ``df`` attribute
        """
        return self.df

    def set_dataframe(self, df):
        """
        Replace the object stored in the ``df`` attribute

        :param df: New value for the ``df`` attribute
        """
        self.df = df

    def get_input_path(self):
        """
        Return the database path the dataframe is loaded from (``DF_INPUT``)
        """
        return self.DF_INPUT

    def get_enriched_data_path(self, historical=False):
        """
        Return the database path of the enriched dataframe

        :param historical: If truthy, return ``DF_HISTORICAL_OUTPUT``,
            otherwise return ``DF_OUTPUT``
        :return: The selected output path
        """
        if historical:
            return self.DF_HISTORICAL_OUTPUT
        return self.DF_OUTPUT

    @abstractmethod
    def _download(self):
        """
        Download database from specified DF path
        """
        pass

    @abstractmethod
    def save_dataframe(self):
        """
        Save dataframe in df attribute in chosen output location
        """
        pass

    @abstractmethod
    def save_prediction(self, df):
        """
        Save dataframe in df parameter in chosen output location
        """
        pass

    @abstractmethod
    def insert_data(self, data):
        """
        Insert new data into specified dataframe

        :param data: Data to be inserted (desired format must be checked)
        """
        pass

    @abstractmethod
    def create_snapshot(self, df, prefix, name):
        """
        Snapshot the current state of the dataframe

        :param df: Data to create a snapshot of
        :param prefix: Prefix for a group of snapshots belonging to a single pipeline run,
            used to identify snapshots when cleaning up after a pipeline run
        :param name: Name of the snapshot
        :return: None
        """
        pass

    @abstractmethod
    def clean_snapshots(self, prefix):
        """
        Clean up the snapshots after a pipeline ran successfully

        :param prefix: Prefix of the current pipeline run used to identify all snapshots to delete
        """
        pass

    @abstractmethod
    def save_review(self, review, place_id, force_refresh=False):
        """
        Upload review to specified review path

        :param review: json contents of the review to be uploaded
        :param place_id: Identifier of the place the review belongs to
        :param force_refresh: Defaults to False. NOTE(review): presumably
            forces an already stored review to be overwritten - the exact
            semantics are defined by the implementing class
        """
        pass

    @abstractmethod
    def fetch_review(self, place_id):
        """
        Fetch review for specified place_id

        :return: json contents of desired review
        """
        pass

    @abstractmethod
    def save_lookup_table(self, lookup_table: dict, step_name: str) -> None:
        """
        Save the lookup table for hashes for a given step
        """
        pass

    @abstractmethod
    def fetch_gpt_result(self, file_id, operation_name):
        """
        Fetches the GPT result for a given file ID and operation name.

        Args:
            file_id (str): The ID of the file.
            operation_name (str): The name of the GPT operation.

        Returns:
            The GPT result for the specified file ID and operation name.
        """
        pass

    @abstractmethod
    def load_lookup_table(self, step_name: str) -> dict:
        """
        Create or load the lookup table of hashes for a given step

        :return: lookup table as a dict
        """
        pass

    @abstractmethod
    def save_gpt_result(self, gpt_result, file_id, operation_name, force_refresh=False):
        """
        Saves the GPT result for a given file ID and operation name.

        Args:
            gpt_result (str): The GPT result to be saved.
            file_id (str): The ID of the file.
            operation_name (str): The name of the operation.
            force_refresh (bool, optional): Whether to force a refresh of the saved result. Defaults to False.
        """
        pass

    def _get_current_time_as_string(self) -> str:
        """
        Get the current time as a string

        The value comes from ``datetime.now()`` (naive local time) and is
        rendered with ``DATETIME_FORMAT``.
        """
        return datetime.now().strftime(self.DATETIME_FORMAT)

    def _convert_string_time_to_datetime(self, time) -> datetime:
        """
        Convert a string time to a datetime object

        :param time: Timestamp string laid out according to ``DATETIME_FORMAT``
        :return: The parsed (naive) datetime
        :raises ValueError: Raised by ``datetime.strptime`` when ``time``
            does not match ``DATETIME_FORMAT``
        """
        return datetime.strptime(time, self.DATETIME_FORMAT)

    @abstractmethod
    def load_ml_model(self, model_name: str):
        """
        Load a ML model from a file with a given name

        Args:
            model_name (str): File name
        """
        pass

    @abstractmethod
    def save_ml_model(self, model, model_name: str):
        """
        Save a given ML model to a file with a given name

        Args:
            model: Model to save
            model_name (str): File name
        """
        pass

    @abstractmethod
    def load_classification_report(self, model_name: str):
        """
        Load the classification report stored for a given model name

        Args:
            model_name (str): Model name that created the report
        """
        pass

    @abstractmethod
    def save_classification_report(self, report, model_name: str):
        """
        Save a given classification report to a file with a given name

        Args:
            report: The classification report to save
            model_name (str): Model name that created the report
        """
        pass

    @abstractmethod
    def get_preprocessed_data_path(self, historical: bool = True):
        """
        Returns the path for a preprocessed data file (either historical or current)

        Note that ``historical`` defaults to True here, whereas
        ``get_enriched_data_path`` defaults to False.
        """
        pass

    @abstractmethod
    def load_preprocessed_data(self, historical: bool = True):
        """
        Load the preprocessed data from the given file
        """
        pass