# SPDX-License-Identifier: MIT
# SPDX-FileCopyrightText: 2023 Berkay Bozkurt <resitberkaybozkurt@gmail.com>
import pandas as pd
from pandas import DataFrame
from tqdm import tqdm
from bdc.steps.helpers import OffeneRegisterAPI, get_lead_hash_generator
from bdc.steps.step import Step
from logger import get_logger
log = get_logger()
[docs]
class SearchOffeneRegister(Step):
"""
Pipeline step of the sales lead qualification process that looks up
company-related data for each lead via the OffeneRegisterAPI.

Attributes:
    name (str): The name of the step.
    required_cols (list): Columns that must be present in the input DataFrame.
    added_cols (list): Columns this step adds to the input DataFrame.
    offeneregisterAPI (OffeneRegisterAPI): An instance of the OffeneRegisterAPI class.

Methods:
    verify(): Verifies if the step is ready to run.
    finish(): Logs a per-column fill-rate summary for the added columns.
    load_data(): Loads any required data for the step (currently a no-op).
    run(): Executes the step and returns the modified DataFrame.
    _extract_company_related_data(lead): Extracts company-related data for a given lead.

Added Columns:
    company_name (str): The name of the company from offeneregister.de
    company_objective (str): The objective of the company from offeneregister.de
    company_capital (float): The capital of the company from offeneregister.de
    company_capital_currency (str): The currency of the company capital from offeneregister.de
    company_address (str): The address of the company from offeneregister.de
        (NOTE: the corresponding ``added_cols`` entry is currently spelled
        "compan_address".)
"""
# Identifier of this step; also passed to the hash check in run().
name = "OffeneRegister"
# Input columns used to identify the person behind a lead.
required_cols = ["Last Name", "First Name"]
# Names of the columns that run() assigns and finish() summarises.
added_cols = [
"company_name",
"company_objective",
"company_capital",
"company_capital_currency",
# NOTE(review): spelled "compan_address" here but documented as
# "company_address" above -- looks like a typo. Confirm against
# _extract_company_related_data and downstream consumers before renaming.
"compan_address",
]
# API client created once when this assignment executes (not per call to run()).
offeneregisterAPI = OffeneRegisterAPI()
[docs]
def verify(self) -> bool:
    """
    Report whether this step is ready to run.

    No step-specific checks are added here; the decision is delegated
    entirely to the parent class's ``verify`` implementation.

    Returns:
        bool: Whatever the parent class's ``verify`` returns.
    """
    parent_verdict = super().verify()
    return parent_verdict
[docs]
def finish(self):
    """
    Log a summary of how well the added columns were populated.

    For every column in ``added_cols`` the percentage of rows in
    ``self.df`` holding a non-null value is written to the log.
    """
    log.info("Search Offeneregister finished with the summary below:")
    for column in self.added_cols:
        values = self.df[column]
        filled_rows = values.notna().sum()
        total_rows = len(values)
        # Share of non-null entries relative to all rows, as a percentage.
        filled_perc = filled_rows / total_rows * 100
        log.info(f"Percentage of {column} (of all): {filled_perc:.2f}%")
[docs]
def load_data(self):
    """
    Intentionally does nothing; this step loads no data here.

    Returns:
        None
    """
    return None
[docs]
def run(self) -> DataFrame:
    """
    Fill the ``added_cols`` columns of ``self.df`` row by row.

    Each row (lead) is handed to the lead hash generator's ``hash_check``
    together with ``_extract_company_related_data``, the step name and the
    list of added columns; the value it returns is wrapped in a
    ``pd.Series`` and assigned to the added columns.
    NOTE(review): ``hash_check`` presumably skips or reuses work for leads
    it has already seen -- confirm against the helper's implementation.

    Returns:
        DataFrame: ``self.df`` with the added columns assigned.
    """

    def enrich(lead):
        # The lead is passed twice: once as the data handed to the hash
        # check itself, once as the argument forwarded after the column list.
        hash_generator = get_lead_hash_generator()
        company_data = hash_generator.hash_check(
            lead,
            self._extract_company_related_data,
            self.name,
            self.added_cols,
            lead,
        )
        return pd.Series(company_data)

    # Registers ``progress_apply`` on pandas objects with a progress bar label.
    tqdm.pandas(desc="Running Search Offeneregister for company related data...")
    enriched = self.df.progress_apply(enrich, axis=1)
    self.df[self.added_cols] = enriched
    return self.df