Source code for bdc.steps.gpt_summarizer

# SPDX-License-Identifier: MIT
# SPDX-FileCopyrightText: 2023 Berkay Bozkurt <resitberkaybozkurt@gmail.com>
# SPDX-FileCopyrightText: 2023 Sophie Heasman <sophieheasmann@gmail.com>


import time
from http import HTTPStatus

import openai
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas import DataFrame
from requests import RequestException
from tqdm import tqdm

from bdc.steps.helpers import get_lead_hash_generator
from bdc.steps.step import Step, StepError
from config import OPEN_AI_API_KEY
from database import get_database
from logger import get_logger

log = get_logger()


class GPTSummarizer(Step):
    """
    The GPTSummarizer step will attempt to download a business's website in raw HTML
    format and pass this information to OpenAI's GPT, which will then attempt to
    summarize the raw contents and extract valuable information for a salesperson.

    Attributes:
        name: Name of this step, used for logging
        added_cols: List of fields that will be added to the main dataframe by
            executing this step
        required_cols: List of fields that are required to be existent in the input
            dataframe before performing this step

    Added Columns:
        sales_person_summary (str): The summary of the company website for the
            salesperson using GPT
    """

    name = "GPT-Summarizer"
    model = "gpt-4"
    no_answer = "None"

    # System and user messages used for creating the company summary for a lead
    # using its website.
    system_message_for_website_summary = f"You are html summarizer, you being provided the companies' htmls and you answer with the summary of three to five sentences including all the necessary information which might be useful for salesperson. If no html then just answer with '{no_answer}'"
    user_message_for_website_summary = (
        "Give salesperson a summary using following html: {}"
    )

    extracted_col_name_website_summary = "sales_person_summary"
    gpt_required_fields = {
        "website": "google_places_detailed_website",
        "place_id": "google_places_place_id",
    }

    added_cols = [extracted_col_name_website_summary]
    required_cols = gpt_required_fields.values()

    client = None
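
    # Note: the no_answer sentinel above is embedded verbatim in the system prompt,
    # so GPT's "no html" reply can be detected by simple string comparison and
    # mapped to None in summarize_the_company_website below.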

    def load_data(self) -> None:
        self.client = openai.OpenAI(api_key=OPEN_AI_API_KEY)

    def verify(self) -> bool:
        if OPEN_AI_API_KEY is None:
            raise StepError("An API key for OpenAI is needed to run this step!")
        return super().verify()

    def run(self) -> DataFrame:
        tqdm.pandas(desc="Summarizing the website of leads")
        self.df[self.extracted_col_name_website_summary] = self.df.progress_apply(
            lambda lead: get_lead_hash_generator().hash_check(
                lead,
                self.summarize_the_company_website,
                self.name,
                self.extracted_col_name_website_summary,
                lead[self.gpt_required_fields["website"]],
                lead[self.gpt_required_fields["place_id"]],
            ),
            axis=1,
        )
        return self.df
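
    # Note: hash_check (from bdc.steps.helpers) wraps the summarizer call above;
    # judging by its usage here, it appears to skip recomputation when a lead's
    # inputs are unchanged and to call summarize_the_company_website otherwise.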

    def finish(self) -> None:
        pass

    def summarize_the_company_website(self, website, place_id):
        """
        Summarize the client website using GPT. Handles exceptions that might
        arise from the API call.
        """
        if website is None or pd.isna(website):
            return None

        # Reuse a previously stored GPT result for this lead if one exists.
        company_summary = get_database().fetch_gpt_result(place_id, self.name)
        if company_summary:
            return company_summary["result"]

        html = self.extract_the_raw_html_and_parse(website)
        if html is None:
            return None

        max_retries = 5  # Maximum number of retries
        retry_delay = 5  # Initial delay in seconds (5 seconds)

        for attempt in range(max_retries):
            try:
                log.info(f"Attempt {attempt + 1} of {max_retries}")
                response = self.client.chat.completions.create(
                    model=self.model,
                    messages=[
                        {
                            "role": "system",
                            "content": self.system_message_for_website_summary,
                        },
                        {
                            "role": "user",
                            "content": self.user_message_for_website_summary.format(
                                html
                            ),
                        },
                    ],
                    temperature=0,
                )

                # Check if the response contains the expected data.
                if response.choices[0].message.content:
                    company_summary = response.choices[0].message.content
                    if company_summary == self.no_answer:
                        return None
                    get_database().save_gpt_result(company_summary, place_id, self.name)
                    return company_summary
                log.info("No summary data found in the response.")
                return None
            except openai.RateLimitError:
                if attempt < max_retries - 1:
                    log.warning(
                        f"Rate limit exceeded, retrying in {retry_delay} seconds..."
                    )
                    time.sleep(retry_delay)
                    retry_delay *= 2  # Exponential backoff
                else:
                    log.error("Max retries reached. Unable to complete the request.")
                    return None
            except Exception as e:
                # openai.APITimeoutError, openai.APIConnectionError,
                # openai.BadRequestError, openai.AuthenticationError and
                # openai.PermissionDeniedError all land here; retrying these
                # is pointless, so give up on this lead.
                log.error(
                    f"An error occurred during summarizing the lead with GPT: {e}"
                )
                return None
        return None
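
    # The retry loop above, with max_retries = 5 and an initial retry_delay of
    # 5 seconds, sleeps 5s, 10s, 20s and 40s between successive attempts on
    # repeated RateLimitErrors: up to 75 seconds of waiting per lead in total.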

    def extract_the_raw_html_and_parse(self, url):
        try:
            # Send a request to the URL.
            response = requests.get(url)
        except RequestException as e:
            log.error(f"An error occurred during getting response from url: {e}")
            return None

        # Bail out if the request was not successful.
        if response.status_code != HTTPStatus.OK:
            log.error(f"Failed to fetch data. Status code: {response.status_code}")
            return None

        try:
            # Parse the response content and keep only the text of headings,
            # paragraphs and list items.
            soup = BeautifulSoup(response.content, "html.parser")

            texts = []
            for element in soup.find_all(["h1", "h2", "h3", "p", "li"]):
                texts.append(element.get_text(strip=True))
            return " ".join(texts)
        except UnicodeDecodeError as e:
            log.error(f"Failed to decode the response content: {e}")
            return None
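

# ---------------------------------------------------------------------------
# A minimal, self-contained sketch of the summarize-with-backoff pattern used
# by GPTSummarizer, runnable outside the pipeline. Reading the key from the
# OPENAI_API_KEY environment variable and the example HTML text are assumptions
# for illustration only; the step itself uses OPEN_AI_API_KEY from config and
# the text extracted from a lead's downloaded website.
if __name__ == "__main__":
    import os

    client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])  # assumed env var
    html_text = "Acme GmbH. We sell industrial pumps. Contact: sales@acme.example"

    delay = 5
    summary = None
    for attempt in range(5):
        try:
            completion = client.chat.completions.create(
                model="gpt-4",
                messages=[
                    {
                        "role": "system",
                        "content": GPTSummarizer.system_message_for_website_summary,
                    },
                    {
                        "role": "user",
                        "content": GPTSummarizer.user_message_for_website_summary.format(
                            html_text
                        ),
                    },
                ],
                temperature=0,
            )
            summary = completion.choices[0].message.content
            break
        except openai.RateLimitError:
            time.sleep(delay)
            delay *= 2  # same exponential backoff as in the step

    print(summary)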