Skip to content

Commit

Permalink
Merge pull request #383 from Eyobyb/llm_initialization_lifting
Browse files Browse the repository at this point in the history
 Update PromptReconstructor to include llm parameter
  • Loading branch information
amirfz authored Jun 12, 2024
2 parents ba9e631 + d900bc1 commit 0237604
Show file tree
Hide file tree
Showing 6 changed files with 28 additions and 31 deletions.
8 changes: 8 additions & 0 deletions src/apps/slackapp/slackapp/bolt_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ def file_event_handler(say, files, user_id, thread_ts, question):
user_id=user_id,
files=files,
token=cfg.SLACK_OAUTH_TOKEN,
llm=llm,
)
file_prompt_data = file_prompt.reconstruct_prompt_with_file()
if file_prompt_data["status"] == "success":
Expand Down Expand Up @@ -217,6 +218,12 @@ def event_test(client, say, event):
# will only be executed if the user hasn't exceeded the daily limit
# the daily limit is calculated based on the user's usage in a workspace
# users with a daily limitation can still be allowed to use it in a different workspace
llm = SherpaChatOpenAI(
openai_api_key=cfg.OPENAI_API_KEY,
user_id=user_id,
team_id=team_id,
temperature=cfg.TEMPERATURE,
)

if can_execute:
if "files" in event:
Expand All @@ -227,6 +234,7 @@ def event_test(client, say, event):
thread_ts=thread_ts,
user_id=combined_id,
question=question,
llm=llm,
)
if file_event["status"] == "error":
return
Expand Down
7 changes: 4 additions & 3 deletions src/sherpa_ai/scrape/file_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@


class QuestionWithFileHandler:
def __init__(self, question, files, token, user_id):
def __init__(self, question, files, token, user_id, team_id, llm):
"""
Initializes the QuestionWithFileHandler instance.
Expand All @@ -32,6 +32,7 @@ def __init__(self, question, files, token, user_id):
self.token = token
self.files = files
self.user_id = user_id
self.llm = llm

def reconstruct_prompt_with_file(self):
"""
Expand Down Expand Up @@ -124,7 +125,7 @@ def prompt_reconstruct(self, file_info, data=str):
question=self.question,
title=file_info["title"],
text_data=data,
user_id=self.user_id,
llm=self.llm,
)

while count_string_tokens(chunk_summary, "gpt-3.5-turbo") > 3000:
Expand All @@ -134,7 +135,7 @@ def prompt_reconstruct(self, file_info, data=str):
question=self.question,
title=file_info["title"],
text_data=chunk_summary,
user_id=self.user_id,
llm=self.llm,
)
result = question_with_file_reconstructor(
file_format=file_info["filetype"],
Expand Down
9 changes: 5 additions & 4 deletions src/sherpa_ai/scrape/prompt_reconstructor.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ class PromptReconstructor:
and rewrites the question to incorporate a summary of the scraped URLs.
"""

def __init__(self, question, slack_message):
def __init__(self, question, slack_message, llm):
"""
Initialize the PromptReconstructor with a question and a Slack message.
Expand All @@ -26,8 +26,9 @@ def __init__(self, question, slack_message):

self.question = question
self.slack_message = slack_message
self.llm = llm

def reconstruct_prompt(self, user_id=None):
def reconstruct_prompt(self):
"""
Reconstruct the prompt based on the question and the last Slack message.
Expand Down Expand Up @@ -68,7 +69,7 @@ def reconstruct_prompt(self, user_id=None):
link=link,
question=question,
text_data=scraped_data["data"],
user_id=user_id,
llm=self.llm,
)

while (
Expand All @@ -79,7 +80,7 @@ def reconstruct_prompt(self, user_id=None):
link=link,
question=question,
text_data=chunk_summary,
user_id=user_id,
llm=self.llm,
)

final_summary.append({"data": chunk_summary, "link": link})
Expand Down
29 changes: 7 additions & 22 deletions src/sherpa_ai/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,22 +121,11 @@ def count_string_tokens(string: str, model_name: str) -> int:
return len(encoding.encode(string))


def chunk_and_summarize(
text_data: str,
question: str,
link: str,
user_id: str = None,
):
llm = SherpaOpenAI(
temperature=cfg.TEMPERATURE,
openai_api_key=cfg.OPENAI_API_KEY,
user_id=user_id,
)

def chunk_and_summarize(text_data: str, question: str, link: str, llm):
instruction = (
"include any information that can be used to answer the "
"question '{question}' the given literal text is a data "
"from the link {link}. Do not directly answer the question itself"
f"question '{question}' the given literal text is a data "
f"from the link {link}. Do not directly answer the question itself"
)

text_splitter = TokenTextSplitter(chunk_size=3000, chunk_overlap=0)
Expand All @@ -161,19 +150,15 @@ def chunk_and_summarize_file(
question: str,
file_name: str,
file_format: str,
llm,
title: str = None,
user_id: str = None,
):
llm = SherpaOpenAI(
temperature=cfg.TEMPERATURE, openai_api_key=cfg.OPENAI_API_KEY, user_id=user_id
)

title = f",title {title} " if title is not None else ""

instruction = (
"include any information that can be used to answer the "
"question '{question}' the given literal text is a data "
"from the file named {file_name} {title} and file format {file_format} . Do not directly answer the question itself"
f"include any information that can be used to answer the "
f"question '{question}' the given literal text is a data "
f"from the file named {file_name} {title} and file format {file_format} . Do not directly answer the question itself"
)
text_splitter = TokenTextSplitter(chunk_size=3000, chunk_overlap=0)
chunked_text = text_splitter.split_text(text_data)
Expand Down
6 changes: 4 additions & 2 deletions src/tests/unit_tests/scrape/test_prompt_reconstructor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import pytest

from sherpa_ai.scrape.prompt_reconstructor import PromptReconstructor
from sherpa_ai.test_utils.llms import get_llm


# Assuming that 'your_module' contains the 'PromptReconstructor' class
Expand Down Expand Up @@ -33,11 +34,12 @@ def test_reconstruct_prompt_with_link_inside_succeeds(
mock_get_link_from_slack_client_conversation,
mock_scrape_with_url,
mock_chunk_and_summarize,
get_llm,
):
question = "Here's a <https://google.com>"
slack_message = ""

reconstructor = PromptReconstructor(question, slack_message)
llm = get_llm(__file__, test_reconstruct_prompt_with_link_inside_succeeds.__name__)
reconstructor = PromptReconstructor(question, slack_message, llm)
with patch(
"sherpa_ai.scrape.prompt_reconstructor.chunk_and_summarize",
return_value=mock_chunk_and_summarize,
Expand Down

0 comments on commit 0237604

Please sign in to comment.