From 65b7d3fda92b561287e0368df3b5ba018767dad1 Mon Sep 17 00:00:00 2001
From: Vy-X-S <mikhail.ocampo@sjsu.edu>
Date: Sun, 19 May 2024 22:52:55 -0700
Subject: [PATCH] Dynamo V2 stable - RESTRICT YouTube videos to 10 minutes or
 less - switched chain algorithm to Map Reduce - for a 10-minute video, expect
 a response in about 30 seconds - ToolResponse data is now returned as a list
 of objects

---
 app/api/router.py                            |  4 +-
 app/features/dynamo/core.py                  | 24 +++--
 app/features/dynamo/prompt/dynamo-prompt.txt | 24 +++--
 app/features/dynamo/prompt/examples.txt      | 60 +++++++++++++
 app/features/dynamo/tools.py                 | 93 ++++++++------------
 app/requirements.txt                         |  3 +-
 app/services/schemas.py                      |  2 +-
 7 files changed, 138 insertions(+), 72 deletions(-)
 create mode 100644 app/features/dynamo/prompt/examples.txt

diff --git a/app/api/router.py b/app/api/router.py
index f9cdb79e..ac6498f0 100644
--- a/app/api/router.py
+++ b/app/api/router.py
@@ -26,10 +26,12 @@ async def submit_tool( data: ToolRequest, _ = Depends(key_check)):
         logger.error(f"Inputs: {request_inputs_dict}")
         logger.error(f"Firestore inputs: {requested_tool['inputs']}")
         raise HTTPException(status_code=400, detail="Input validation failed")
+    else:
+        logger.info(f"Input validation passed")
 
     result = execute_tool(request_data.tool_id, request_inputs_dict)
     
-    return ToolResponse(data=[result])
+    return ToolResponse(data=result)
 
 @router.post("/chat", response_model=ChatResponse)
 async def chat( request: ChatRequest, _ = Depends(key_check) ):
diff --git a/app/features/dynamo/core.py b/app/features/dynamo/core.py
index 5087f443..408b30c7 100644
--- a/app/features/dynamo/core.py
+++ b/app/features/dynamo/core.py
@@ -1,9 +1,21 @@
-from features.dynamo.tools import find_key_concepts, retrieve_youtube_documents
-
+from features.dynamo.tools import summarize_transcript, generate_flashcards
+from services.logger import setup_logger
 # TODO: Implement the executor function's verbose param to downstream logic
 
+logger = setup_logger(__name__)
+
 def executor(youtube_url: str, verbose=False):
-    yt_documents = retrieve_youtube_documents(youtube_url)
-    concepts = find_key_concepts(yt_documents)
-    
-    return concepts
\ No newline at end of file
+    """Summarize the video at youtube_url and return its flashcards as a list of concept/definition dicts."""
+    summary = summarize_transcript(youtube_url, verbose=verbose)
+    flashcards = generate_flashcards(summary, verbose=verbose)
+
+    sanitized_flashcards = []  # keep only well-formed cards, stripped of any extra keys
+    for flashcard in flashcards:
+        if isinstance(flashcard, dict) and 'concept' in flashcard and 'definition' in flashcard:
+            sanitized_flashcards.append({
+                "concept": flashcard['concept'],
+                "definition": flashcard['definition']
+            })
+        else:
+            logger.warning(f"Malformed flashcard skipped: {flashcard}")
+    return sanitized_flashcards
\ No newline at end of file
diff --git a/app/features/dynamo/prompt/dynamo-prompt.txt b/app/features/dynamo/prompt/dynamo-prompt.txt
index 3b217ad5..808e9fd5 100644
--- a/app/features/dynamo/prompt/dynamo-prompt.txt
+++ b/app/features/dynamo/prompt/dynamo-prompt.txt
@@ -1,11 +1,17 @@
-You are a student a text for your exam. Consider the following transcript from a video and find the core idea or concept along with a definition. This will be used to create a flashcard to help you study. You must provide a definition for the concept. Follow the format instructions provided.
-                
-Transcript:
--------------------------------
-{text}
-
-Instructions:
--------------------------------
+You are a flashcard generation assistant designed to help students analyze a document and return a list of flashcards. Carefully consider the document and identify the key terms or concepts that help students better understand the topic. The documents cover a wide range of subjects, as each is a summarized transcript of a YouTube video; all information provided is educational, and the flashcards are meant to educate students. You only respond in the response formatting provided. Do not apply any markdown or extra characters to your response.
+
+Input:
+-----------------------------
+{summary}
+
+Examples:
+-----------------------------
+{examples}
+
+Formatting:
+-----------------------------
 {format_instructions}
 
-Respond only with JSON with the concept and definition.
\ No newline at end of file
+Respond only according to the format instructions. Each example above pairs an input with its best output.
+
+Output:
\ No newline at end of file
diff --git a/app/features/dynamo/prompt/examples.txt b/app/features/dynamo/prompt/examples.txt
new file mode 100644
index 00000000..58c3c1ae
--- /dev/null
+++ b/app/features/dynamo/prompt/examples.txt
@@ -0,0 +1,60 @@
+input:
+## Concise Summary of the provided document:
+
+**Large Language Models (LLMs)** are powerful AI tools trained on massive datasets to perform tasks like text generation, translation, and question answering. They can be specialized for specific domains through fine-tuning, making them versatile and adaptable. 
+
+**Key points:**
+
+* **Pre-trained and fine-tuned:** LLMs learn general knowledge from large datasets and specialize in specific tasks through additional training. 
+* **Prompt design:** Effective prompts are crucial for eliciting desired responses from LLMs.
+* **Domain knowledge:** Understanding the specific domain is essential for building and tuning LLMs.
+* **Parameter-efficient tuning methods:** This method allows for efficient customization of LLMs without altering the entire model.
+* **Vertex AI:** Provides tools for building, tuning, and deploying LLMs for specific tasks.
+* **Generative AI App Builder and PaLM API:** Tools for developers to build AI apps and experiment with LLMs.
+* **Model management tools:** Tools for training, deploying, and monitoring ML models.
+
+**This document provides a comprehensive overview of LLMs and related tools, highlighting their capabilities and potential applications.** 
+
+**Additional notes:**
+
+* The text emphasizes the importance of prompt design and domain knowledge for effective LLM usage.
+* It introduces cutting-edge technologies like PETM and Vertex AI, showcasing the rapid advancements in the field.
+* The document also provides practical resources for developers to build and deploy LLM-powered applications. 
+
+**Overall, this document is a valuable resource for anyone interested in understanding and utilizing LLMs and related technologies.**
+
+Output:
+[
+  {
+    "concept": "Large Language Models (LLMs)",
+    "definition": "Powerful AI tools trained on massive datasets to perform tasks like text generation, translation, and question answering."
+  },
+  {
+    "concept": "Pre-trained and fine-tuned",
+    "definition": "LLMs learn general knowledge from large datasets and specialize in specific tasks through additional training."
+  },
+  {
+    "concept": "Prompt design",
+    "definition": "Effective prompts are crucial for eliciting desired responses from LLMs."
+  },
+  {
+    "concept": "Domain knowledge",
+    "definition": "Understanding the specific domain is essential for building and tuning LLMs."
+  },
+  {
+    "concept": "Parameter-efficient tuning methods",
+    "definition": "This method allows for efficient customization of LLMs without altering the entire model."
+  },
+  {
+    "concept": "Vertex AI",
+    "definition": "Provides tools for building, tuning, and deploying LLMs for specific tasks."
+  },
+  {
+    "concept": "Generative AI App Builder and PaLM API",
+    "definition": "Tools for developers to build AI apps and experiment with LLMs."
+  },
+  {
+    "concept": "Model management tools",
+    "definition": "Tools for training, deploying, and monitoring ML models."
+  }
+]
diff --git a/app/features/dynamo/tools.py b/app/features/dynamo/tools.py
index 1ed63bbc..5ef10aa5 100644
--- a/app/features/dynamo/tools.py
+++ b/app/features/dynamo/tools.py
@@ -3,10 +3,12 @@
 from langchain.prompts import PromptTemplate
 from langchain_google_vertexai import VertexAI
 from langchain_core.output_parsers import JsonOutputParser
+from langchain.chains.summarize import load_summarize_chain
+from langchain_core.pydantic_v1 import BaseModel, Field
 from services.logger import setup_logger
-from pydantic import BaseModel, Field
 import os
 
+
 logger = setup_logger(__name__)
 
 # AI Model
@@ -22,78 +24,61 @@ def read_text_file(file_path):
     with open(absolute_file_path, 'r') as file:
         return file.read()
 
-
-# Youtube Loader # Chunk and Splitter
-def retrieve_youtube_documents(youtube_url: str):
-    """Retrieve youtbe transcript and create a list of documents"""
+# Summarize chain
+def summarize_transcript(youtube_url: str, max_video_length=600, verbose=False) -> str:
+    """Summarize a YouTube transcript with a map-reduce chain; raises ValueError if no documents are loaded or the video exceeds max_video_length seconds."""
     loader = YoutubeLoader.from_youtube_url(youtube_url, add_video_info=True)
     splitter = RecursiveCharacterTextSplitter(
         chunk_size = 1000,
         chunk_overlap = 0
     )
-    
     docs = loader.load()
+    split_docs = splitter.split_documents(docs)
     
-    length = docs[0].metadata["length"]
-    title = docs[0].metadata["title"]
-    
-    logger.info(f"Found video with title: {title} and length: {length}")
-    
-    # If docs empty, throw error
     if not docs:
         raise ValueError("No documents found")
-    
-    # if docs too long, throw error
-    if length > 1200: # 20 minutes
-        raise ValueError("Video too long")
-    
-    return splitter.split_documents(docs)
-        
 
-# Num sampler
-def find_key_concepts(documents: list, sample_size: int = 6):
-    """Iterate through all documents of group size N and find key concepts"""
-    if sample_size > len(documents):
-        sample_size = len(documents) // 5
-    
-    num_docs_per_group = len(documents) // sample_size + (len(documents) % sample_size > 0)
+    length = docs[0].metadata["length"]
+    title = docs[0].metadata["title"]
+    if length > max_video_length:
+        raise ValueError(f"Video is too long, please provide a video less than {max_video_length} seconds long")
+
+    if verbose:
+        logger.info(f"Found video with title: {title} and length: {length}")
+        logger.info(f"Splitting documents into {len(split_docs)} chunks")
     
-    if num_docs_per_group > 5:
-        num_docs_per_group = 6 # Default to 6 if too many documents
-        logger.info(f"Number of documents per group is too large. Defaulting to {num_docs_per_group}")
+    chain = load_summarize_chain(model, chain_type='map_reduce')
+    response = chain(split_docs)  # dict of chain outputs; the summary text is under "output_text"
     
-    groups = [documents[i:i + num_docs_per_group] for i in range(0, len(documents), num_docs_per_group)]
+    if response and verbose: logger.info("Successfully completed generating summary")
     
+    return response["output_text"]
+
+def generate_flashcards(summary: str, verbose=False) -> list:
+    """Generate flashcards from the map reduce summary; returns the parsed chain output, or [] if the chain call fails."""
     parser = JsonOutputParser(pydantic_object=Flashcard)
     
-    batch_concept = []
-    
-    logger.info(f"Beginning to process {len(groups)} groups")
+    if verbose: logger.info(f"Beginning to process summary")
     
     template = read_text_file("prompt/dynamo-prompt.txt")
+    examples = read_text_file("prompt/examples.txt")
+    
     prompt = PromptTemplate(
-                template = template,
-                input_variables=["text"],
-                partial_variables={"format_instructions": parser.get_format_instructions()}
-            )
-    # Create Chain
+        template=template,
+        input_variables=["summary", "examples"],
+        partial_variables={"format_instructions": parser.get_format_instructions()}
+    )
+    
     chain = prompt | model | parser
     
-    for group in groups:
-        group_content = ""
-        
-        for doc in group:
-            group_content += doc.page_content
-
-            # Run Chain
-            output_concept = chain.invoke({"text": group_content})
-            
-            logger.info(f"Output concept: {output_concept}\n")
-            
-            batch_concept.append(output_concept)
-            
-    return batch_concept
+    try:
+        response = chain.invoke({"summary": summary, "examples": examples})
+    except Exception as e:
+        logger.error(f"Failed to generate flashcards: {e}")
+        return []  # best-effort: the failure is logged and the caller receives an empty list
+    
+    return response
 
 class Flashcard(BaseModel):
-    concept: str = Field(description="The concept or term")
-    definition: str = Field(description="The summarized definition of the concept or term")
\ No newline at end of file
+    concept: str = Field(description="The concept of the flashcard")
+    definition: str = Field(description="The definition of the flashcard")
\ No newline at end of file
diff --git a/app/requirements.txt b/app/requirements.txt
index 74a5a04c..d4c5c6f2 100644
--- a/app/requirements.txt
+++ b/app/requirements.txt
@@ -3,7 +3,8 @@ uvicorn[standard]
 langchain
 langchain-core
 langchain-google-vertexai
-langchain_chroma
+langchain-chroma
+langchain-community
 google-cloud-secret-manager
 google-cloud-logging
 google-auth
diff --git a/app/services/schemas.py b/app/services/schemas.py
index 70668bcb..a60c31a7 100644
--- a/app/services/schemas.py
+++ b/app/services/schemas.py
@@ -47,7 +47,7 @@ class ChatResponse(BaseModel):
     data: List[Message]
 
 class ToolResponse(BaseModel):
-    data: List[Any]
+    data: Any
     
 class ChatMessage(BaseModel):
     role: str