Release: Error Handling, Answer Parsing, Configuration, Dependencies #44

Merged · 1 commit · Jan 15, 2024
2 changes: 1 addition & 1 deletion .azure-template.env
@@ -6,7 +6,7 @@ EMBEDDINGS_DEPLOYMENT_NAME=embedding
RABBITMQ_HOST=localhost
RABBITMQ_USER=admin
RABBITMQ_PASSWORD=super-secure-pass
-AI_MODEL_TEMPERATURE=0.3
+AI_MODEL_TEMPERATURE=0.4
AI_SOURCE_WEBSITE=https://www.alkemio.org
AI_SOURCE_WEBSITE2=https://welcome.alkem.io
AI_LOCAL_PATH=~/alkemio/data
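The bumped temperature is read back at runtime as an environment variable (see `ai_adapter.py` below, which passes `os.environ["AI_MODEL_TEMPERATURE"]` to the chat model). A minimal sketch of how such a template value is typically loaded and sanity-checked, assuming `python-dotenv` is used to load the `.env` file — the repository may wire this up differently:

```python
# Hypothetical loader sketch; AI_MODEL_TEMPERATURE mirrors .azure-template.env.
import os
from dotenv import load_dotenv  # pip install python-dotenv

load_dotenv()  # reads a .env file derived from .azure-template.env
temperature = float(os.environ.get("AI_MODEL_TEMPERATURE", "0.4"))
assert 0.0 <= temperature <= 2.0, "OpenAI-style samplers accept 0.0-2.0"
```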
34 changes: 26 additions & 8 deletions Dockerfile
@@ -1,18 +1,19 @@
# Use an official Python runtime as a parent image
-FROM python:3.11-slim-bookworm
+ARG PYTHON_VERSION=3.11
+FROM python:${PYTHON_VERSION}-slim-bullseye as builder

# Set the working directory in the container to /app
WORKDIR /app

-ARG GO_VERSION=1.21.5
-ARG HUGO_VERSION=0.121.1
-ARG ARCHITECTURE=amd64
+ARG GO_VERSION=1.21.6
+ARG HUGO_VERSION=0.121.2
+ARG TARGETARCH

# install git, go and hugo
RUN apt update && apt upgrade -y && apt install -y git wget
-RUN wget https://go.dev/dl/go${GO_VERSION}.linux-${ARCHITECTURE}.tar.gz && tar -C /usr/local -xzf go${GO_VERSION}.linux-${ARCHITECTURE}.tar.gz
+RUN wget https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz && tar -C /usr/local -xzf go${GO_VERSION}.linux-${TARGETARCH}.tar.gz
RUN export PATH=$PATH:/usr/local/go/bin:/usr/local && go version
-RUN wget https://github.com/gohugoio/hugo/releases/download/v${HUGO_VERSION}/hugo_extended_${HUGO_VERSION}_linux-${ARCHITECTURE}.tar.gz && tar -C /usr/local -xzf hugo_extended_${HUGO_VERSION}_linux-${ARCHITECTURE}.tar.gz && ls -al /usr/local
+RUN wget https://github.com/gohugoio/hugo/releases/download/v${HUGO_VERSION}/hugo_extended_${HUGO_VERSION}_linux-${TARGETARCH}.tar.gz && tar -C /usr/local -xzf hugo_extended_${HUGO_VERSION}_linux-${TARGETARCH}.tar.gz && ls -al /usr/local
RUN /usr/local/hugo version

# Install Poetry
@@ -24,5 +25,22 @@ COPY . /app
# Use Poetry to install dependencies
RUN poetry config virtualenvs.create true && poetry install --no-interaction --no-ansi

-# Run guidance-engine.py when the container launches
-CMD ["poetry", "run", "python", "guidance_engine.py"]
+# Start a new, final stage
+FROM python:${PYTHON_VERSION}-slim-bullseye
+
+WORKDIR /app
+
+# Install git and Poetry in the final stage
+RUN apt update && apt install -y git && pip install poetry
+
+# Copy the compiled app, Hugo executable, Go, and the virtual environment from the previous stage
+COPY --from=builder /app /app
+COPY --from=builder /usr/local/hugo /usr/local/hugo
+COPY --from=builder /usr/local/go /usr/local/go
+COPY --from=builder /root/.cache/pypoetry/virtualenvs /root/.cache/pypoetry/virtualenvs
+
+# Add Hugo and Go to the PATH
+ENV PATH="/usr/local/hugo:/usr/local/go/bin:${PATH}"
+
+# Run guidance_engine.py when the container launches
+CMD ["poetry", "run", "python", "guidance_engine.py"]
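Replacing the hard-coded `ARCHITECTURE=amd64` with the BuildKit-provided `TARGETARCH` build argument lets one Dockerfile serve both amd64 and arm64; with buildx, a command along the lines of `docker buildx build --platform linux/amd64,linux/arm64 .` would populate it automatically (shown as an illustration, not taken from this repository's tooling). The new final stage copies only the app, the Hugo and Go toolchains, and Poetry's virtualenv cache out of the builder, so build-only packages such as wget stay out of the runtime image.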
91 changes: 58 additions & 33 deletions ai_adapter.py
@@ -1,9 +1,9 @@
-from langchain.embeddings import AzureOpenAIEmbeddings
+from langchain_openai import AzureOpenAIEmbeddings
from langchain.vectorstores import FAISS
-from langchain.llms import AzureOpenAI
+from langchain_openai import AzureOpenAI
-from langchain.prompts.prompt import PromptTemplate
-from langchain.prompts import ChatPromptTemplate
-from langchain.chat_models import AzureChatOpenAI
+from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder, PromptTemplate
+from langchain_openai import AzureChatOpenAI
from langchain.schema import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain.schema import format_document
@@ -27,7 +27,7 @@

# Create handlers
c_handler = logging.StreamHandler(io.TextIOWrapper(sys.stdout.buffer, line_buffering=True))
-f_handler = logging.FileHandler(os.path.join(os.path.expanduser(local_path),'app.log'))
+f_handler = logging.FileHandler(os.path.join(os.path.expanduser(local_path), 'app.log'))

c_handler.setLevel(level=getattr(logging, LOG_LEVEL))
f_handler.setLevel(logging.WARNING)
@@ -45,7 +45,7 @@
logger.info(f"log level {os.path.basename(__file__)}: {LOG_LEVEL}")

# verbose output for LLMs
-if LOG_LEVEL=="DEBUG":
+if LOG_LEVEL == "DEBUG":
verbose_models = True
else:
verbose_models = False
@@ -75,40 +75,56 @@ def get_language_by_code(language_code):
return language_mapping.get(language_code, 'English')


chat_template = """
You are a friendly conversational agent. Use the following step-by-step instructions to respond to user inputs.
1 - The text provided in the context delimited by triple pluses may contain questions. Remove those questions from the context.
2 - Provide an up to three paragraghs answer that is accurate and exthausive, taking into account the context delimited by triple pluses.
If the answer cannot be found within the context, write 'I could not find an answer to your question'.
3 - Only return the answer from step 2, do not show any code or additional information.
4 - If the question is in a different language than English, translate the question to English before answering.
5 - Answer the question in the {language} language.
chat_system_template = """
You are a friendly and talkative conversational agent, tasked with answering questions about Alkemio.
Use the following step-by-step instructions to respond to user inputs:
1 - If the question is in a different language than English, translate the question to English before answering.
2 - The text provided in the context delimited by triple pluses is retrieved from the Alkemio website is not part of the conversation with the user.
3 - Provide an answer of 250 words or less that is professional, engaging, accurate and exthausive, based on the context delimited by triple pluses. \
If the answer cannot be found within the context, write 'Hmm, I am not sure'.
4 - Only return the answer from step 3, do not show any code or additional information.
5 - Answer the question in the {language} language.
+++
Context:
context:
{context}
+++
-Question: {question}
"""

condense_question_template = """"
-Combine the chat history delimited by triple pluses and follow-up question into a single standalone question that does justice to the follow-up question. Do only return the standalone question, do not return any other information.
+Create a single sentence standalone query based on the human input, using the following step-by-step instructions:
+1. If the human input is expressing a sentiment, delete and ignore the chat history delimited by triple pluses. \
+Then, return the human input containing the sentiment as the standalone query. Do NOT in any way respond to the human input, \
+simply repeat it.
+2. Otherwise, combine the chat history delimited by triple pluses and human input into a single standalone query that does \
+justice to the human input.
+3. Only return the standalone query, do not return any other information. Never return the chat history delimited by triple pluses.
+++
-Chat History: {chat_history}
+chat history:
+{chat_history}
+++
-Follow-up question: {question}
+Human input: {question}
---
-Standalone question:
+Standalone query:
"""


condense_question_prompt = PromptTemplate.from_template(condense_question_template)

-chat_prompt = ChatPromptTemplate.from_template(chat_template)
+chat_prompt = ChatPromptTemplate.from_messages(
+    [
+        ("system", chat_system_template),
+        MessagesPlaceholder(variable_name="chat_history"),
+        ("human", "{question}"),
+    ]
+)
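For readers unfamiliar with `MessagesPlaceholder`, here is a small self-contained sketch (illustrative strings only, not the repository's actual prompts) of how the new message-based prompt expands at invocation time — prior turns are spliced in where the placeholder sits:

```python
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import AIMessage, HumanMessage

prompt = ChatPromptTemplate.from_messages([
    ("system", "Answer in {language}. Context: {context}"),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{question}"),
])

messages = prompt.format_messages(
    language="English",
    context="Alkemio is an open-source collaboration platform.",
    chat_history=[HumanMessage(content="Hi"), AIMessage(content="Hello!")],
    question="What is Alkemio?",
)
print([m.type for m in messages])  # ['system', 'human', 'ai', 'human']
```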


generic_llm = AzureOpenAI(azure_deployment=os.environ["LLM_DEPLOYMENT_NAME"],
-temperature=0, verbose=verbose_models)
+temperature=0, verbose=verbose_models)

embeddings = AzureOpenAIEmbeddings(
azure_deployment=config['embeddings_deployment_name'],
@@ -121,7 +137,7 @@ def load_vector_db():
Purpose:
Load the data into the vector database.
Args:
Returns:
vectorstore: the vectorstore object
"""
@@ -130,25 +146,35 @@ def load_vector_db():
logger.info(f"The file vector database is present")
else:
logger.info(f"The file vector database is not present, ingesting")
-def_ingest.ingest(config['source_website'], config['website_repo'], website_generated_path, website_source_path, config['source_website2'], config['website_repo2'], website_generated_path2, website_source_path2)
+def_ingest.ingest(
+    config['source_website'],
+    config['website_repo'],
+    website_generated_path,
+    website_source_path,
+    config['source_website2'],
+    config['website_repo2'],
+    website_generated_path2,
+    website_source_path2)

return FAISS.load_local(vectordb_path, embeddings)


vectorstore = load_vector_db()

retriever = vectorstore.as_retriever(search_type="similarity_score_threshold", search_kwargs={"score_threshold": .5})

chat_llm = AzureChatOpenAI(azure_deployment=os.environ["LLM_DEPLOYMENT_NAME"],
-temperature=os.environ["AI_MODEL_TEMPERATURE"],
-max_tokens=max_token_limit, verbose=verbose_models)
+temperature=os.environ["AI_MODEL_TEMPERATURE"],
+max_tokens=max_token_limit, verbose=verbose_models)

condense_llm = AzureChatOpenAI(azure_deployment=os.environ["LLM_DEPLOYMENT_NAME"],
-temperature=0,
-verbose=verbose_models)
+temperature=0,
+verbose=verbose_models)

def format_docs(docs):
return "\n\n".join(doc.page_content for doc in docs)


DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")

def _combine_documents(
Expand All @@ -157,11 +183,10 @@ def _combine_documents(
doc_strings = [format_document(doc, document_prompt) for doc in docs]
return document_separator.join(doc_strings)


async def query_chain(question, language, chat_history):

# check whether the chat history is empty
-if chat_history.buffer == []:
+if chat_history.buffer == []:
first_call = True
else:
first_call = False
@@ -176,11 +201,12 @@ async def query_chain(question, language, chat_history):
# This adds a "memory" key to the input object
loaded_memory = RunnablePassthrough.assign(
chat_history=RunnableLambda(chat_history.load_memory_variables) | itemgetter("history"),
-)
+)

logger.debug(f"loaded memory {loaded_memory}\n")
logger.debug(f"chat history {chat_history}\n")


# Now we calculate the standalone question if the chat_history is not empty
standalone_question = {
"standalone_question": {
@@ -208,10 +234,10 @@ async def query_chain(question, language, chat_history):
"question": lambda x: x["standalone_question"],
}


# Now we construct the inputs for the final prompt
final_inputs = {
"context": lambda x: _combine_documents(x["docs"]),
"chat_history" : lambda x: chat_history.buffer,
"question": itemgetter("question"),
"language": lambda x: language['language'],
}
@@ -222,7 +248,6 @@ async def query_chain(question, language, chat_history):
"docs": itemgetter("docs"),
}


# And now we put it all together in a 'RunnableBranch', so we only invoke the rephrasing part when the chat history is not empty
final_chain = RunnableBranch(
(lambda x: x["first_call"], loaded_memory | direct_question | retrieved_documents | answer),
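The chain wiring above only invokes the condense/rephrase step when there is chat history. A stripped-down sketch of that `RunnableBranch` pattern, with stand-in lambdas rather than the repository's real prompt and LLM runnables:

```python
from langchain_core.runnables import RunnableBranch, RunnableLambda

# Stand-ins: the real chain pipes prompts into Azure OpenAI models.
condense = RunnableLambda(lambda x: {"question": f"standalone({x['question']})"})
passthrough = RunnableLambda(lambda x: {"question": x["question"]})

chain = RunnableBranch(
    # First call: no history to fold in, so skip the rephrasing round-trip.
    (lambda x: x["first_call"], passthrough),
    # Otherwise: condense history + follow-up into a standalone query.
    condense,
)

print(chain.invoke({"first_call": True, "question": "What is Alkemio?"}))
# {'question': 'What is Alkemio?'}
print(chain.invoke({"first_call": False, "question": "And pricing?"}))
# {'question': 'standalone(And pricing?)'}
```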
31 changes: 17 additions & 14 deletions def_ingest.py
@@ -3,7 +3,7 @@
import sys
import io
import re
-from langchain.embeddings import AzureOpenAIEmbeddings
+from langchain_openai import AzureOpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.callbacks import get_openai_callback
@@ -26,7 +26,7 @@

# Create handlers
c_handler = logging.StreamHandler(io.TextIOWrapper(sys.stdout.buffer, line_buffering=True))
-f_handler = logging.FileHandler(os.path.join(os.path.expanduser(local_path),'app.log'))
+f_handler = logging.FileHandler(os.path.join(os.path.expanduser(local_path), 'app.log'))

c_handler.setLevel(level=getattr(logging, LOG_LEVEL))
f_handler.setLevel(logging.WARNING)
@@ -102,13 +102,13 @@ def read_and_parse_html(local_source_path, source_website_url, website_generated
"""
# Transform
bs_transformer = BeautifulSoupTransformer()

# Get all links from the sitemaps
logger.info(f"generating html: {local_source_path}, {source_website_url}")
full_sitemap_list = extract_urls_from_sitemap(website_generated_path)

-exclusion_list = [os.sep + 'tag' + os.sep,
-os.sep + 'category' + os.sep,
+exclusion_list = [os.sep + 'tag' + os.sep,
+os.sep + 'category' + os.sep,
os.sep + 'help' + os.sep + 'index']

data = []
@@ -126,7 +126,10 @@ def read_and_parse_html(local_source_path, source_website_url, website_generated
continue
document = loader.load()
# note h5 and h6 tags for our website contain a lot of irrelevant metadata
-doc_transformed = bs_transformer.transform_documents(document, tags_to_extract=["p", "article", "title", "h1"], unwanted_tags=["h5", "h6"], remove_lines=True)
+doc_transformed = bs_transformer.transform_documents(
+    document, tags_to_extract=[
+        "p", "article", "title", "h1"], unwanted_tags=[
+        "h5", "h6"], remove_lines=True)
body_text = doc_transformed[0]

# first remove duplicate spaces, then remove duplicate '\n\n', then remove duplicate '\n \n '
@@ -139,11 +142,11 @@ def read_and_parse_html(local_source_path, source_website_url, website_generated
data.append(body_text)
included_files.append(file_name)
else:
-#logger.info(f"document too small, not adding: {body_text.page_content}\n")
+# logger.info(f"document too small, not adding: {body_text.page_content}\n")
excluded_files.append(file_name)
except Exception as e:
-# logger.error(f"...unable to process file: {str(e)}")
-error_files.append(file_name)
+# logger.error(f"...unable to process file: {str(e)}")
+error_files.append(file_name)

logger.info(f"==> Returning {len(included_files)} files; {len(error_files)} gave errors + {len(excluded_files)} files were skipped")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_size/5)
@@ -169,7 +172,7 @@ def clone_and_generate(website_repo, destination_path, source_path):
destination_path: path to directory containing generated html files
source_path: path to directory containing the checked out github repo
Returns:
"""

logger.info(f"About to generate website: {website_repo}")
Expand All @@ -179,7 +182,7 @@ def clone_and_generate(website_repo, destination_path, source_path):
os.chdir(source_path)
git_switch_command = ['git', 'switch', branch]
git_directory = os.path.join(source_path, '.git')
-# Check if the repository already exists in the source_path
+# Check if the repository already exists in the source_path
if os.path.exists(git_directory):
# Repository exists, perform a git pull to update it
logger.info(f"...git directory exists, pulling in {os.getcwd()}")
@@ -191,7 +194,7 @@ def clone_and_generate(website_repo, destination_path, source_path):
if (result_switch.returncode != 0):
logger.error(f"Unable to switch {website_repo} repository: {result_switch.stderr}")
else:
logger.info(f"...git directory does not exist, cloning in {os.getcwd()}")# Repository doesn't exist, perform a git clone
logger.info(f"...git directory does not exist, cloning in {os.getcwd()}") # Repository doesn't exist, perform a git clone
clone_command = ['git', 'clone', "https://" + github_user + ":" + github_pat + "@" + website_repo, source_path]
result_clone = subprocess.run(clone_command, capture_output=True, text=True)
if (result_clone.returncode != 0):
@@ -255,5 +258,5 @@ def create_vector_db(source_website_url, source_website_url2) -> None:

# only execute if this is the main program run (so not imported)
if __name__ == "__main__":
-ingest(config['source_website'], config['website_repo'], website_generated_path, website_source_path,
-config['source_website2'], config['website_repo2'], website_generated_path2, website_source_path2)
+ingest(config['source_website'], config['website_repo'], website_generated_path, website_source_path,
+config['source_website2'], config['website_repo2'], website_generated_path2, website_source_path2)
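For context, the splitting-and-indexing step that `read_and_parse_html` feeds into looks roughly like this — a sketch using the same imports as the module, where the chunk size, deployment name, and save path are placeholders rather than the repository's configured values:

```python
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_openai import AzureOpenAIEmbeddings

chunk_size = 3000  # placeholder; the module reads its own config
splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_size // 5,  # mirrors the chunk_size/5 overlap above
)
data = [Document(page_content="Alkemio is an open-source platform ...")]
chunks = splitter.split_documents(data)

# Requires AZURE_OPENAI_ENDPOINT / AZURE_OPENAI_API_KEY in the environment.
embeddings = AzureOpenAIEmbeddings(azure_deployment="embedding")
FAISS.from_documents(chunks, embeddings).save_local("vectordb")
```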