Commit ca7ee83

Merge pull request #44 from alkem-io/develop

Release: Error Handling, Answer Parsing, Configuration, Dependencies

valentinyanakiev authored Jan 15, 2024
2 parents 71971a2 + 550ffc3 · commit ca7ee83
Showing 7 changed files with 179 additions and 321 deletions.
2 changes: 1 addition & 1 deletion .azure-template.env
@@ -6,7 +6,7 @@ EMBEDDINGS_DEPLOYMENT_NAME=embedding
 RABBITMQ_HOST=localhost
 RABBITMQ_USER=admin
 RABBITMQ_PASSWORD=super-secure-pass
-AI_MODEL_TEMPERATURE=0.3
+AI_MODEL_TEMPERATURE=0.4
 AI_SOURCE_WEBSITE=https://www.alkemio.org
 AI_SOURCE_WEBSITE2=https://welcome.alkem.io
 AI_LOCAL_PATH=~/alkemio/data
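
Note: AI_MODEL_TEMPERATURE is consumed further down this diff in ai_adapter.py, where the raw environment string is handed to the chat model. A minimal sketch of reading it more defensively (hypothetical standalone snippet, not code from this commit):

import os

# The env var arrives as a string; coerce it and sanity-check the range.
temperature = float(os.environ.get("AI_MODEL_TEMPERATURE", "0.4"))
assert 0.0 <= temperature <= 2.0, "OpenAI-style temperatures range from 0 to 2"
print(f"chat model temperature: {temperature}")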
34 changes: 26 additions & 8 deletions Dockerfile
@@ -1,18 +1,19 @@
 # Use an official Python runtime as a parent image
-FROM python:3.11-slim-bookworm
+ARG PYTHON_VERSION=3.11
+FROM python:${PYTHON_VERSION}-slim-bullseye as builder

 # Set the working directory in the container to /app
 WORKDIR /app

-ARG GO_VERSION=1.21.5
-ARG HUGO_VERSION=0.121.1
-ARG ARCHITECTURE=amd64
+ARG GO_VERSION=1.21.6
+ARG HUGO_VERSION=0.121.2
+ARG TARGETARCH

 # install git, go and hugo
 RUN apt update && apt upgrade -y && apt install -y git wget
-RUN wget https://go.dev/dl/go${GO_VERSION}.linux-${ARCHITECTURE}.tar.gz && tar -C /usr/local -xzf go${GO_VERSION}.linux-${ARCHITECTURE}.tar.gz
+RUN wget https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz && tar -C /usr/local -xzf go${GO_VERSION}.linux-${TARGETARCH}.tar.gz
 RUN export PATH=$PATH:/usr/local/go/bin:/usr/local && go version
-RUN wget https://github.com/gohugoio/hugo/releases/download/v${HUGO_VERSION}/hugo_extended_${HUGO_VERSION}_linux-${ARCHITECTURE}.tar.gz && tar -C /usr/local -xzf hugo_extended_${HUGO_VERSION}_linux-${ARCHITECTURE}.tar.gz && ls -al /usr/local
+RUN wget https://github.com/gohugoio/hugo/releases/download/v${HUGO_VERSION}/hugo_extended_${HUGO_VERSION}_linux-${TARGETARCH}.tar.gz && tar -C /usr/local -xzf hugo_extended_${HUGO_VERSION}_linux-${TARGETARCH}.tar.gz && ls -al /usr/local
 RUN /usr/local/hugo version

 # Install Poetry
@@ -24,5 +25,22 @@ COPY . /app
 # Use Poetry to install dependencies
 RUN poetry config virtualenvs.create true && poetry install --no-interaction --no-ansi

-# Run guidance-engine.py when the container launches
-CMD ["poetry", "run", "python", "guidance_engine.py"]
+# Start a new, final stage
+FROM python:${PYTHON_VERSION}-slim-bullseye
+
+WORKDIR /app
+
+# Install git and Poetry in the final stage
+RUN apt update && apt install -y git && pip install poetry
+
+# Copy the compiled app, Hugo executable, Go, and the virtual environment from the previous stage
+COPY --from=builder /app /app
+COPY --from=builder /usr/local/hugo /usr/local/hugo
+COPY --from=builder /usr/local/go /usr/local/go
+COPY --from=builder /root/.cache/pypoetry/virtualenvs /root/.cache/pypoetry/virtualenvs
+
+# Add Hugo and Go to the PATH
+ENV PATH="/usr/local/hugo:/usr/local/go/bin:${PATH}"
+
+# Run guidance_engine.py when the container launches
+CMD ["poetry", "run", "python", "guidance_engine.py"]
91 changes: 58 additions & 33 deletions ai_adapter.py
@@ -1,9 +1,9 @@
-from langchain.embeddings import AzureOpenAIEmbeddings
+from langchain_openai import AzureOpenAIEmbeddings
 from langchain.vectorstores import FAISS
-from langchain.llms import AzureOpenAI
+from langchain_openai import AzureOpenAI
-from langchain.prompts.prompt import PromptTemplate
-from langchain.prompts import ChatPromptTemplate
-from langchain.chat_models import AzureChatOpenAI
+from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder, PromptTemplate
+from langchain_openai import AzureChatOpenAI
 from langchain.schema import StrOutputParser
 from langchain_core.runnables import RunnableLambda, RunnablePassthrough
 from langchain.schema import format_document
@@ -27,7 +27,7 @@

 # Create handlers
 c_handler = logging.StreamHandler(io.TextIOWrapper(sys.stdout.buffer, line_buffering=True))
-f_handler = logging.FileHandler(os.path.join(os.path.expanduser(local_path),'app.log'))
+f_handler = logging.FileHandler(os.path.join(os.path.expanduser(local_path), 'app.log'))

 c_handler.setLevel(level=getattr(logging, LOG_LEVEL))
 f_handler.setLevel(logging.WARNING)
@@ -45,7 +45,7 @@
 logger.info(f"log level {os.path.basename(__file__)}: {LOG_LEVEL}")

 # verbose output for LLMs
-if LOG_LEVEL=="DEBUG":
+if LOG_LEVEL == "DEBUG":
     verbose_models = True
 else:
     verbose_models = False
@@ -75,40 +75,56 @@ def get_language_by_code(language_code):
     return language_mapping.get(language_code, 'English')


-chat_template = """
-You are a friendly conversational agent. Use the following step-by-step instructions to respond to user inputs.
-1 - The text provided in the context delimited by triple pluses may contain questions. Remove those questions from the context.
-2 - Provide an up to three paragraghs answer that is accurate and exthausive, taking into account the context delimited by triple pluses.
-If the answer cannot be found within the context, write 'I could not find an answer to your question'.
-3 - Only return the answer from step 2, do not show any code or additional information.
-4 - If the question is in a different language than English, translate the question to English before answering.
-5 - Answer the question in the {language} language.
+chat_system_template = """
+You are a friendly and talkative conversational agent, tasked with answering questions about Alkemio.
+Use the following step-by-step instructions to respond to user inputs:
+1 - If the question is in a different language than English, translate the question to English before answering.
+2 - The text provided in the context delimited by triple pluses is retrieved from the Alkemio website and is not part of the conversation with the user.
+3 - Provide an answer of 250 words or less that is professional, engaging, accurate and exhaustive, based on the context delimited by triple pluses. \
+If the answer cannot be found within the context, write 'Hmm, I am not sure'.
+4 - Only return the answer from step 3, do not show any code or additional information.
+5 - Answer the question in the {language} language.
 +++
-Context:
+context:
 {context}
 +++
-Question: {question}
 """

 condense_question_template = """
-Combine the chat history delimited by triple pluses and follow-up question into a single standalone question that does justice to the follow-up question. Only return the standalone question, do not return any other information.
+Create a single-sentence standalone query based on the human input, using the following step-by-step instructions:
+1. If the human input is expressing a sentiment, delete and ignore the chat history delimited by triple pluses. \
+Then, return the human input containing the sentiment as the standalone query. Do NOT in any way respond to the human input, \
+simply repeat it.
+2. Otherwise, combine the chat history delimited by triple pluses and human input into a single standalone query that does \
+justice to the human input.
+3. Only return the standalone query, do not return any other information. Never return the chat history delimited by triple pluses.
 +++
-Chat History: {chat_history}
+chat history:
+{chat_history}
 +++
-Follow-up question: {question}
+Human input: {question}
 ---
-Standalone question:
+Standalone query:
 """


 condense_question_prompt = PromptTemplate.from_template(condense_question_template)

-chat_prompt = ChatPromptTemplate.from_template(chat_template)
+chat_prompt = ChatPromptTemplate.from_messages(
+    [
+        ("system", chat_system_template),
+        MessagesPlaceholder(variable_name="chat_history"),
+        ("human", "{question}"),
+    ]
+)


 generic_llm = AzureOpenAI(azure_deployment=os.environ["LLM_DEPLOYMENT_NAME"],
-                            temperature=0, verbose=verbose_models)
+                          temperature=0, verbose=verbose_models)

 embeddings = AzureOpenAIEmbeddings(
     azure_deployment=config['embeddings_deployment_name'],
@@ -121,7 +137,7 @@ def load_vector_db():
     Purpose:
       Load the data into the vector database.
     Args:

     Returns:
       vectorstore: the vectorstore object
     """
@@ -130,25 +146,35 @@ def load_vector_db():
         logger.info(f"The file vector database is present")
     else:
         logger.info(f"The file vector database is not present, ingesting")
-        def_ingest.ingest(config['source_website'], config['website_repo'], website_generated_path, website_source_path, config['source_website2'], config['website_repo2'], website_generated_path2, website_source_path2)
+        def_ingest.ingest(
+            config['source_website'],
+            config['website_repo'],
+            website_generated_path,
+            website_source_path,
+            config['source_website2'],
+            config['website_repo2'],
+            website_generated_path2,
+            website_source_path2)

     return FAISS.load_local(vectordb_path, embeddings)


 vectorstore = load_vector_db()

 retriever = vectorstore.as_retriever(search_type="similarity_score_threshold", search_kwargs={"score_threshold": .5})

 chat_llm = AzureChatOpenAI(azure_deployment=os.environ["LLM_DEPLOYMENT_NAME"],
-                            temperature=os.environ["AI_MODEL_TEMPERATURE"],
-                            max_tokens=max_token_limit, verbose=verbose_models)
+                           temperature=os.environ["AI_MODEL_TEMPERATURE"],
+                           max_tokens=max_token_limit, verbose=verbose_models)

 condense_llm = AzureChatOpenAI(azure_deployment=os.environ["LLM_DEPLOYMENT_NAME"],
-                                temperature=0,
-                                verbose=verbose_models)
+                               temperature=0,
+                               verbose=verbose_models)

+
+def format_docs(docs):
+    return "\n\n".join(doc.page_content for doc in docs)
+

 DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")

 def _combine_documents(
@@ -157,11 +183,10 @@ def _combine_documents(
     doc_strings = [format_document(doc, document_prompt) for doc in docs]
     return document_separator.join(doc_strings)

-
 async def query_chain(question, language, chat_history):

     # check whether the chat history is empty
     if chat_history.buffer == []:
         first_call = True
     else:
         first_call = False
@@ -176,11 +201,12 @@ async def query_chain(question, language, chat_history):
     # This adds a "memory" key to the input object
     loaded_memory = RunnablePassthrough.assign(
         chat_history=RunnableLambda(chat_history.load_memory_variables) | itemgetter("history"),
-        )
+    )

     logger.debug(f"loaded memory {loaded_memory}\n")
     logger.debug(f"chat history {chat_history}\n")

+
     # Now we calculate the standalone question if the chat_history is not empty
     standalone_question = {
         "standalone_question": {
@@ -208,10 +234,10 @@ async def query_chain(question, language, chat_history):
         "question": lambda x: x["standalone_question"],
     }

-
     # Now we construct the inputs for the final prompt
     final_inputs = {
         "context": lambda x: _combine_documents(x["docs"]),
+        "chat_history" : lambda x: chat_history.buffer,
         "question": itemgetter("question"),
         "language": lambda x: language['language'],
     }
@@ -222,7 +248,6 @@ async def query_chain(question, language, chat_history):
         "docs": itemgetter("docs"),
     }

-
     # And now we put it all together in a 'RunnableBranch', so we only invoke the rephrasing part when the chat history is not empty
     final_chain = RunnableBranch(
         (lambda x: x["first_call"], loaded_memory | direct_question | retrieved_documents | answer),
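
The final_chain above only runs the question-condensing step when chat history exists. A minimal standalone sketch of that RunnableBranch pattern (the rephrase and answer lambdas are hypothetical stand-ins for the repository's retrieval and answer sub-chains, not code from this commit):

from langchain_core.runnables import RunnableBranch, RunnableLambda

# Stand-ins: rephrasing only matters once chat history exists.
rephrase = RunnableLambda(lambda x: {**x, "question": "standalone: " + x["question"]})
answer = RunnableLambda(lambda x: "answer to: " + x["question"])

chain = RunnableBranch(
    # First turn: no history, answer the question directly.
    (lambda x: x["first_call"], answer),
    # Later turns: condense history + follow-up into a standalone query first.
    rephrase | answer,
)

print(chain.invoke({"first_call": True, "question": "What is Alkemio?"}))
print(chain.invoke({"first_call": False, "question": "And how is it licensed?"}))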
31 changes: 17 additions & 14 deletions def_ingest.py
@@ -3,7 +3,7 @@
 import sys
 import io
 import re
-from langchain.embeddings import AzureOpenAIEmbeddings
+from langchain_openai import AzureOpenAIEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import FAISS
 from langchain.callbacks import get_openai_callback
@@ -26,7 +26,7 @@

 # Create handlers
 c_handler = logging.StreamHandler(io.TextIOWrapper(sys.stdout.buffer, line_buffering=True))
-f_handler = logging.FileHandler(os.path.join(os.path.expanduser(local_path),'app.log'))
+f_handler = logging.FileHandler(os.path.join(os.path.expanduser(local_path), 'app.log'))

 c_handler.setLevel(level=getattr(logging, LOG_LEVEL))
 f_handler.setLevel(logging.WARNING)
@@ -102,13 +102,13 @@ def read_and_parse_html(local_source_path, source_website_url, website_generated
     """
     # Transform
     bs_transformer = BeautifulSoupTransformer()

     # Get all links from the sitemaps
     logger.info(f"generating html: {local_source_path}, {source_website_url}")
     full_sitemap_list = extract_urls_from_sitemap(website_generated_path)

     exclusion_list = [os.sep + 'tag' + os.sep,
                       os.sep + 'category' + os.sep,
                       os.sep + 'help' + os.sep + 'index']

     data = []
@@ -126,7 +126,10 @@ def read_and_parse_html(local_source_path, source_website_url, website_generated
                 continue
             document = loader.load()
             # note h5 and h6 tags for our website contain a lot of irrelevant metadata
-            doc_transformed = bs_transformer.transform_documents(document, tags_to_extract=["p", "article", "title", "h1"], unwanted_tags=["h5", "h6"], remove_lines=True)
+            doc_transformed = bs_transformer.transform_documents(
+                document, tags_to_extract=[
+                    "p", "article", "title", "h1"], unwanted_tags=[
+                    "h5", "h6"], remove_lines=True)
             body_text = doc_transformed[0]

             # first remove duplicate spaces, then remove duplicate '\n\n', then remove duplicate '\n \n '
@@ -139,11 +142,11 @@ def read_and_parse_html(local_source_path, source_website_url, website_generated
                 data.append(body_text)
                 included_files.append(file_name)
             else:
-                #logger.info(f"document too small, not adding: {body_text.page_content}\n")
+                # logger.info(f"document too small, not adding: {body_text.page_content}\n")
                 excluded_files.append(file_name)
         except Exception as e:
-                # logger.error(f"...unable to process file: {str(e)}")
-                error_files.append(file_name)
+            # logger.error(f"...unable to process file: {str(e)}")
+            error_files.append(file_name)

     logger.info(f"==> Returning {len(included_files)} files; {len(error_files)} gave errors + {len(excluded_files)} files were skipped")
     text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_size/5)
@@ -169,7 +172,7 @@ def clone_and_generate(website_repo, destination_path, source_path):
         destination_path: path to directory containing generated html files
         source_path: path to directory containing the checked out github repo
     Returns:

     """

     logger.info(f"About to generate website: {website_repo}")
@@ -179,7 +182,7 @@ def clone_and_generate(website_repo, destination_path, source_path):
     os.chdir(source_path)
     git_switch_command = ['git', 'switch', branch]
     git_directory = os.path.join(source_path, '.git')
     # Check if the repository already exists in the source_path
     if os.path.exists(git_directory):
         # Repository exists, perform a git pull to update it
         logger.info(f"...git directory exists, pulling in {os.getcwd()}")
Expand All @@ -191,7 +194,7 @@ def clone_and_generate(website_repo, destination_path, source_path):
if (result_switch.returncode != 0):
logger.error(f"Unable to switch {website_repo} repository: {result_switch.stderr}")
else:
logger.info(f"...git directory does not exist, cloning in {os.getcwd()}")# Repository doesn't exist, perform a git clone
logger.info(f"...git directory does not exist, cloning in {os.getcwd()}") # Repository doesn't exist, perform a git clone
clone_command = ['git', 'clone', "https://" + github_user + ":" + github_pat + "@" + website_repo, source_path]
result_clone = subprocess.run(clone_command, capture_output=True, text=True)
if (result_clone.returncode != 0):
@@ -255,5 +258,5 @@ def create_vector_db(source_website_url, source_website_url2) -> None:

 # only execute if this is the main program run (so not imported)
 if __name__ == "__main__":
-    ingest(config['source_website'], config['website_repo'], website_generated_path, website_source_path,
-        config['source_website2'], config['website_repo2'], website_generated_path2, website_source_path2)
+    ingest(config['source_website'], config['website_repo'], website_generated_path, website_source_path,
+           config['source_website2'], config['website_repo2'], website_generated_path2, website_source_path2)
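
The chunk_overlap=chunk_size/5 argument above keeps a 20% overlap between neighbouring chunks, so sentences cut at a chunk boundary survive intact in at least one chunk. A minimal sketch of that splitting step (the chunk_size value here is hypothetical; def_ingest.py takes it from the app's config):

from langchain.text_splitter import RecursiveCharacterTextSplitter

chunk_size = 1000  # hypothetical; the real value comes from config
splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_size // 5,  # same 20% ratio as the diff above
)
chunks = splitter.split_text("Alkemio helps communities collaborate. " * 200)
print(f"{len(chunks)} chunks of up to {chunk_size} characters")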