Commit
Merge pull request #41 from alkem-io/develop
Release: Logging, Refactor
valentinyanakiev authored Jan 4, 2024
2 parents 1a18f49 + 1ff93f9 commit 71971a2
Showing 9 changed files with 290 additions and 198 deletions.
16 changes: 16 additions & 0 deletions .vscode/launch.json
@@ -0,0 +1,16 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python: Current File",
"type": "python",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": true
}
]
}
4 changes: 2 additions & 2 deletions Dockerfile
@@ -24,5 +24,5 @@ COPY . /app
# Use Poetry to install dependencies
RUN poetry config virtualenvs.create true && poetry install --no-interaction --no-ansi

# Run app.py when the container launches
CMD ["poetry", "run", "python", "app.py"]
# Run guidance_engine.py when the container launches
CMD ["poetry", "run", "python", "guidance_engine.py"]
4 changes: 3 additions & 1 deletion README.md
@@ -95,7 +95,9 @@ You can find sample values in `.azure-template.env`. Configure them and create `

### Python & Poetry
The project requires Python & Poetry to be installed. The minimum version requirements can be found in `pyproject.toml`.
After installing Python & Poetry, you simply need to run `poetry run python app.py`
After installing Python & Poetry:
* Install the dependencies: `poetry install`
* Run using `poetry run python guidance_engine.py`

### Linux
The project requires Python 3.11 as a minimum and needs Go and Hugo installed for creating a local version of the website. See the Go and Hugo documentation for installation instructions (only needed when running outside a container).
37 changes: 23 additions & 14 deletions ai_utils.py → ai_adapter.py
@@ -15,7 +15,7 @@
import sys
import io
import def_ingest
from config import config, website_source_path, website_generated_path, website_source_path2, website_generated_path2, vectordb_path, local_path, generate_website, LOG_LEVEL, max_token_limit
from config import config, website_source_path, website_generated_path, website_source_path2, website_generated_path2, vectordb_path, local_path, LOG_LEVEL, max_token_limit

import os

@@ -33,16 +33,16 @@
f_handler.setLevel(logging.WARNING)

# Create formatters and add them to handlers
c_format = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
f_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
c_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s', '%m-%d %H:%M:%S')
f_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s', '%m-%d %H:%M:%S')
c_handler.setFormatter(c_format)
f_handler.setFormatter(f_format)

# Add handlers to the logger
logger.addHandler(c_handler)
logger.addHandler(f_handler)

logger.info(f"log level ai_utils: {LOG_LEVEL}")
logger.info(f"log level {os.path.basename(__file__)}: {LOG_LEVEL}")

# verbose output for LLMs
if LOG_LEVEL=="DEBUG":
@@ -116,17 +116,26 @@ def get_language_by_code(language_code):
chunk_size=1
)

# Check if the vector database exists
if os.path.exists(vectordb_path+"/index.pkl"):
logger.info(f"The file vector database is present")
else:
# ingest data
if generate_website:
def_ingest.clone_and_generate(config['website_repo'], website_generated_path, website_source_path)
def_ingest.clone_and_generate(config['website_repo2'], website_generated_path2, website_source_path2)
def_ingest.mainapp(config['source_website'], config['source_website2'])
def load_vector_db():
"""
Purpose:
Load the vector database, ingesting the website content first if it is not yet present.
Args:
Returns:
vectorstore: the vectorstore object
"""
# Check if the vector database exists
if os.path.exists(vectordb_path + os.sep + "index.pkl"):
logger.info(f"The file vector database is present")
else:
logger.info(f"The file vector database is not present, ingesting")
def_ingest.ingest(config['source_website'], config['website_repo'], website_generated_path, website_source_path, config['source_website2'], config['website_repo2'], website_generated_path2, website_source_path2)

return FAISS.load_local(vectordb_path, embeddings)

vectorstore = load_vector_db()

vectorstore = FAISS.load_local(vectordb_path, embeddings)
retriever = vectorstore.as_retriever(search_type="similarity_score_threshold", search_kwargs={"score_threshold": .5})

chat_llm = AzureChatOpenAI(azure_deployment=os.environ["LLM_DEPLOYMENT_NAME"],
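The lazily loaded vector store above feeds a retriever with a 0.5 similarity score threshold. A minimal sketch of how that retriever might be queried, assuming the standard LangChain retriever API; the import, query text, and variable names are illustrative and not part of this commit:

# Illustrative sketch only, not part of the diff above; assumes the module-level
# `retriever` object defined in ai_adapter.py and the required Azure env vars.
from ai_adapter import retriever

docs = retriever.get_relevant_documents("How do I create a new space in Alkemio?")  # hypothetical question
for doc in docs:
    # each hit carries the rewritten source URL and the cleaned page text
    print(doc.metadata.get("source"), len(doc.page_content))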
12 changes: 6 additions & 6 deletions config.py
@@ -22,12 +22,12 @@
local_path = config['local_path']
github_user = config['github_user']
github_pat = config['github_pat']
website_source_path = local_path + '/website/source'
website_source_path2 = local_path + '/website2/source'
website_generated_path = local_path + '/website/generated'
website_generated_path2 = local_path + '/website2/generated'
vectordb_path = local_path + "/vectordb"
generate_website = True
website_source_path = local_path + os.sep + 'website' + os.sep + 'source'
website_source_path2 = local_path + os.sep + 'website2' + os.sep + 'source'
website_generated_path = local_path + os.sep + 'website' + os.sep + 'generated'
website_generated_path2 = local_path + os.sep + 'website2' + os.sep + 'generated'
vectordb_path = local_path + os.sep + 'vectordb'

chunk_size = 3000
# token limit for the completion of the chat model; this does not include the overall context length
max_token_limit = 2000
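The path constants above are now built with os.sep so they resolve correctly on both POSIX and Windows. For illustration, a minimal sketch of the equivalent construction using os.path.join; the local_path value below is hypothetical and not taken from the commit:

import os

local_path = "/tmp/guidance"  # hypothetical value; config.py reads local_path from its configuration
website_source_path = os.path.join(local_path, "website", "source")
vectordb_path = os.path.join(local_path, "vectordb")
# os.path.join inserts the platform separator, matching the os.sep concatenation used in config.py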
156 changes: 107 additions & 49 deletions def_ingest.py
@@ -32,17 +32,23 @@
f_handler.setLevel(logging.WARNING)

# Create formatters and add them to handlers
c_format = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
f_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
c_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s', '%m-%d %H:%M:%S')
f_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s', '%m-%d %H:%M:%S')
c_handler.setFormatter(c_format)
f_handler.setFormatter(f_format)

# Add handlers to the logger
logger.addHandler(c_handler)
logger.addHandler(f_handler)

logger.info(f"log level ingest: {LOG_LEVEL}")
logger.info(f"log level {os.path.basename(__file__)}: {LOG_LEVEL}")

def create_sitemap_filepath(website_generated_path):
return website_generated_path + os.sep + "sitemap.xml"

def sitemap_file_exists(website_generated_path):
sitemap_file = create_sitemap_filepath(website_generated_path)
return os.path.exists(sitemap_file)

def extract_urls_from_sitemap(base_directory):
"""
@@ -54,29 +60,32 @@ def extract_urls_from_sitemap(base_directory):
list of files to be retrieved
"""

sitemap_file = base_directory + os.sep + "sitemap.xml"
sitemap_file = create_sitemap_filepath(base_directory)
logger.info(f"Extracting urls using {sitemap_file}")

# Parse the XML directly from the file
tree = ET.parse(sitemap_file)
root = tree.getroot()

# List to store the complete URLs of the webpages to be retrieved
webpages_to_retrieve = []
# Extract the URLs from the sitemap
to_be_retieved = [
base_directory + elem.text + "index.html"
for elem in root.iter("{http://www.sitemaps.org/schemas/sitemap/0.9}loc")
]
for elem in root.iter("{http://www.sitemaps.org/schemas/sitemap/0.9}loc"):
# replace the / with the os separator
url_path = elem.text.replace("/", os.sep)
complete_url = base_directory + url_path + "index.html"
webpages_to_retrieve.append(complete_url)

logger.info(f"...sitemap as urls: {to_be_retieved[:5]}....")
return to_be_retieved
# logger.info(f"...sitemap as urls: {webpages_to_retrieve[:5]}....")
return webpages_to_retrieve


def embed_text(texts, save_loc):
embeddings = AzureOpenAIEmbeddings(
azure_deployment=config['embeddings_deployment_name'],
openai_api_version=config['openai_api_version'],
chunk_size=1
)
azure_deployment=config['embeddings_deployment_name'],
openai_api_version=config['openai_api_version'],
chunk_size=1
)
docsearch = FAISS.from_documents(texts, embeddings)

docsearch.save_local(save_loc)
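extract_urls_from_sitemap now maps every <loc> entry in sitemap.xml to a local file path by swapping URL separators for os.sep and appending index.html. A small sketch of that conversion with a hypothetical sitemap entry, for illustration only:

import os

base_directory = os.path.join("/tmp", "guidance", "website", "generated")  # hypothetical generated-site path
loc_text = "/help/getting-started/"                                        # hypothetical <loc> value from sitemap.xml
local_file = base_directory + loc_text.replace("/", os.sep) + "index.html"
# on POSIX this yields /tmp/guidance/website/generated/help/getting-started/index.html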
@@ -98,29 +107,45 @@ def read_and_parse_html(local_source_path, source_website_url, website_generated
logger.info(f"generating html: {local_source_path}, {source_website_url}")
full_sitemap_list = extract_urls_from_sitemap(website_generated_path)

exclusion_list = [os.sep + 'tag' + os.sep,
os.sep + 'category' + os.sep,
os.sep + 'help' + os.sep + 'index']

data = []
included_files = []
error_files = []
excluded_files = []
for file_name in full_sitemap_list:
loader = TextLoader(file_name)
# ignore url's with /tag/ or /category/ as they do not contain relevant info.
if '/tag/' in file_name or '/category/' in file_name or '/help/index' in file_name:
logger.warning(f"exclusion found, not ingesting {file_name}\n")
continue
document = loader.load()
# note h5 and h6 tags for our website contain a lot of irrelevant metadata
doc_transformed = bs_transformer.transform_documents(document, tags_to_extract=["p", "article", "title", "h1"], unwanted_tags=["h5", "h6"], remove_lines=True)
body_text = doc_transformed[0]

# first remove duplicate spaces, then remove duplicate '\n\n', then remove duplicate '\n \n '
body_text.page_content = re.sub(r'(\n ){2,}', '\n', re.sub(r'\n+', '\n', re.sub(r' +', ' ', body_text.page_content)))

# remove the local directory from the source object
body_text.metadata['source'] = body_text.metadata['source'].replace(website_generated_path, source_website_url)

if len(body_text.page_content) > 100:
data.append(body_text)
else:
logger.warning(f"document too small, not adding: {body_text.page_content}\n")

# logger.info(f"Processing file {file_name}")
try:
loader = TextLoader(file_name)
# ignore URLs with /tag/ or /category/ as they do not contain relevant info.
if any(exclusion in file_name for exclusion in exclusion_list):
# logger.info(f"...exclusion found, not ingesting {file_name}")
excluded_files.append(file_name)
continue
document = loader.load()
# note h5 and h6 tags for our website contain a lot of irrelevant metadata
doc_transformed = bs_transformer.transform_documents(document, tags_to_extract=["p", "article", "title", "h1"], unwanted_tags=["h5", "h6"], remove_lines=True)
body_text = doc_transformed[0]

# first remove duplicate spaces, then remove duplicate '\n\n', then remove duplicate '\n \n '
body_text.page_content = re.sub(r'(\n ){2,}', '\n', re.sub(r'\n+', '\n', re.sub(r' +', ' ', body_text.page_content)))

# remove the local directory from the source object
body_text.metadata['source'] = body_text.metadata['source'].replace(website_generated_path, source_website_url)

if len(body_text.page_content) > 100:
data.append(body_text)
included_files.append(file_name)
else:
#logger.info(f"document too small, not adding: {body_text.page_content}\n")
excluded_files.append(file_name)
except Exception as e:
# logger.error(f"...unable to process file: {str(e)}")
error_files.append(file_name)

logger.info(f"==> Returning {len(included_files)} files; {len(error_files)} gave errors + {len(excluded_files)} files were skipped")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_size/5)
texts = text_splitter.split_documents(data)
return texts
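The chained re.sub calls in read_and_parse_html first collapse runs of spaces, then runs of newlines, then the remaining '\n ' pairs. A minimal sketch of that cleanup on a hypothetical snippet, shown here only to make the order of operations concrete:

import re

raw = "Alkemio  helps   communities.\n\n\n\n  \n  More text."  # hypothetical scraped text
clean = re.sub(r'(\n ){2,}', '\n', re.sub(r'\n+', '\n', re.sub(r' +', ' ', raw)))
print(clean)  # -> "Alkemio helps communities.\nMore text."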
@@ -129,34 +154,52 @@ def remove_and_recreate(dir_path):
try:
if os.path.exists(dir_path):
shutil.rmtree(dir_path)
logger.info(f"Directory {dir_path} and its contents removed successfully.")
logger.info(f"...removed directory {dir_path} and its contents.")
os.makedirs(dir_path)
logger.info(f"...directory {dir_path} (re)created.")
except OSError as e:
logger.error(f"Error: {e.strerror}")

def clone_and_generate(website_repo, destination_path, source_path):
logger.info(f"About to generate website")
"""
Purpose:
Retrieve the Hugo based website and generate the static html files.
Args:
website_repo: github repo containing the Hugo based website
destination_path: path to directory containing generated html files
source_path: path to directory containing the checked out github repo
Returns:
"""

logger.info(f"About to generate website: {website_repo}")
remove_and_recreate(source_path)
remove_and_recreate(destination_path)
logger.info(f"...cloning or updating repo")
branch = "main"
os.chdir(source_path)
git_switch_command = ['git', 'switch', branch]
# Check if the repository already exists in the source_path
if os.path.exists(os.path.join(source_path, '.git')):
git_directory = os.path.join(source_path, '.git')
# Check if the repository already exists in the source_path
if os.path.exists(git_directory):
# Repository exists, perform a git pull to update it
logger.info(f"...git directory exists, pulling in {os.getcwd()}")
git_pull_command = ['git', 'pull', 'origin', branch] # Modify branch name as needed
result_pull = subprocess.run(git_pull_command, cwd=source_path, capture_output=True, text=True)
logger.info(f"git pull result: {result_pull.stdout}")
result_pull = subprocess.run(git_pull_command, capture_output=True, text=True)
if (result_pull.returncode != 0):
logger.error(f"Unable to pull {website_repo} repository: {result_pull.stderr}")
result_switch = subprocess.run(git_switch_command, cwd=source_path, capture_output=True, text=True)
logger.info(f"git switch result: {result_switch.stdout}")
if (result_switch.returncode != 0):
logger.error(f"Unable to switch {website_repo} repository: {result_switch.stderr}")
else:
# Repository doesn't exist, perform a git clone
logger.info(f"...git directory does not exist, cloning in {os.getcwd()}")
clone_command = ['git', 'clone', "https://" + github_user + ":" + github_pat + "@" + website_repo, source_path]
result_clone = subprocess.run(clone_command, capture_output=True, text=True)
logger.info(f"git clone result: {result_clone.stdout}")
if (result_clone.returncode != 0):
raise Exception(f"Unable to clone {website_repo} repository: {result_clone.stderr}")
result_switch = subprocess.run(git_switch_command, cwd=source_path, capture_output=True, text=True)
logger.info(f"git switch result: {result_switch.stdout}")
if (result_switch.returncode != 0):
raise Exception(f"Unable to switch {website_repo} repository: {result_switch.stderr}")
logger.info(f"git cloned + switch completed")

os.chdir(source_path)
logger.info(f"...cloned/updated, moved to directory: {os.getcwd()}")
@@ -166,11 +209,24 @@ def clone_and_generate(website_repo, destination_path, source_path):
additional_path_usr = '/usr/local'
env["PATH"] = additional_path_go + os.pathsep + additional_path_usr + os.pathsep + env["PATH"]
hugo_command = ['hugo', '--gc', '-b', '/', '-d', destination_path]
logger.info(f"hugo command: {hugo_command}")
result_hugo = subprocess.run(hugo_command, env=env, capture_output=True, text=True)
if (result_hugo.returncode != 0):
raise Exception(f"Unable to generate website using hugo command: '{hugo_command}': {result_hugo.stderr}")
logger.info(f"hugo result: {result_hugo.stdout}")

sitemap_file = create_sitemap_filepath(destination_path)
if not os.path.exists(sitemap_file):
raise Exception(f"Unable to generate website in {destination_path}: sitemap.xml not found: {sitemap_file}")


def ingest(source_url, website_repo, destination_path, source_path, source_url2, website_repo2, destination_path2, source_path2):
clone_and_generate(website_repo, destination_path, source_path)
clone_and_generate(website_repo2, destination_path2, source_path2)
create_vector_db(source_url, source_url2)
logger.info(f"Ingest successful")

def mainapp(source_website_url, source_website_url2) -> None:
def create_vector_db(source_website_url, source_website_url2) -> None:
"""
Purpose:
ingest the transformed website contents into a vector database in presized chunks.
@@ -192,10 +248,12 @@ def mainapp(source_website_url, source_website_url2) -> None:
with get_openai_callback() as cb:
embed_text(texts, vectordb_path)
logger.info(f"\nEmbedding costs: {cb.total_cost}")
f.write(str(texts))
stringified_texts = str(texts)
f.write(stringified_texts)
f.close()


# only execute if this is the main program run (so not imported)
if __name__ == "__main__":
mainapp(os.getenv('AI_SOURCE_WEBSITE'),os.getenv('AI_SOURCE_WEBSITE2'))
ingest(config['source_website'], config['website_repo'], website_generated_path, website_source_path,
config['source_website2'], config['website_repo2'], website_generated_path2, website_source_path2)
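Every git and hugo invocation in clone_and_generate follows the same pattern: run the command via subprocess.run, capture its output, and log or raise when the return code is non-zero. A compact sketch of that pattern in isolation, with a hypothetical command and path, not part of the commit:

import subprocess

def run_checked(command, cwd=None, env=None):
    # run an external command and raise with stderr attached when it fails
    result = subprocess.run(command, cwd=cwd, env=env, capture_output=True, text=True)
    if result.returncode != 0:
        raise Exception(f"Command {command} failed: {result.stderr}")
    return result.stdout

# hypothetical usage mirroring the git switch call above
# run_checked(['git', 'switch', 'main'], cwd='/tmp/guidance/website/source')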