Commit
Merge pull request #41 from alkem-io/develop
Release: Logging, Refactor
valentinyanakiev authored Jan 4, 2024
2 parents 1a18f49 + 1ff93f9 commit 71971a2
Showing 9 changed files with 290 additions and 198 deletions.
16 changes: 16 additions & 0 deletions .vscode/launch.json
@@ -0,0 +1,16 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python: Current File",
"type": "python",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"justMyCode": true
}
]
}
4 changes: 2 additions & 2 deletions Dockerfile
@@ -24,5 +24,5 @@ COPY . /app
# Use Poetry to install dependencies
RUN poetry config virtualenvs.create true && poetry install --no-interaction --no-ansi

# Run app.py when the container launches
CMD ["poetry", "run", "python", "app.py"]
# Run guidance_engine.py when the container launches
CMD ["poetry", "run", "python", "guidance_engine.py"]
4 changes: 3 additions & 1 deletion README.md
@@ -95,7 +95,9 @@ You can find sample values in `.azure-template.env`. Configure them and create `

### Python & Poetry
The project requires Python & Poetry to be installed. The minimum version requirements can be found in `pyproject.toml`.
After installing Python & Poetry, you simply need to run `poetry run python app.py`
After installing Python & Poetry:
* Install the dependencies: `poetry install`
* Run using `poetry run python guidance_engine.py`

### Linux
The project requires Python 3.11 as a minimum and needs Go and Hugo installed for creating a local version of the website. See the Go and Hugo documentation for installation instructions (only needed when running outside a container).
37 changes: 23 additions & 14 deletions ai_utils.py → ai_adapter.py
@@ -15,7 +15,7 @@
import sys
import io
import def_ingest
from config import config, website_source_path, website_generated_path, website_source_path2, website_generated_path2, vectordb_path, local_path, generate_website, LOG_LEVEL, max_token_limit
from config import config, website_source_path, website_generated_path, website_source_path2, website_generated_path2, vectordb_path, local_path, LOG_LEVEL, max_token_limit

import os

@@ -33,16 +33,16 @@
f_handler.setLevel(logging.WARNING)

# Create formatters and add them to handlers
c_format = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
f_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
c_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s', '%m-%d %H:%M:%S')
f_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s', '%m-%d %H:%M:%S')
c_handler.setFormatter(c_format)
f_handler.setFormatter(f_format)

# Add handlers to the logger
logger.addHandler(c_handler)
logger.addHandler(f_handler)

logger.info(f"log level ai_utils: {LOG_LEVEL}")
logger.info(f"log level {os.path.basename(__file__)}: {LOG_LEVEL}")

# verbose output for LLMs
if LOG_LEVEL=="DEBUG":
@@ -116,17 +116,26 @@ def get_language_by_code(language_code):
chunk_size=1
)

# Check if the vector database exists
if os.path.exists(vectordb_path+"/index.pkl"):
logger.info(f"The file vector database is present")
else:
# ingest data
if generate_website:
def_ingest.clone_and_generate(config['website_repo'], website_generated_path, website_source_path)
def_ingest.clone_and_generate(config['website_repo2'], website_generated_path2, website_source_path2)
def_ingest.mainapp(config['source_website'], config['source_website2'])
def load_vector_db():
"""
Purpose:
Load the vector database, ingesting the website content first if it is not yet present.
Args:
Returns:
vectorstore: the vectorstore object
"""
# Check if the vector database exists
if os.path.exists(vectordb_path + os.sep + "index.pkl"):
logger.info(f"The file vector database is present")
else:
logger.info(f"The file vector database is not present, ingesting")
def_ingest.ingest(config['source_website'], config['website_repo'], website_generated_path, website_source_path, config['source_website2'], config['website_repo2'], website_generated_path2, website_source_path2)

return FAISS.load_local(vectordb_path, embeddings)

vectorstore = load_vector_db()

vectorstore = FAISS.load_local(vectordb_path, embeddings)
retriever = vectorstore.as_retriever(search_type="similarity_score_threshold", search_kwargs={"score_threshold": .5})

chat_llm = AzureChatOpenAI(azure_deployment=os.environ["LLM_DEPLOYMENT_NAME"],
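The lazily loaded vector store above feeds a retriever with a 0.5 similarity score threshold. A minimal sketch of how that retriever might be queried, assuming the standard LangChain retriever API; the import, query text, and variable names are illustrative and not part of this commit:

# Illustrative sketch only, not part of the diff above; assumes the module-level
# `retriever` object defined in ai_adapter.py and the required Azure env vars.
from ai_adapter import retriever

docs = retriever.get_relevant_documents("How do I create a new space in Alkemio?")  # hypothetical question
for doc in docs:
    # each hit carries the rewritten source URL and the cleaned page text
    print(doc.metadata.get("source"), len(doc.page_content))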
12 changes: 6 additions & 6 deletions config.py
@@ -22,12 +22,12 @@
local_path = config['local_path']
github_user = config['github_user']
github_pat = config['github_pat']
website_source_path = local_path + '/website/source'
website_source_path2 = local_path + '/website2/source'
website_generated_path = local_path + '/website/generated'
website_generated_path2 = local_path + '/website2/generated'
vectordb_path = local_path + "/vectordb"
generate_website = True
website_source_path = local_path + os.sep + 'website' + os.sep + 'source'
website_source_path2 = local_path + os.sep + 'website2' + os.sep + 'source'
website_generated_path = local_path + os.sep + 'website' + os.sep + 'generated'
website_generated_path2 = local_path + os.sep + 'website2' + os.sep + 'generated'
vectordb_path = local_path + os.sep + 'vectordb'

chunk_size = 3000
# token limit for the completion of the chat model; this does not include the overall context length
max_token_limit = 2000
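The path constants above are now built with os.sep so they resolve correctly on both POSIX and Windows. For illustration, a minimal sketch of the equivalent construction using os.path.join; the local_path value below is hypothetical and not taken from the commit:

import os

local_path = "/tmp/guidance"  # hypothetical value; config.py reads local_path from its configuration
website_source_path = os.path.join(local_path, "website", "source")
vectordb_path = os.path.join(local_path, "vectordb")
# os.path.join inserts the platform separator, matching the os.sep concatenation used in config.py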
156 changes: 107 additions & 49 deletions def_ingest.py
@@ -32,17 +32,23 @@
f_handler.setLevel(logging.WARNING)

# Create formatters and add them to handlers
c_format = logging.Formatter('%(name)s - %(levelname)s - %(message)s')
f_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
c_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s', '%m-%d %H:%M:%S')
f_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s', '%m-%d %H:%M:%S')
c_handler.setFormatter(c_format)
f_handler.setFormatter(f_format)

# Add handlers to the logger
logger.addHandler(c_handler)
logger.addHandler(f_handler)

logger.info(f"log level ingest: {LOG_LEVEL}")
logger.info(f"log level {os.path.basename(__file__)}: {LOG_LEVEL}")

def create_sitemap_filepath(website_generated_path):
return website_generated_path + os.sep + "sitemap.xml"

def sitemap_file_exists(website_generated_path):
sitemap_file = create_sitemap_filepath(website_generated_path)
return os.path.exists(sitemap_file)

def extract_urls_from_sitemap(base_directory):
"""
@@ -54,29 +60,32 @@ def extract_urls_from_sitemap(base_directory):
list of files to be retrieved
"""

sitemap_file = base_directory + os.sep + "sitemap.xml"
sitemap_file = create_sitemap_filepath(base_directory)
logger.info(f"Extracting urls using {sitemap_file}")

# Parse the XML directly from the file
tree = ET.parse(sitemap_file)
root = tree.getroot()

# List to store the complete URLs of the webpages to be retrieved
webpages_to_retrieve = []
# Extract the URLs from the sitemap
to_be_retieved = [
base_directory + elem.text + "index.html"
for elem in root.iter("{http://www.sitemaps.org/schemas/sitemap/0.9}loc")
]
for elem in root.iter("{http://www.sitemaps.org/schemas/sitemap/0.9}loc"):
# replace the / with the os separator
url_path = elem.text.replace("/", os.sep)
complete_url = base_directory + url_path + "index.html"
webpages_to_retrieve.append(complete_url)

logger.info(f"...sitemap as urls: {to_be_retieved[:5]}....")
return to_be_retieved
# logger.info(f"...sitemap as urls: {webpages_to_retrieve[:5]}....")
return webpages_to_retrieve


def embed_text(texts, save_loc):
embeddings = AzureOpenAIEmbeddings(
azure_deployment=config['embeddings_deployment_name'],
openai_api_version=config['openai_api_version'],
chunk_size=1
)
azure_deployment=config['embeddings_deployment_name'],
openai_api_version=config['openai_api_version'],
chunk_size=1
)
docsearch = FAISS.from_documents(texts, embeddings)

docsearch.save_local(save_loc)
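extract_urls_from_sitemap now maps every <loc> entry in sitemap.xml to a local file path by swapping URL separators for os.sep and appending index.html. A small sketch of that conversion with a hypothetical sitemap entry, for illustration only:

import os

base_directory = os.path.join("/tmp", "guidance", "website", "generated")  # hypothetical generated-site path
loc_text = "/help/getting-started/"                                        # hypothetical <loc> value from sitemap.xml
local_file = base_directory + loc_text.replace("/", os.sep) + "index.html"
# on POSIX this yields /tmp/guidance/website/generated/help/getting-started/index.html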
@@ -98,29 +107,45 @@ def read_and_parse_html(local_source_path, source_website_url, website_generated
logger.info(f"generating html: {local_source_path}, {source_website_url}")
full_sitemap_list = extract_urls_from_sitemap(website_generated_path)

exclusion_list = [os.sep + 'tag' + os.sep,
os.sep + 'category' + os.sep,
os.sep + 'help' + os.sep + 'index']

data = []
included_files = []
error_files = []
excluded_files = []
for file_name in full_sitemap_list:
loader = TextLoader(file_name)
# ignore url's with /tag/ or /category/ as they do not contain relevant info.
if '/tag/' in file_name or '/category/' in file_name or '/help/index' in file_name:
logger.warning(f"exclusion found, not ingesting {file_name}\n")
continue
document = loader.load()
# note h5 and h6 tags for our website contain a lot of irrelevant metadata
doc_transformed = bs_transformer.transform_documents(document, tags_to_extract=["p", "article", "title", "h1"], unwanted_tags=["h5", "h6"], remove_lines=True)
body_text = doc_transformed[0]

# first remove duplicate spaces, then remove duplicate '\n\n', then remove duplicate '\n \n '
body_text.page_content = re.sub(r'(\n ){2,}', '\n', re.sub(r'\n+', '\n', re.sub(r' +', ' ', body_text.page_content)))

# remove the local directory from the source object
body_text.metadata['source'] = body_text.metadata['source'].replace(website_generated_path, source_website_url)

if len(body_text.page_content) > 100:
data.append(body_text)
else:
logger.warning(f"document too small, not adding: {body_text.page_content}\n")

# logger.info(f"Processing file {file_name}")
try:
loader = TextLoader(file_name)
# ignore URLs with /tag/ or /category/ as they do not contain relevant info.
if any(exclusion in file_name for exclusion in exclusion_list):
# logger.info(f"...exclusion found, not ingesting {file_name}")
excluded_files.append(file_name)
continue
document = loader.load()
# note h5 and h6 tags for our website contain a lot of irrelevant metadata
doc_transformed = bs_transformer.transform_documents(document, tags_to_extract=["p", "article", "title", "h1"], unwanted_tags=["h5", "h6"], remove_lines=True)
body_text = doc_transformed[0]

# first remove duplicate spaces, then remove duplicate '\n\n', then remove duplicate '\n \n '
body_text.page_content = re.sub(r'(\n ){2,}', '\n', re.sub(r'\n+', '\n', re.sub(r' +', ' ', body_text.page_content)))

# remove the local directory from the source object
body_text.metadata['source'] = body_text.metadata['source'].replace(website_generated_path, source_website_url)

if len(body_text.page_content) > 100:
data.append(body_text)
included_files.append(file_name)
else:
#logger.info(f"document too small, not adding: {body_text.page_content}\n")
excluded_files.append(file_name)
except Exception as e:
# logger.error(f"...unable to process file: {str(e)}")
error_files.append(file_name)

logger.info(f"==> Returning {len(included_files)} files; {len(error_files)} gave errors + {len(excluded_files)} files were skipped")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_size/5)
texts = text_splitter.split_documents(data)
return texts
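The chained re.sub calls in read_and_parse_html first collapse runs of spaces, then runs of newlines, then the remaining '\n ' pairs. A minimal sketch of that cleanup on a hypothetical snippet, shown here only to make the order of operations concrete:

import re

raw = "Alkemio  helps   communities.\n\n\n\n  \n  More text."  # hypothetical scraped text
clean = re.sub(r'(\n ){2,}', '\n', re.sub(r'\n+', '\n', re.sub(r' +', ' ', raw)))
print(clean)  # -> "Alkemio helps communities.\nMore text."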
@@ -129,34 +154,52 @@ def remove_and_recreate(dir_path):
try:
if os.path.exists(dir_path):
shutil.rmtree(dir_path)
logger.info(f"Directory {dir_path} and its contents removed successfully.")
logger.info(f"...removed directory {dir_path} and its contents.")
os.makedirs(dir_path)
logger.info(f"...directory {dir_path} (re)created.")
except OSError as e:
logger.error(f"Error: {e.strerror}")

def clone_and_generate(website_repo, destination_path, source_path):
logger.info(f"About to generate website")
"""
Purpose:
Retrieve the Hugo based website and generate the static html files.
Args:
website_repo: github repo containing the Hugo based website
destination_path: path to directory containing generated html files
source_path: path to directory containing the checked out github repo
Returns:
"""

logger.info(f"About to generate website: {website_repo}")
remove_and_recreate(source_path)
remove_and_recreate(destination_path)
logger.info(f"...cloning or updating repo")
branch = "main"
os.chdir(source_path)
git_switch_command = ['git', 'switch', branch]
# Check if the repository already exists in the source_path
if os.path.exists(os.path.join(source_path, '.git')):
git_directory = os.path.join(source_path, '.git')
# Check if the repository already exists in the source_path
if os.path.exists(git_directory):
# Repository exists, perform a git pull to update it
logger.info(f"...git directory exists, pulling in {os.getcwd()}")
git_pull_command = ['git', 'pull', 'origin', branch] # Modify branch name as needed
result_pull = subprocess.run(git_pull_command, cwd=source_path, capture_output=True, text=True)
logger.info(f"git pull result: {result_pull.stdout}")
result_pull = subprocess.run(git_pull_command, capture_output=True, text=True)
if (result_pull.returncode != 0):
logger.error(f"Unable to pull {website_repo} repository: {result_pull.stderr}")
result_switch = subprocess.run(git_switch_command, cwd=source_path, capture_output=True, text=True)
logger.info(f"git switch result: {result_switch.stdout}")
if (result_switch.returncode != 0):
logger.error(f"Unable to switch {website_repo} repository: {result_switch.stderr}")
else:
# Repository doesn't exist, perform a git clone
logger.info(f"...git directory does not exist, cloning in {os.getcwd()}")
clone_command = ['git', 'clone', "https://" + github_user + ":" + github_pat + "@" + website_repo, source_path]
result_clone = subprocess.run(clone_command, capture_output=True, text=True)
logger.info(f"git clone result: {result_clone.stdout}")
if (result_clone.returncode != 0):
raise Exception(f"Unable to clone {website_repo} repository: {result_clone.stderr}")
result_switch = subprocess.run(git_switch_command, cwd=source_path, capture_output=True, text=True)
logger.info(f"git switch result: {result_switch.stdout}")
if (result_switch.returncode != 0):
raise Exception(f"Unable to switch {website_repo} repository: {result_switch.stderr}")
logger.info(f"git cloned + switch completed")

os.chdir(source_path)
logger.info(f"...cloned/updated, moved to directory: {os.getcwd()}")
@@ -166,11 +209,24 @@ def clone_and_generate(website_repo, destination_path, source_path):
additional_path_usr = '/usr/local'
env["PATH"] = additional_path_go + os.pathsep + additional_path_usr + os.pathsep + env["PATH"]
hugo_command = ['hugo', '--gc', '-b', '/', '-d', destination_path]
logger.info(f"hugo command: {hugo_command}")
result_hugo = subprocess.run(hugo_command, env=env, capture_output=True, text=True)
if (result_hugo.returncode != 0):
raise Exception(f"Unable to generate website using hugo command: '{hugo_command}': {result_hugo.stderr}")
logger.info(f"hugo result: {result_hugo.stdout}")

sitemap_file = create_sitemap_filepath(destination_path)
if not os.path.exists(sitemap_file):
raise Exception(f"Unable to generate website in {destination_path}: sitemap.xml not found: {sitemap_file}")


def ingest(source_url, website_repo, destination_path, source_path, source_url2, website_repo2, destination_path2, source_path2):
clone_and_generate(website_repo, destination_path, source_path)
clone_and_generate(website_repo2, destination_path2, source_path2)
create_vector_db(source_url, source_url2)
logger.info(f"Ingest successful")

def mainapp(source_website_url, source_website_url2) -> None:
def create_vector_db(source_website_url, source_website_url2) -> None:
"""
Purpose:
ingest the transformed website contents into a vector database in presized chunks.
@@ -192,10 +248,12 @@ def mainapp(source_website_url, source_website_url2) -> None:
with get_openai_callback() as cb:
embed_text(texts, vectordb_path)
logger.info(f"\nEmbedding costs: {cb.total_cost}")
f.write(str(texts))
stringified_texts = str(texts)
f.write(stringified_texts)
f.close()


# only execute if this is the main program run (so not imported)
if __name__ == "__main__":
mainapp(os.getenv('AI_SOURCE_WEBSITE'),os.getenv('AI_SOURCE_WEBSITE2'))
ingest(config['source_website'], config['website_repo'], website_generated_path, website_source_path,
config['source_website2'], config['website_repo2'], website_generated_path2, website_source_path2)
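Every git and hugo invocation in clone_and_generate follows the same pattern: run the command via subprocess.run, capture its output, and log or raise when the return code is non-zero. A compact sketch of that pattern in isolation, with a hypothetical command and path, not part of the commit:

import subprocess

def run_checked(command, cwd=None, env=None):
    # run an external command and raise with stderr attached when it fails
    result = subprocess.run(command, cwd=cwd, env=env, capture_output=True, text=True)
    if result.returncode != 0:
        raise Exception(f"Command {command} failed: {result.stderr}")
    return result.stdout

# hypothetical usage mirroring the git switch call above
# run_checked(['git', 'switch', 'main'], cwd='/tmp/guidance/website/source')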