Store docs locally #119

Merged · 2 commits · Sep 17, 2024
README.md: 3 additions & 0 deletions
@@ -178,6 +178,9 @@ vectara:
# flag: should vectara-ingest reindex if document already exists (optional)
reindex: false

  # flag: store a local copy of all crawled data that is indexed (optional)
store_docs: false

# timeout: sets the URL crawling timeout in seconds (optional)
timeout: 90
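
For reference, here is a minimal sketch of how such a flag is picked up on the indexer side; the cfg.vectara.get(...) lookup mirrors the core/indexer.py change below, while the inline config literal is purely illustrative.

from omegaconf import OmegaConf

# Illustrative config fragment mirroring the README example above.
cfg = OmegaConf.create("""
vectara:
  reindex: false
  store_docs: true
  timeout: 90
""")

# Same lookup pattern used in core/indexer.py: default to False when the key is absent.
store_docs = cfg.vectara.get("store_docs", False)
print(store_docs)  # True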

core/indexer.py: 25 additions & 3 deletions
@@ -4,17 +4,22 @@
from typing import Tuple, Dict, Any, List, Optional
import uuid
import pandas as pd
import shutil

import time
from slugify import slugify
import unicodedata

from omegaconf import OmegaConf
from nbconvert import HTMLExporter # type: ignore
import nbformat
import markdown

from core.utils import html_to_text, detect_language, get_file_size_in_MB, create_session_with_retries, TableSummarizer, mask_pii, safe_remove_file, detect_file_type
from core.utils import (
html_to_text, detect_language, get_file_size_in_MB, create_session_with_retries,
TableSummarizer, mask_pii, safe_remove_file, detect_file_type,
url_to_filename
)
from core.extract import get_article_content

from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
@@ -85,6 +90,7 @@ def __init__(self, cfg: OmegaConf, endpoint: str,
self.api_key = api_key
self.reindex = cfg.vectara.get("reindex", False)
self.verbose = cfg.vectara.get("verbose", False)
self.store_docs = cfg.vectara.get("store_docs", False)
self.remove_code = cfg.vectara.get("remove_code", True)
self.remove_boilerplate = cfg.vectara.get("remove_boilerplate", False)
self.post_load_timeout = cfg.vectara.get("post_load_timeout", 5)
@@ -123,6 +129,17 @@ def setup(self, use_playwright: bool = True) -> None:
self.browser = self.p.firefox.launch(headless=True)
self.browser_use_count = 0
self.tmp_file = 'tmp_' + str(uuid.uuid4())
if self.store_docs:
self.store_docs_folder = '/home/vectara/env/indexed_docs_' + str(uuid.uuid4())
if os.path.exists(self.store_docs_folder):
shutil.rmtree(self.store_docs_folder)
os.makedirs(self.store_docs_folder)

def store_file(self, filename: str, orig_filename: str) -> None:
    # When store_docs is enabled, keep a copy of the indexed file under its original (URL-derived) name.
    if self.store_docs:
        dest_path = f"{self.store_docs_folder}/{orig_filename}"
        shutil.copyfile(filename, dest_path)
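
In effect, setup() recreates a fresh per-run folder and store_file() copies each successfully indexed file into it under a stable name. A standalone sketch of the same behaviour, assuming a /tmp path instead of the /home/vectara/env location used above:

import os
import shutil
import uuid

# Per-run folder, recreated from scratch (mirrors the setup() change above).
store_docs_folder = '/tmp/indexed_docs_' + str(uuid.uuid4())
if os.path.exists(store_docs_folder):
    shutil.rmtree(store_docs_folder)
os.makedirs(store_docs_folder)

def store_file(filename: str, orig_filename: str) -> None:
    # Copy a just-indexed temp file into the folder under its URL-derived name.
    shutil.copyfile(filename, os.path.join(store_docs_folder, orig_filename))

# Hypothetical call site, analogous to _index_file() below after a successful upload:
# store_file("tmp_3f2a1c.html", url_to_filename("https://example.com/docs/intro.html"))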


def url_triggers_download(self, url: str) -> bool:
download_triggered = False
@@ -316,7 +333,7 @@ def _index_file(self, filename: str, uri: str, metadata: Dict[str, Any]) -> bool
"""
Index a file on local file system by uploading it to the Vectara corpus.
Args:
filename (str): Name of the PDF file to create.
filename (str): Name of the file to create.
uri (str): URI for where the document originated. In some cases the local file name is not the same, and we want to include this in the index.
metadata (dict): Metadata for the document.
Returns:
@@ -351,6 +368,7 @@ def get_files(filename: str, metadata: dict):
)
if response.status_code == 200:
self.logger.info(f"REST upload for {uri} successful (reindex)")
self.store_file(filename, url_to_filename(uri))
return True
else:
self.logger.info(f"REST upload for {uri} ({filename}) (reindex) failed with code = {response.status_code}, text = {response.text}")
@@ -361,6 +379,7 @@
return False

self.logger.info(f"REST upload for {uri} succeesful")
self.store_file(filename, url_to_filename(uri))
return True

def _index_document(self, document: Dict[str, Any]) -> bool:
@@ -412,6 +431,9 @@ def _index_document(self, document: Dict[str, Any]) -> bool:
self.logger.info(f"Document {document['documentId']} already exists, skipping")
return False
if "status" in result and result["status"] and "OK" in result["status"]["code"]:
if self.store_docs:
with open(f"{self.store_docs_folder}/{document['documentId']}.json", "w") as f:
json.dump(document, f)
return True

self.logger.info(f"Indexing document {document['documentId']} failed, response = {result}")
core/utils.py: 48 additions & 7 deletions
@@ -4,7 +4,9 @@
from pathlib import Path

from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
from urllib.parse import urljoin, urlparse
from slugify import slugify

import re
from typing import List, Set
@@ -29,7 +31,6 @@
except ImportError:
logging.info("Presidio is not installed. if PII detection and masking is requested - it will not work.")


img_extensions = [".gif", ".jpeg", ".jpg", ".mp3", ".mp4", ".png", ".svg", ".bmp", ".eps", ".ico"]
doc_extensions = [".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx", ".pdf", ".ps"]
archive_extensions = [".zip", ".gz", ".tar", ".bz2", ".7z", ".rar"]
@@ -44,18 +45,58 @@ def setup_logging():
handler.setFormatter(formatter)
root.addHandler(handler)

def url_to_filename(url: str) -> str:
    """Derive a local filename from a URL: slugify the last path segment and keep its extension."""
    parsed_url = urlparse(url)
    path_parts = parsed_url.path.split('/')
    last_part = path_parts[-1]
    name, ext = os.path.splitext(last_part)
    slugified_name = slugify(name)
    return f"{slugified_name}{ext}"



import magic

def detect_file_type(file_path):
"""
Detect the type of a file using the `magic` library.
PDF files are detected as 'application/pdf' and HTML files as 'text/html'.
Detect the type of a file using the `magic` library and further analysis.

Returns:
str: The detected MIME type, e.g., 'text/html', 'application/xml', etc.
"""
# Initialize magic for MIME type detection
mime = magic.Magic(mime=True)
mime_type = mime.from_file(file_path)
with open(file_path, 'r', encoding='utf-8') as file:
first_1024_bytes = file.read(1024)
if '<html' in first_1024_bytes.lower() and '</html>' in first_1024_bytes.lower():
return 'text/html'

# Define MIME types that require further inspection
ambiguous_mime_types = ['text/html', 'application/xml', 'text/xml', 'application/xhtml+xml']
if mime_type in ambiguous_mime_types:
try:
with open(file_path, 'r', encoding='utf-8') as file:
content = file.read()
except UnicodeDecodeError:
# If the file isn't UTF-8 encoded, it might not be HTML or XML
return mime_type

stripped_content = content.lstrip()
if stripped_content.startswith('<?xml'):
return 'application/xml'

# Use BeautifulSoup to parse as HTML
soup = BeautifulSoup(content, 'html.parser')
if soup.find('html'):
return 'text/html'

# Attempt to parse as XML
try:
ET.fromstring(content)
return 'application/xml'
except ET.ParseError:
pass # Not well-formed XML

# Fallback to magic-detected MIME type if unsure
return mime_type
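
The ambiguous MIME types listed above can be reported for both HTML pages and XML documents (sitemaps, feeds), so the extra content sniffing disambiguates them before falling back to the raw magic result. A usage sketch; paths are illustrative and the function needs python-magic with a working libmagic backend:

print(detect_file_type("downloads/landing.html"))   # expected: text/html
print(detect_file_type("downloads/sitemap.xml"))    # expected: application/xml
print(detect_file_type("downloads/report.pdf"))     # expected: application/pdf, straight from magic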

def remove_code_from_html(html: str) -> str: