diff --git a/README.md b/README.md
index a363b0f..6004338 100644
--- a/README.md
+++ b/README.md
@@ -178,6 +178,9 @@ vectara:
   # flag: should vectara-ingest reindex if document already exists (optional)
   reindex: false

+  # flag: store a local copy of every document that is indexed (optional)
+  store_docs: false
+
   # timeout: sets the URL crawling timeout in seconds (optional)
   timeout: 90

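For reference, a minimal sketch of a `vectara` config section with the new flag enabled (only the options shown above are included; everything else keeps its defaults):

```yaml
vectara:
  reindex: false
  # with store_docs enabled, every indexed document is also written to a
  # uniquely named local folder (created in core/indexer.py, see below)
  store_docs: true
  timeout: 90
```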
diff --git a/core/indexer.py b/core/indexer.py
index 2bb4094..5d3493e 100644
--- a/core/indexer.py
+++ b/core/indexer.py
@@ -4,17 +4,22 @@ from typing import Tuple, Dict, Any, List, Optional
 import uuid
 import pandas as pd
+import shutil
 import time
-from slugify import slugify
 import unicodedata
+from slugify import slugify

 from omegaconf import OmegaConf
 from nbconvert import HTMLExporter  # type: ignore
 import nbformat
 import markdown

-from core.utils import html_to_text, detect_language, get_file_size_in_MB, create_session_with_retries, TableSummarizer, mask_pii, safe_remove_file, detect_file_type
+from core.utils import (
+    html_to_text, detect_language, get_file_size_in_MB, create_session_with_retries,
+    TableSummarizer, mask_pii, safe_remove_file, detect_file_type,
+    url_to_filename
+)
 from core.extract import get_article_content

 from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
@@ -85,6 +90,7 @@ def __init__(self, cfg: OmegaConf, endpoint: str,
         self.api_key = api_key
         self.reindex = cfg.vectara.get("reindex", False)
         self.verbose = cfg.vectara.get("verbose", False)
+        self.store_docs = cfg.vectara.get("store_docs", False)
         self.remove_code = cfg.vectara.get("remove_code", True)
         self.remove_boilerplate = cfg.vectara.get("remove_boilerplate", False)
         self.post_load_timeout = cfg.vectara.get("post_load_timeout", 5)
@@ -123,6 +129,17 @@ def setup(self, use_playwright: bool = True) -> None:
             self.browser = self.p.firefox.launch(headless=True)
             self.browser_use_count = 0
         self.tmp_file = 'tmp_' + str(uuid.uuid4())
+        if self.store_docs:
+            self.store_docs_folder = '/home/vectara/env/indexed_docs_' + str(uuid.uuid4())
+            if os.path.exists(self.store_docs_folder):
+                shutil.rmtree(self.store_docs_folder)
+            os.makedirs(self.store_docs_folder)
+
+    def store_file(self, filename: str, orig_filename: str) -> None:
+        if self.store_docs:
+            dest_path = f"{self.store_docs_folder}/{orig_filename}"
+            shutil.copyfile(filename, dest_path)
+
     def url_triggers_download(self, url: str) -> bool:
         download_triggered = False
@@ -316,7 +333,7 @@ def _index_file(self, filename: str, uri: str, metadata: Dict[str, Any]) -> bool:
         """
         Index a file on the local file system by uploading it to the Vectara corpus.
         Args:
-            filename (str): Name of the PDF file to create.
+            filename (str): Name of the file to upload.
             uri (str): URI for where the document originated. In some cases the local file name is not the same, and we want to include this in the index.
             metadata (dict): Metadata for the document.
         Returns:
@@ -351,6 +368,7 @@ def get_files(filename: str, metadata: dict):
                 )
                 if response.status_code == 200:
                     self.logger.info(f"REST upload for {uri} successful (reindex)")
+                    self.store_file(filename, url_to_filename(uri))
                     return True
                 else:
                     self.logger.info(f"REST upload for {uri} ({filename}) (reindex) failed with code = {response.status_code}, text = {response.text}")
@@ -361,6 +379,7 @@
             return False

         self.logger.info(f"REST upload for {uri} successful")
+        self.store_file(filename, url_to_filename(uri))
         return True

     def _index_document(self, document: Dict[str, Any]) -> bool:
@@ -412,6 +431,9 @@ def _index_document(self, document: Dict[str, Any]) -> bool:
                 self.logger.info(f"Document {document['documentId']} already exists, skipping")
                 return False
         if "status" in result and result["status"] and "OK" in result["status"]["code"]:
+            if self.store_docs:
+                with open(f"{self.store_docs_folder}/{document['documentId']}.json", "w") as f:
+                    json.dump(document, f)
             return True

         self.logger.info(f"Indexing document {document['documentId']} failed, response = {result}")
diff --git a/core/utils.py b/core/utils.py
index b5cc55f..b5e68d3 100644
--- a/core/utils.py
+++ b/core/utils.py
@@ -4,7 +4,9 @@
 from pathlib import Path

 from bs4 import BeautifulSoup
+import xml.etree.ElementTree as ET
 from urllib.parse import urljoin, urlparse
+from slugify import slugify

 import re
 from typing import List, Set
@@ -29,7 +31,6 @@
 except ImportError:
     logging.info("Presidio is not installed. If PII detection and masking is requested, it will not work.")

-
 img_extensions = [".gif", ".jpeg", ".jpg", ".mp3", ".mp4", ".png", ".svg", ".bmp", ".eps", ".ico"]
 doc_extensions = [".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx", ".pdf", ".ps"]
 archive_extensions = [".zip", ".gz", ".tar", ".bz2", ".7z", ".rar"]
@@ -44,18 +45,58 @@ def setup_logging():
     handler.setFormatter(formatter)
     root.addHandler(handler)

+def url_to_filename(url: str) -> str:
+    """Turn the last path segment of a URL into a slugified, filesystem-safe name, preserving the extension."""
+    parsed_url = urlparse(url)
+    path_parts = parsed_url.path.split('/')
+    last_part = path_parts[-1]
+    name, ext = os.path.splitext(last_part)
+    slugified_name = slugify(name)
+    return f"{slugified_name}{ext}"
+
 def detect_file_type(file_path):
     """
-    Detect the type of a file using the `magic` library.
-    PDF files are detected as 'application/pdf' and HTML files as 'text/html'.
+    Detect the type of a file using the `magic` library and further analysis.
+
+    Returns:
+        str: The detected MIME type, e.g., 'text/html', 'application/xml', etc.
     """
+    # Initialize magic for MIME type detection
     mime = magic.Magic(mime=True)
     mime_type = mime.from_file(file_path)
-    with open(file_path, 'r', encoding='utf-8') as file:
-        first_1024_bytes = file.read(1024)
-    if '<html' in first_1024_bytes.lower():
-        return 'text/html'
+
+    # Define MIME types that require further inspection
+    ambiguous_mime_types = ['text/html', 'application/xml', 'text/xml', 'application/xhtml+xml']
+    if mime_type in ambiguous_mime_types:
+        try:
+            with open(file_path, 'r', encoding='utf-8') as file:
+                content = file.read()
+        except UnicodeDecodeError:
+            # If the file isn't UTF-8 encoded, it might not be HTML or XML
+            return mime_type
+
+        stripped_content = content.lstrip()
+        if stripped_content.startswith('<?xml'):
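For illustration, a minimal sketch of how the two new helpers in `core/utils.py` behave. This is not part of the diff; it assumes `python-magic` and `python-slugify` are installed and `core.utils` is importable, the URL and file contents are made up, and the exact return value of the XML branch is an assumption since the tail of the diff is cut off:

```python
import tempfile

from core.utils import detect_file_type, url_to_filename

# url_to_filename slugifies the last path segment but keeps the extension,
# so stored copies get filesystem-safe yet recognizable names.
print(url_to_filename("https://example.com/files/Annual_Report_2023.pdf"))
# expected: "annual-report-2023.pdf"

# detect_file_type falls back to content sniffing when libmagic reports an
# ambiguous MIME type: a leading XML declaration should mark the file as XML
# even if it was named like HTML.
with tempfile.NamedTemporaryFile("w", suffix=".html", delete=False) as f:
    f.write('<?xml version="1.0"?><root/>')
print(detect_file_type(f.name))
# expected (assumed): an XML MIME type such as 'application/xml' or 'text/xml'
```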