Store docs locally (#119)
* Fixed an issue with detect_file_type
  Added a store_docs configuration option to store all file content locally

* More robust detection of HTML vs. XML
ofermend authored Sep 17, 2024
1 parent 1bdb76c commit ca23dcb
Showing 3 changed files with 76 additions and 10 deletions.
README.md: 3 changes (3 additions, 0 deletions)
@@ -178,6 +178,9 @@ vectara:
# flag: should vectara-ingest reindex if document already exists (optional)
reindex: false
# flag: store a copy of all crawled data that is indexed, in a local folder (optional)
store_docs: false
# timeout: sets the URL crawling timeout in seconds (optional)
timeout: 90
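
For illustration only (not part of this commit): a minimal Python sketch of how the new store_docs flag would be consumed, mirroring the Indexer change below; the config values shown are assumptions.

from omegaconf import OmegaConf

cfg = OmegaConf.create({
    "vectara": {
        "reindex": False,
        "store_docs": True,   # keep a local copy of everything that gets indexed
        "timeout": 90,
    }
})
store_docs = cfg.vectara.get("store_docs", False)  # -> True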
core/indexer.py: 28 changes (25 additions, 3 deletions)
@@ -4,17 +4,22 @@
from typing import Tuple, Dict, Any, List, Optional
import uuid
import pandas as pd
import shutil

import time
from slugify import slugify
import unicodedata
from slugify import slugify

from omegaconf import OmegaConf
from nbconvert import HTMLExporter # type: ignore
import nbformat
import markdown

from core.utils import html_to_text, detect_language, get_file_size_in_MB, create_session_with_retries, TableSummarizer, mask_pii, safe_remove_file, detect_file_type
from core.utils import (
html_to_text, detect_language, get_file_size_in_MB, create_session_with_retries,
TableSummarizer, mask_pii, safe_remove_file, detect_file_type,
url_to_filename
)
from core.extract import get_article_content

from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
@@ -85,6 +90,7 @@ def __init__(self, cfg: OmegaConf, endpoint: str,
self.api_key = api_key
self.reindex = cfg.vectara.get("reindex", False)
self.verbose = cfg.vectara.get("verbose", False)
self.store_docs = cfg.vectara.get("store_docs", False)
self.remove_code = cfg.vectara.get("remove_code", True)
self.remove_boilerplate = cfg.vectara.get("remove_boilerplate", False)
self.post_load_timeout = cfg.vectara.get("post_load_timeout", 5)
@@ -123,6 +129,17 @@ def setup(self, use_playwright: bool = True) -> None:
self.browser = self.p.firefox.launch(headless=True)
self.browser_use_count = 0
self.tmp_file = 'tmp_' + str(uuid.uuid4())
if self.store_docs:
self.store_docs_folder = '/home/vectara/env/indexed_docs_' + str(uuid.uuid4())
if os.path.exists(self.store_docs_folder):
shutil.rmtree(self.store_docs_folder)
os.makedirs(self.store_docs_folder)

def store_file(self, filename: str, orig_filename: str) -> None:
if self.store_docs:
dest_path = f"{self.store_docs_folder}/{orig_filename}"
shutil.copyfile(filename, dest_path)
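
# Illustrative sketch only, not part of this commit: the effect of store_file()
# when store_docs is enabled. The folder, temp file name, and URL are assumptions;
# the real folder is uuid-suffixed under /home/vectara/env.
import os
import shutil

store_docs_folder = "/tmp/indexed_docs_example"
os.makedirs(store_docs_folder, exist_ok=True)
with open("tmp_example", "w") as f:          # stand-in for the crawler's temp file
    f.write("crawled content")
# url_to_filename("https://example.com/My Report.pdf") would yield "my-report.pdf"
shutil.copyfile("tmp_example", os.path.join(store_docs_folder, "my-report.pdf"))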


def url_triggers_download(self, url: str) -> bool:
download_triggered = False
@@ -316,7 +333,7 @@ def _index_file(self, filename: str, uri: str, metadata: Dict[str, Any]) -> bool
"""
Index a file on local file system by uploading it to the Vectara corpus.
Args:
filename (str): Name of the PDF file to create.
filename (str): Name of the file to create.
uri (str): URI for where the document originated. In some cases the local file name is not the same, and we want to include this in the index.
metadata (dict): Metadata for the document.
Returns:
@@ -351,6 +368,7 @@ def get_files(filename: str, metadata: dict):
)
if response.status_code == 200:
self.logger.info(f"REST upload for {uri} successful (reindex)")
self.store_file(filename, url_to_filename(uri))
return True
else:
self.logger.info(f"REST upload for {uri} ({filename}) (reindex) failed with code = {response.status_code}, text = {response.text}")
@@ -361,6 +379,7 @@
return False

self.logger.info(f"REST upload for {uri} succeesful")
self.store_file(filename, url_to_filename(uri))
return True

def _index_document(self, document: Dict[str, Any]) -> bool:
@@ -412,6 +431,9 @@ def _index_document(self, document: Dict[str, Any]) -> bool:
self.logger.info(f"Document {document['documentId']} already exists, skipping")
return False
if "status" in result and result["status"] and "OK" in result["status"]["code"]:
if self.store_docs:
with open(f"{self.store_docs_folder}/{document['documentId']}.json", "w") as f:
json.dump(document, f)
return True

self.logger.info(f"Indexing document {document['documentId']} failed, response = {result}")
core/utils.py: 55 changes (48 additions, 7 deletions)
@@ -4,7 +4,9 @@
from pathlib import Path

from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
from urllib.parse import urljoin, urlparse
from slugify import slugify

import re
from typing import List, Set
@@ -29,7 +31,6 @@
except ImportError:
logging.info("Presidio is not installed. if PII detection and masking is requested - it will not work.")


img_extensions = [".gif", ".jpeg", ".jpg", ".mp3", ".mp4", ".png", ".svg", ".bmp", ".eps", ".ico"]
doc_extensions = [".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx", ".pdf", ".ps"]
archive_extensions = [".zip", ".gz", ".tar", ".bz2", ".7z", ".rar"]
@@ -44,18 +45,58 @@ def setup_logging():
handler.setFormatter(formatter)
root.addHandler(handler)

def url_to_filename(url):
parsed_url = urlparse(url)
path_parts = parsed_url.path.split('/')
last_part = path_parts[-1]
name, ext = os.path.splitext(last_part)
slugified_name = slugify(name)
return f"{slugified_name}{ext}"



import magic
from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET

def detect_file_type(file_path):
"""
Detect the type of a file using the `magic` library.
PDF files are detected as 'application/pdf' and HTML files as 'text/html'.
Detect the type of a file using the `magic` library and further analysis.
Returns:
str: The detected MIME type, e.g., 'text/html', 'application/xml', etc.
"""
# Initialize magic for MIME type detection
mime = magic.Magic(mime=True)
mime_type = mime.from_file(file_path)
with open(file_path, 'r', encoding='utf-8') as file:
first_1024_bytes = file.read(1024)
if '<html' in first_1024_bytes.lower() and '</html>' in first_1024_bytes.lower():
return 'text/html'

# Define MIME types that require further inspection
ambiguous_mime_types = ['text/html', 'application/xml', 'text/xml', 'application/xhtml+xml']
if mime_type in ambiguous_mime_types:
try:
with open(file_path, 'r', encoding='utf-8') as file:
content = file.read()
except UnicodeDecodeError:
# If the file isn't UTF-8 encoded, it might not be HTML or XML
return mime_type

stripped_content = content.lstrip()
if stripped_content.startswith('<?xml'):
return 'application/xml'

# Use BeautifulSoup to parse as HTML
soup = BeautifulSoup(content, 'html.parser')
if soup.find('html'):
return 'text/html'

# Attempt to parse as XML
try:
ET.fromstring(content)
return 'application/xml'
except ET.ParseError:
pass # Not well-formed XML

# Fallback to magic-detected MIME type if unsure
return mime_type
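
# Usage sketch (illustration only, not part of this commit). The files and their
# contents are assumptions, and the result for each still depends on what
# libmagic reports as the initial MIME type:
with open("page.html", "w") as f:
    f.write("<html><body>hi</body></html>")
with open("feed.xml", "w") as f:
    f.write("<?xml version='1.0' encoding='utf-8'?><items></items>")
print(detect_file_type("page.html"))  # expected: text/html
print(detect_file_type("feed.xml"))   # expected: application/xml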

def remove_code_from_html(html: str) -> str:
