Store docs locally #119

Merged · 2 commits · Sep 17, 2024
README.md: 3 additions & 0 deletions
@@ -178,6 +178,9 @@ vectara:
# flag: should vectara-ingest reindex if document already exists (optional)
reindex: false

  # flag: store a local copy of all crawled data that is indexed (optional)
store_docs: false

# timeout: sets the URL crawling timeout in seconds (optional)
timeout: 90
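
For reference, here is a minimal sketch of how such a flag is picked up on the indexer side; the cfg.vectara.get(...) lookup mirrors the core/indexer.py change below, while the inline config literal is purely illustrative.

from omegaconf import OmegaConf

# Illustrative config fragment mirroring the README example above.
cfg = OmegaConf.create("""
vectara:
  reindex: false
  store_docs: true
  timeout: 90
""")

# Same lookup pattern used in core/indexer.py: default to False when the key is absent.
store_docs = cfg.vectara.get("store_docs", False)
print(store_docs)  # True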

core/indexer.py: 25 additions & 3 deletions
@@ -4,17 +4,22 @@
from typing import Tuple, Dict, Any, List, Optional
import uuid
import pandas as pd
import shutil

import time
from slugify import slugify
import unicodedata

from omegaconf import OmegaConf
from nbconvert import HTMLExporter # type: ignore
import nbformat
import markdown

from core.utils import html_to_text, detect_language, get_file_size_in_MB, create_session_with_retries, TableSummarizer, mask_pii, safe_remove_file, detect_file_type
from core.utils import (
html_to_text, detect_language, get_file_size_in_MB, create_session_with_retries,
TableSummarizer, mask_pii, safe_remove_file, detect_file_type,
url_to_filename
)
from core.extract import get_article_content

from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
@@ -85,6 +90,7 @@ def __init__(self, cfg: OmegaConf, endpoint: str,
self.api_key = api_key
self.reindex = cfg.vectara.get("reindex", False)
self.verbose = cfg.vectara.get("verbose", False)
self.store_docs = cfg.vectara.get("store_docs", False)
self.remove_code = cfg.vectara.get("remove_code", True)
self.remove_boilerplate = cfg.vectara.get("remove_boilerplate", False)
self.post_load_timeout = cfg.vectara.get("post_load_timeout", 5)
@@ -123,6 +129,17 @@ def setup(self, use_playwright: bool = True) -> None:
self.browser = self.p.firefox.launch(headless=True)
self.browser_use_count = 0
self.tmp_file = 'tmp_' + str(uuid.uuid4())
if self.store_docs:
self.store_docs_folder = '/home/vectara/env/indexed_docs_' + str(uuid.uuid4())
if os.path.exists(self.store_docs_folder):
shutil.rmtree(self.store_docs_folder)
os.makedirs(self.store_docs_folder)

def store_file(self, filename: str, orig_filename: str) -> None:
    # When store_docs is enabled, keep a copy of the indexed file under its original (URL-derived) name.
    if self.store_docs:
        dest_path = f"{self.store_docs_folder}/{orig_filename}"
        shutil.copyfile(filename, dest_path)
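
In effect, setup() recreates a fresh per-run folder and store_file() copies each successfully indexed file into it under a stable name. A standalone sketch of the same behaviour, assuming a /tmp path instead of the /home/vectara/env location used above:

import os
import shutil
import uuid

# Per-run folder, recreated from scratch (mirrors the setup() change above).
store_docs_folder = '/tmp/indexed_docs_' + str(uuid.uuid4())
if os.path.exists(store_docs_folder):
    shutil.rmtree(store_docs_folder)
os.makedirs(store_docs_folder)

def store_file(filename: str, orig_filename: str) -> None:
    # Copy a just-indexed temp file into the folder under its URL-derived name.
    shutil.copyfile(filename, os.path.join(store_docs_folder, orig_filename))

# Hypothetical call site, analogous to _index_file() below after a successful upload:
# store_file("tmp_3f2a1c.html", url_to_filename("https://example.com/docs/intro.html"))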


def url_triggers_download(self, url: str) -> bool:
download_triggered = False
@@ -316,7 +333,7 @@ def _index_file(self, filename: str, uri: str, metadata: Dict[str, Any]) -> bool
"""
Index a file on local file system by uploading it to the Vectara corpus.
Args:
filename (str): Name of the PDF file to create.
filename (str): Name of the file to create.
uri (str): URI for where the document originated. In some cases the local file name is not the same, and we want to include this in the index.
metadata (dict): Metadata for the document.
Returns:
@@ -351,6 +368,7 @@ def get_files(filename: str, metadata: dict):
)
if response.status_code == 200:
self.logger.info(f"REST upload for {uri} successful (reindex)")
self.store_file(filename, url_to_filename(uri))
return True
else:
self.logger.info(f"REST upload for {uri} ({filename}) (reindex) failed with code = {response.status_code}, text = {response.text}")
@@ -361,6 +379,7 @@
return False

self.logger.info(f"REST upload for {uri} succeesful")
self.store_file(filename, url_to_filename(uri))
return True

def _index_document(self, document: Dict[str, Any]) -> bool:
@@ -412,6 +431,9 @@ def _index_document(self, document: Dict[str, Any]) -> bool:
self.logger.info(f"Document {document['documentId']} already exists, skipping")
return False
if "status" in result and result["status"] and "OK" in result["status"]["code"]:
if self.store_docs:
with open(f"{self.store_docs_folder}/{document['documentId']}.json", "w") as f:
json.dump(document, f)
return True

self.logger.info(f"Indexing document {document['documentId']} failed, response = {result}")
core/utils.py: 48 additions & 7 deletions
@@ -4,7 +4,9 @@
from pathlib import Path

from bs4 import BeautifulSoup
import xml.etree.ElementTree as ET
from urllib.parse import urljoin, urlparse
from slugify import slugify

import re
from typing import List, Set
@@ -29,7 +31,6 @@
except ImportError:
logging.info("Presidio is not installed. if PII detection and masking is requested - it will not work.")


img_extensions = [".gif", ".jpeg", ".jpg", ".mp3", ".mp4", ".png", ".svg", ".bmp", ".eps", ".ico"]
doc_extensions = [".doc", ".docx", ".ppt", ".pptx", ".xls", ".xlsx", ".pdf", ".ps"]
archive_extensions = [".zip", ".gz", ".tar", ".bz2", ".7z", ".rar"]
@@ -44,18 +45,58 @@ def setup_logging():
handler.setFormatter(formatter)
root.addHandler(handler)

def url_to_filename(url: str) -> str:
    """Derive a local filename from a URL: slugify the last path segment and keep its extension."""
    parsed_url = urlparse(url)
    path_parts = parsed_url.path.split('/')
    last_part = path_parts[-1]
    name, ext = os.path.splitext(last_part)
    slugified_name = slugify(name)
    return f"{slugified_name}{ext}"



import magic

def detect_file_type(file_path):
"""
Detect the type of a file using the `magic` library.
PDF files are detected as 'application/pdf' and HTML files as 'text/html'.
Detect the type of a file using the `magic` library and further analysis.

Returns:
str: The detected MIME type, e.g., 'text/html', 'application/xml', etc.
"""
# Initialize magic for MIME type detection
mime = magic.Magic(mime=True)
mime_type = mime.from_file(file_path)
with open(file_path, 'r', encoding='utf-8') as file:
first_1024_bytes = file.read(1024)
if '<html' in first_1024_bytes.lower() and '</html>' in first_1024_bytes.lower():
return 'text/html'

# Define MIME types that require further inspection
ambiguous_mime_types = ['text/html', 'application/xml', 'text/xml', 'application/xhtml+xml']
if mime_type in ambiguous_mime_types:
try:
with open(file_path, 'r', encoding='utf-8') as file:
content = file.read()
except UnicodeDecodeError:
# If the file isn't UTF-8 encoded, it might not be HTML or XML
return mime_type

stripped_content = content.lstrip()
if stripped_content.startswith('<?xml'):
return 'application/xml'

# Use BeautifulSoup to parse as HTML
soup = BeautifulSoup(content, 'html.parser')
if soup.find('html'):
return 'text/html'

# Attempt to parse as XML
try:
ET.fromstring(content)
return 'application/xml'
except ET.ParseError:
pass # Not well-formed XML

# Fallback to magic-detected MIME type if unsure
return mime_type
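
The ambiguous MIME types listed above can be reported for both HTML pages and XML documents (sitemaps, feeds), so the extra content sniffing disambiguates them before falling back to the raw magic result. A usage sketch; paths are illustrative and the function needs python-magic with a working libmagic backend:

print(detect_file_type("downloads/landing.html"))   # expected: text/html
print(detect_file_type("downloads/sitemap.xml"))    # expected: application/xml
print(detect_file_type("downloads/report.pdf"))     # expected: application/pdf, straight from magic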

def remove_code_from_html(html: str) -> str: