Store docs locally #119

Merged (2 commits) on Sep 17, 2024
3 changes: 3 additions & 0 deletions README.md
@@ -178,6 +178,9 @@ vectara:
# flag: should vectara-ingest reindex if document already exists (optional)
reindex: false

# flag: store a copy of all crawled data that is indexed into a local folder
store_docs: false

# timeout: sets the URL crawling timeout in seconds (optional)
timeout: 90

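For reference, a minimal sketch (not part of the diff) of how the new store_docs flag would be read: it follows the cfg.vectara.get(...) pattern that core/indexer.py uses for its other options, defaulting to False when the key is absent.

# Minimal sketch, assuming the OmegaConf-based config used elsewhere in this repo.
from omegaconf import OmegaConf

cfg = OmegaConf.create("""
vectara:
  store_docs: true
""")
print(cfg.vectara.get("store_docs", False))  # True; defaults to False when the key is unset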
28 changes: 25 additions & 3 deletions core/indexer.py
@@ -4,17 +4,22 @@
from typing import Tuple, Dict, Any, List, Optional
import uuid
import pandas as pd
import shutil

import time
from slugify import slugify
import unicodedata
from slugify import slugify

from omegaconf import OmegaConf
from nbconvert import HTMLExporter # type: ignore
import nbformat
import markdown

from core.utils import html_to_text, detect_language, get_file_size_in_MB, create_session_with_retries, TableSummarizer, mask_pii, safe_remove_file, detect_file_type
from core.utils import (
html_to_text, detect_language, get_file_size_in_MB, create_session_with_retries,
TableSummarizer, mask_pii, safe_remove_file, detect_file_type,
url_to_filename
)
from core.extract import get_article_content

from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError
@@ -85,6 +90,7 @@ def __init__(self, cfg: OmegaConf, endpoint: str,
self.api_key = api_key
self.reindex = cfg.vectara.get("reindex", False)
self.verbose = cfg.vectara.get("verbose", False)
self.store_docs = cfg.vectara.get("store_docs", False)
self.remove_code = cfg.vectara.get("remove_code", True)
self.remove_boilerplate = cfg.vectara.get("remove_boilerplate", False)
self.post_load_timeout = cfg.vectara.get("post_load_timeout", 5)
@@ -123,6 +129,17 @@ def setup(self, use_playwright: bool = True) -> None:
self.browser = self.p.firefox.launch(headless=True)
self.browser_use_count = 0
self.tmp_file = 'tmp_' + str(uuid.uuid4())
if self.store_docs:
self.store_docs_folder = '/home/vectara/env/indexed_docs_' + str(uuid.uuid4())
if os.path.exists(self.store_docs_folder):
shutil.rmtree(self.store_docs_folder)
os.makedirs(self.store_docs_folder)

def store_file(self, filename: str, orig_filename) -> None:
if self.store_docs:
dest_path = f"{self.store_docs_folder}/{orig_filename}"
shutil.copyfile(filename, dest_path)

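A small standalone sketch (not in the diff) of how the new setup() logic and store_file() fit together when store_docs is enabled; the paths and file names below are made up for illustration.

# Hypothetical flow: create the local store folder, then copy an "uploaded"
# file into it under a name derived from its URI (see url_to_filename below).
import os
import shutil
import uuid

store_docs_folder = os.path.join("/tmp", "indexed_docs_" + str(uuid.uuid4()))
os.makedirs(store_docs_folder, exist_ok=True)

tmp_file = "tmp_example.html"                      # stands in for self.tmp_file
with open(tmp_file, "w") as f:
    f.write("<html><body>example</body></html>")

dest_path = os.path.join(store_docs_folder, "user-guide.html")  # assumed url_to_filename output
shutil.copyfile(tmp_file, dest_path)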

def url_triggers_download(self, url: str) -> bool:
download_triggered = False
@@ -316,7 +333,7 @@ def _index_file(self, filename: str, uri: str, metadata: Dict[str, Any]) -> bool
"""
Index a file on local file system by uploading it to the Vectara corpus.
Args:
filename (str): Name of the PDF file to create.
filename (str): Name of the file to create.
uri (str): URI for where the document originated. In some cases the local file name is not the same, and we want to include this in the index.
metadata (dict): Metadata for the document.
Returns:
@@ -351,6 +368,7 @@ def get_files(filename: str, metadata: dict):
)
if response.status_code == 200:
self.logger.info(f"REST upload for {uri} successful (reindex)")
self.store_file(filename, url_to_filename(uri))
return True
else:
self.logger.info(f"REST upload for {uri} ({filename}) (reindex) failed with code = {response.status_code}, text = {response.text}")
@@ -361,6 +379,7 @@
return False

self.logger.info(f"REST upload for {uri} succeesful")
self.store_file(filename, url_to_filename(uri))
return True

def _index_document(self, document: Dict[str, Any]) -> bool:
@@ -412,6 +431,9 @@ def _index_document(self, document: Dict[str, Any]) -> bool:
self.logger.info(f"Document {document['documentId']} already exists, skipping")
return False
if "status" in result and result["status"] and "OK" in result["status"]["code"]:
if self.store_docs:
with open(f"{self.store_docs_folder}/{document['documentId']}.json", "w") as f:
json.dump(document, f)
return True

self.logger.info(f"Indexing document {document['documentId']} failed, response = {result}")
17 changes: 13 additions & 4 deletions core/utils.py
@@ -5,6 +5,7 @@

from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from slugify import slugify

import re
from typing import List, Set
@@ -44,6 +45,13 @@ def setup_logging():
handler.setFormatter(formatter)
root.addHandler(handler)

def url_to_filename(url):
parsed_url = urlparse(url)
path_parts = parsed_url.path.split('/')
last_part = path_parts[-1]
name, ext = os.path.splitext(last_part)
slugified_name = slugify(name)
return f"{slugified_name}{ext}"

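For illustration (not part of the diff), the kind of filename this helper produces; the URL below is made up and the behaviour assumes python-slugify semantics.

# The last path segment is slugified and its extension is preserved.
print(url_to_filename("https://example.com/docs/User Guide v2.pdf"))
# -> "user-guide-v2.pdf"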
def detect_file_type(file_path):
"""
@@ -52,10 +60,11 @@ def detect_file_type(file_path):
"""
mime = magic.Magic(mime=True)
mime_type = mime.from_file(file_path)
with open(file_path, 'r', encoding='utf-8') as file:
first_1024_bytes = file.read(1024)
if '<html' in first_1024_bytes.lower() and '</html>' in first_1024_bytes.lower():
return 'text/html'
if mime_type in ['text/html', 'application/xml', 'text/xml']:
with open(file_path, 'r', encoding='utf-8') as file:
first_1024_bytes = file.read(1024)
if '<html' in first_1024_bytes.lower() and '</html>' in first_1024_bytes.lower():
return 'text/html'
return mime_type

Review comment (Collaborator), on the first_1024_bytes check above:
Why is this looking for the closing html tag in the first 1024 bytes? Shouldn't that be looked for in the last 1024 bytes?

Reply (Collaborator, PR author):
It is actually okay since usually XML files look this way (very close start/end HTML tags), but let me see if I can figure out a more robust detection option.

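As a possible follow-up to the review thread above (not part of this PR), a sketch of a slightly more robust check that looks for the opening tag near the start of the file and the closing tag near the end:

# Illustrative sketch only: read the first and last `window` bytes in binary
# mode and require '<html' near the head and '</html>' near the tail.
import os

def looks_like_html(file_path: str, window: int = 1024) -> bool:
    size = os.path.getsize(file_path)
    with open(file_path, 'rb') as f:
        head = f.read(window)
        f.seek(max(0, size - window))
        tail = f.read(window)
    return (b'<html' in head.lower()) and (b'</html>' in tail.lower())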
def remove_code_from_html(html: str) -> str: