Add docling support (#132)
* initial

* added .dockerignore

* updated Dockerfile

* updated

* updates

* updates

* bug fixes

* bugfix

* bugfix

* bug fixes

* added metadata
ofermend authored Dec 7, 2024
1 parent 9c979ae commit 8d0233a
Showing 24 changed files with 293 additions and 250 deletions.
6 changes: 6 additions & 0 deletions .dockerignore
@@ -0,0 +1,6 @@
.git
__pycache__
*.pyc
.env
.vscode
.DS_Store
98 changes: 60 additions & 38 deletions Dockerfile
@@ -1,64 +1,86 @@
FROM ubuntu:22.04
# Stage 1: Build stage
FROM python:3.11-slim AS builder

ENV DEBIAN_FRONTEND=noninteractive \
HOME=/home/vectara \
XDG_RUNTIME_DIR=/tmp \
RAY_DEDUP_LOGS="0" \
CUDA_VISIBLE_DEVICES=""

RUN sed 's/main$/main universe/' -i /etc/apt/sources.list
CUDA_VISIBLE_DEVICES="" \
UV_SYSTEM_PYTHON=1

# Install build dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
libopenblas-dev \
unzip \
wget \
git \
curl \
wkhtmltopdf \
libssl-dev \
unixodbc \
poppler-utils \
tesseract-ocr \
libtesseract-dev \
xvfb \
python3-pip python3-dev \
libmagic1 \
libfontconfig fontconfig \
libjpeg-turbo8 \
fonts-noto-color-emoji unifont fonts-indic xfonts-75dpi \
&& apt-get purge -y \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
python3-dev \
&& rm -rf /var/lib/apt/lists/* /tmp/*

RUN rm -f /usr/share/fonts/truetype/unifont/unifont_sample.ttf /usr/share/fonts/truetype/noto/NotoColorEmoji.ttf
ENV OMP_NUM_THREADS=4

# Install python packages
# Install Python packages
WORKDIR ${HOME}
COPY requirements.txt requirements-extra.txt $HOME/

RUN pip install --no-cache-dir torch==2.4.1 --index-url https://download.pytorch.org/whl/cpu \
&& pip install --no-cache-dir -r requirements.txt \
&& playwright install --with-deps firefox \
&& find /usr/local -type d \( -name test -o -name tests \) -exec rm -rf '{}' + \
&& find /usr/local -type f \( -name '*.pyc' -o -name '*.pyo' \) -exec rm -rf '{}' + \
&& find /usr/local -type d \( -name '__pycache__' \) -exec rm -rf '{}' + \
&& find /usr/local -type d \( -name 'build' \) -exec rm -rf '{}' + \
&& rm -rf /root/.cache/* /tmp/* \
&& pip cache purge
RUN pip install --no-cache-dir uv==0.5.6
RUN uv pip install --no-cache-dir torch==2.4.1 torchvision==0.19.1 --index-url https://download.pytorch.org/whl/cpu \
&& uv pip install --no-cache-dir -r requirements.txt

# Install additional large packages for all-docs unstructured inference and PII detection
ARG INSTALL_EXTRA=false
RUN if [ "$INSTALL_EXTRA" = "true" ]; then \
pip3 install --no-cache-dir -r requirements-extra.txt && \
uv pip install --no-cache-dir -r requirements-extra.txt && \
python3 -m spacy download en_core_web_lg; \
fi

# Clean up unnecessary files
RUN find /usr/local -type d \( -name test -o -name tests \) -exec rm -rf '{}' + \
&& find /usr/local -type f \( -name '*.pyc' -o -name '*.pyo' \) -exec rm -rf '{}' + \
&& find /usr/local -type d -name '__pycache__' -exec rm -rf '{}' + \
&& rm -rf /root/.cache/* /tmp/*

# Clean up unnecessary files in site-packages
RUN find /usr/local/lib/python3.11/site-packages \
-type d \( -name 'tests' -o -name 'test' -o -name 'examples' \) -exec rm -rf '{}' + \
&& find /usr/local/lib/python3.11/site-packages -type d -name '__pycache__' -exec rm -rf '{}' + \
&& find /usr/local/lib/python3.11/site-packages -type f -name '*.pyc' -exec rm -f '{}' + \
&& find /usr/local/lib/python3.11/site-packages -type f -name '*.pyo' -exec rm -f '{}' +

# Stage 2: Final image
FROM python:3.11-slim

ENV DEBIAN_FRONTEND=noninteractive \
HOME=/home/vectara \
XDG_RUNTIME_DIR=/tmp \
RAY_DEDUP_LOGS="0" \
CUDA_VISIBLE_DEVICES=""

# Install runtime dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
# libopenblas-dev \
tesseract-ocr \
# xvfb \
unixodbc poppler-utils libmagic1 libjpeg62-turbo \
libfontconfig fonts-noto-color-emoji unifont fonts-indic xfonts-75dpi \
&& rm -rf /var/lib/apt/lists/*

# Copy Python packages and application code from the builder stage
COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
COPY --from=builder /usr/local/bin /usr/local/bin

# Install Playwright browsers
RUN playwright install --with-deps firefox \
&& rm -f /usr/local/bin/pwdebug \
&& rm -rf /var/lib/apt/lists/* /tmp/* /root/.cache/*

# Set working directory
WORKDIR ${HOME}

# Copy application code
COPY *.py $HOME/
COPY core/*.py $HOME/core/
COPY crawlers/ $HOME/crawlers/

#SHELL ["/bin/bash", "-c"]

# Set entrypoint and command
ENTRYPOINT ["/bin/bash", "-l", "-c"]
CMD ["python3 ingest.py $CONFIG $PROFILE"]
CMD ["python3 ingest.py $CONFIG $PROFILE"]
46 changes: 28 additions & 18 deletions README.md
@@ -197,24 +197,6 @@ vectara:
# this can be helpful when processing news pages or others which have a lot of advertising content
remove_boilerplate: false
# flag: enable special processing for tables, or images (inside PDF, HTML, PPT, DOCX; optional)
# Notes:
# 1. This processing uses OpenAI, and requires you to list the OPENAI_API_KEY in your `secrets.toml` under a special profile called `general`.
# 2. When crawling PDF, PPTX or DOCX files
# - if summarize_tables is enabled, the code will extract table content, then use GPT-4o to summarize the table, and ingest the summarized text into Vectara.
# - if summarize_images is enabled, the code will use GPT-4o vision to summarize the content of the images.
# 3. This processing is quite slow and will require you to have an additional paid subscription to OpenAI.
# For PDF files, the code uses the "detectron2_onnx" unstructured model to detect tables.
# You can modify this to use one of the alternatives (https://unstructured-io.github.io/unstructured/best_practices/models.html) if you want a slower but more performant model.
# See [here](TABLE_SUMMARY.md) for some examples of how table summary works.
summarize_tables: false
summarize_images: false
# If using Unstructured to process files locally, we define a few arguments
unst_chunking_strategy: none # chunking strategy to use: basic, by_title or none; default none
unst_chunk_size: 1024 # chunk size if using unstructured chunking; default 1024
unst_use_core_indexing: true # whether to use core_indexing which maintains the chunks from unstructured, or let vectara chunk further
# Whether masking of PII is attempted on all text fields (title, text, metadata values)
# Notes:
# 1. This masking is never done on files uploaded to Vectara directly (via e.g. indexer.index_file())
@@ -226,6 +208,34 @@ vectara:
whisper_model: the model name for whisper
doc_processing:
# Whether or not to summarize table content with GPT-4o (inside PDF, HTML, PPT, DOCX; optional)
# When using this feature, you need to list the OPENAI_API_KEY in your `secrets.toml` under a special profile called `general`.
# This processing is quite slow and will require you to have an additional paid subscription to OpenAI.
# See [here](TABLE_SUMMARY.md) for some examples of how table summary works.
summarize_tables: false
# Whether or not to summarize image content using GPT-4o vision
# When using this feature, you need to list the OPENAI_API_KEY in your `secrets.toml` under a special profile called `general`.
# This processing is quite slow and will require you to have an additional paid subscription to OpenAI.
summarize_images: false
# which document parser to use for local file parsing: unstructured or docling
doc_parser: unstructured
# whether to use core_indexing which maintains the chunks from unstructured or docling, or let vectara chunk further
use_core_indexing: false
# Unstructured document parsing configuration
unstructured_config:
chunking_strategy: none # chunking strategy to use: basic, by_title or none; default none
chunk_size: 1024 # chunk size if using unstructured chunking; default 1024
# Docling document parsing configuration
docling_config:
chunk: false # Whether to use Docling Chunking
crawling:
# type of crawler; valid options are website, docusaurus, notion, jira, rss, mediawiki, discourse, github and others (this continues to evolve as new crawler types are added)
crawler_type: XXX
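For reference, a hypothetical doc_processing block that exercises the new Docling parser might look like the sketch below. The keys mirror the options documented in the README section above; the specific values (Docling parsing, Docling chunks kept via core indexing, table summaries enabled) are illustrative choices rather than recommended defaults.

vectara:
  doc_processing:
    summarize_tables: true     # requires OPENAI_API_KEY under the `general` profile in secrets.toml
    summarize_images: false
    doc_parser: docling        # parse local files with Docling instead of unstructured
    use_core_indexing: true    # keep the chunks produced by Docling rather than letting Vectara re-chunk
    docling_config:
      chunk: true              # enable Docling chunking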
1 change: 0 additions & 1 deletion config/askFeynman.yaml
@@ -11,4 +11,3 @@ website_crawler:
num_per_second: 1
pos_regex: ["https://www.feynmanlectures.caltech.edu/.*"]
pages_source: sitemap # options are: (1) 'sitemap' automatically retrieved from website (2) 'crawl' for recursive crawling
extraction: playwright # pdf or playwright
1 change: 0 additions & 1 deletion config/askHBS.yaml
@@ -11,4 +11,3 @@ website_crawler:
pos_regex: [".*hbs.edu.*"]
num_per_second: 1
pages_source: sitemap
extraction: playwright
1 change: 0 additions & 1 deletion config/legalaid-IL.yaml
@@ -11,4 +11,3 @@ website_crawler:
pos_regex: [".*illinoislegalaid.*"]
num_per_second: 1
pages_source: sitemap # options are: (1) 'sitemap' automatically retrieved from website (2) 'crawl' for recursive crawling
extraction: playwright # pdf or playwright
1 change: 0 additions & 1 deletion config/lethain.yaml
@@ -11,4 +11,3 @@ website_crawler:
["https://infraeng.dev/", "https://staffeng.com/", "https://lethain.com/"]
num_per_second: 1
pages_source: sitemap # options are: (1) 'sitemap' automatically retrieved from website (2) 'crawl' for recursive crawling
extraction: playwright # pdf or playwright
1 change: 0 additions & 1 deletion config/news-bbc.yaml
@@ -19,4 +19,3 @@ rss_crawler:
]
days_past: 30
delay: 1
extraction: playwright # pdf or playwright
1 change: 0 additions & 1 deletion config/news-cnbc.yaml
@@ -19,4 +19,3 @@ rss_crawler:
]
days_past: 30
delay: 1
extraction: playwright # pdf or playwright
1 change: 0 additions & 1 deletion config/news-cnn.yaml
@@ -38,4 +38,3 @@ rss_crawler:
]
days_past: 30
delay: 1
extraction: playwright # pdf or playwright
1 change: 0 additions & 1 deletion config/news-fox.yaml
@@ -16,4 +16,3 @@ rss_crawler:
]
days_past: 30
delay: 1
extraction: playwright # pdf or playwright
1 change: 0 additions & 1 deletion config/news-npr.yaml
@@ -17,4 +17,3 @@ rss_crawler:
]
days_past: 30
delay: 1
extraction: playwright # pdf or playwright
1 change: 0 additions & 1 deletion config/sf.yaml
@@ -14,5 +14,4 @@ website_crawler:
neg_regex: [".*sf.gov/es/.*", ".*sf.gov/fil/.*", ".*sf.gov/zh-hant/.*"]
pages_source: crawl # options are: (1) 'sitemap' automatically retrieved from website (2) 'crawl' for recursive crawling
max_depth: 3
extraction: playwright # pdf or playwright
ray_workers: 0
1 change: 0 additions & 1 deletion config/vectara-website.yaml
@@ -11,5 +11,4 @@ website_crawler:
num_per_second: 1
pages_source: sitemap
pos_regex: [".*"]
extraction: playwright
ray_workers: 0
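Each of these config updates drops the same per-crawler line, `extraction: playwright # pdf or playwright`, which appears to be obsolete now that the PDF-conversion path is removed from the base crawler (see the core/crawler.py diff below). A website crawler config after this change might look roughly like the following sketch, using only keys that appear in the files above; the regex value is a hypothetical placeholder.

website_crawler:
  pos_regex: [".*example.com.*"]   # hypothetical domain, for illustration only
  num_per_second: 1
  pages_source: sitemap            # 'sitemap' (retrieved automatically) or 'crawl' (recursive crawling)
  ray_workers: 0
  # no 'extraction' key is needed any more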
42 changes: 0 additions & 42 deletions core/crawler.py
@@ -5,7 +5,6 @@
import logging
from typing import Set, Optional, List, Any
from core.indexer import Indexer
from core.pdf_convert import PDFConverter
from core.utils import img_extensions, doc_extensions, archive_extensions
from slugify import slugify
from urllib.parse import urlparse
@@ -92,44 +91,3 @@ def __init__(
self.cfg: DictConfig = DictConfig(cfg)
self.indexer = Indexer(cfg, endpoint, customer_id, corpus_id, api_key)
self.verbose = cfg.vectara.get("verbose", False)

def url_to_file(self, url: str, title: str) -> str:
"""
Crawl a single webpage and create a PDF file to reflect its rendered content.
Args:
url (str): URL of the page to crawl.
title (str): Title to use in case HTML does not have its own title.
Returns:
str: Name of the PDF file created.
"""
# first verify the URL is valid
response = requests.get(url, headers=get_headers)
if response.status_code != 200:
if response.status_code == 404:
raise Exception(f"Error 404 - URL not found: {url}")
elif response.status_code == 401:
raise Exception(f"Error 403 - Unauthorized: {url}")
elif response.status_code == 403:
raise Exception(f"Error 403 - Access forbidden: {url}")
elif response.status_code == 405:
raise Exception(f"Error 405 - Method not allowed: {url}")
else:
raise Exception(
f"Invalid URL: {url} (status code={response.status_code}, reason={response.reason})"
)

if title is None or len(title)==0:
soup = BeautifulSoup(response.text, "html.parser")
title = str(soup.title)

# convert to local file (PDF)
filename = slugify(url) + ".pdf"
if not PDFConverter(use_pdfkit=False).from_url(url, filename, title=title):
raise Exception(f"Failed to convert {url} to PDF")

return filename

def crawl(self) -> None:
raise Exception("Not implemented")
