Add docling support (#132)
* initial

* added .dockerignore

* updated Dockerfile

* updated

* updates

* updates

* bug fixes

* bugfix

* bugfix

* bug fixes

* added metadata
ofermend authored Dec 7, 2024
1 parent 9c979ae commit 8d0233a
Showing 24 changed files with 293 additions and 250 deletions.
6 changes: 6 additions & 0 deletions .dockerignore
@@ -0,0 +1,6 @@
.git
__pycache__
*.pyc
.env
.vscode
.DS_Store
98 changes: 60 additions & 38 deletions Dockerfile
@@ -1,64 +1,86 @@
FROM ubuntu:22.04
# Stage 1: Build stage
FROM python:3.11-slim AS builder

ENV DEBIAN_FRONTEND=noninteractive \
HOME=/home/vectara \
XDG_RUNTIME_DIR=/tmp \
RAY_DEDUP_LOGS="0" \
CUDA_VISIBLE_DEVICES=""

RUN sed 's/main$/main universe/' -i /etc/apt/sources.list
CUDA_VISIBLE_DEVICES="" \
UV_SYSTEM_PYTHON=1

# Install build dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
build-essential \
libopenblas-dev \
unzip \
wget \
git \
curl \
wkhtmltopdf \
libssl-dev \
unixodbc \
poppler-utils \
tesseract-ocr \
libtesseract-dev \
xvfb \
python3-pip python3-dev \
libmagic1 \
libfontconfig fontconfig \
libjpeg-turbo8 \
fonts-noto-color-emoji unifont fonts-indic xfonts-75dpi \
&& apt-get purge -y \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
python3-dev \
&& rm -rf /var/lib/apt/lists/* /tmp/*

RUN rm -f /usr/share/fonts/truetype/unifont/unifont_sample.ttf /usr/share/fonts/truetype/noto/NotoColorEmoji.ttf
ENV OMP_NUM_THREADS=4

# Install python packages
# Install Python packages
WORKDIR ${HOME}
COPY requirements.txt requirements-extra.txt $HOME/

RUN pip install --no-cache-dir torch==2.4.1 --index-url https://download.pytorch.org/whl/cpu \
&& pip install --no-cache-dir -r requirements.txt \
&& playwright install --with-deps firefox \
&& find /usr/local -type d \( -name test -o -name tests \) -exec rm -rf '{}' + \
&& find /usr/local -type f \( -name '*.pyc' -o -name '*.pyo' \) -exec rm -rf '{}' + \
&& find /usr/local -type d \( -name '__pycache__' \) -exec rm -rf '{}' + \
&& find /usr/local -type d \( -name 'build' \) -exec rm -rf '{}' + \
&& rm -rf /root/.cache/* /tmp/* \
&& pip cache purge
RUN pip install --no-cache-dir uv==0.5.6
RUN uv pip install --no-cache-dir torch==2.4.1 torchvision==0.19.1 --index-url https://download.pytorch.org/whl/cpu \
&& uv pip install --no-cache-dir -r requirements.txt

# Install additional large packages for all-docs unstructured inference and PII detection
ARG INSTALL_EXTRA=false
RUN if [ "$INSTALL_EXTRA" = "true" ]; then \
pip3 install --no-cache-dir -r requirements-extra.txt && \
uv pip install --no-cache-dir -r requirements-extra.txt && \
python3 -m spacy download en_core_web_lg; \
fi

# Clean up unnecessary files
RUN find /usr/local -type d \( -name test -o -name tests \) -exec rm -rf '{}' + \
&& find /usr/local -type f \( -name '*.pyc' -o -name '*.pyo' \) -exec rm -rf '{}' + \
&& find /usr/local -type d -name '__pycache__' -exec rm -rf '{}' + \
&& rm -rf /root/.cache/* /tmp/*

# Clean up unnecessary files in site-packages
RUN find /usr/local/lib/python3.11/site-packages \
-type d \( -name 'tests' -o -name 'test' -o -name 'examples' \) -exec rm -rf '{}' + \
&& find /usr/local/lib/python3.11/site-packages -type d -name '__pycache__' -exec rm -rf '{}' + \
&& find /usr/local/lib/python3.11/site-packages -type f -name '*.pyc' -exec rm -f '{}' + \
&& find /usr/local/lib/python3.11/site-packages -type f -name '*.pyo' -exec rm -f '{}' +

# Stage 2: Final image
FROM python:3.11-slim

ENV DEBIAN_FRONTEND=noninteractive \
HOME=/home/vectara \
XDG_RUNTIME_DIR=/tmp \
RAY_DEDUP_LOGS="0" \
CUDA_VISIBLE_DEVICES=""

# Install runtime dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
# libopenblas-dev \
tesseract-ocr \
# xvfb \
unixodbc poppler-utils libmagic1 libjpeg62-turbo \
libfontconfig fonts-noto-color-emoji unifont fonts-indic xfonts-75dpi \
&& rm -rf /var/lib/apt/lists/*

# Copy Python packages and application code from the builder stage
COPY --from=builder /usr/local/lib/python3.11/site-packages /usr/local/lib/python3.11/site-packages
COPY --from=builder /usr/local/bin /usr/local/bin

# Install Playwright browsers
RUN playwright install --with-deps firefox \
&& rm -f /usr/local/bin/pwdebug \
&& rm -rf /var/lib/apt/lists/* /tmp/* /root/.cache/*

# Set working directory
WORKDIR ${HOME}

# Copy application code
COPY *.py $HOME/
COPY core/*.py $HOME/core/
COPY crawlers/ $HOME/crawlers/

#SHELL ["/bin/bash", "-c"]

# Set entrypoint and command
ENTRYPOINT ["/bin/bash", "-l", "-c"]
CMD ["python3 ingest.py $CONFIG $PROFILE"]
CMD ["python3 ingest.py $CONFIG $PROFILE"]
46 changes: 28 additions & 18 deletions README.md
@@ -197,24 +197,6 @@ vectara:
# this can be helpful when processing news pages or others which have a lot of advertising content
remove_boilerplate: false
# flag: enable special processing for tables, or images (inside PDF, HTML, PPT, DOCX; optional)
# Notes:
# 1. This processing uses OpenAI, and requires you to list the OPENAI_API_KEY in your `secrets.toml` under a special profile called `general`.
# 2. When crawling PDF, PPTX or DOCX files
# - if summarize_tables is enabled, the code will extract table content, then use GPT-4o to summarize the table, and ingest the summarized text into Vectara.
# - if summarize_images is enabled, the code will use GPT-4o vision to summarize the content of the images.
# 3. This processing is quite slow and will require you to have an additional paid subscription to OpenAI.
# For PDF files, the code uses the "detectron2_onnx" unstructured model to detect tables.
# You can modify this to use one of the alternatives (https://unstructured-io.github.io/unstructured/best_practices/models.html) if you want a slower but more performant model.
# See [here](TABLE_SUMMARY.md) for some examples of how table summary works.
summarize_tables: false
summarize_images: false
# If using Unstructured to process files locally, we define a few arguments
unst_chunking_strategy: none # chunking strategy to use: basic, by_title or none; default none
unst_chunk_size: 1024 # chunk size if using unstructured chunking; default 1024
unst_use_core_indexing: true # whether to use core_indexing which maintains the chunks from unstructured, or let vectara chunk further
# Whether masking of PII is attempted on all text fields (title, text, metadata values)
# Notes:
# 1. This masking is never done on files uploaded to Vectara directly (via e.g. indexer.index_file())
@@ -226,6 +208,34 @@ vectara:
whisper_model: the model name for whisper
doc_processing:
# Whether or not to summarize table content with GPT-4o (inside PDF, HTML, PPT, DOCX; optional)
# When using this feature, you need to list the OPENAI_API_KEY in your `secrets.toml` under a special profile called `general`.
# This processing is quite slow and will require you to have an additional paid subscription to OpenAI.
# See [here](TABLE_SUMMARY.md) for some examples of how table summary works.
summarize_tables: false
# Whether or not to summarize image content using GPT-4o vision
# When using this feature, you need to list the OPENAI_API_KEY in your `secrets.toml` under a special profile called `general`.
# This processing is quite slow and will require you to have an additional paid subscription to OpenAI.
summarize_images: false
# which document parser to use for local file parsing: unstructured or docling
doc_parser: unstructured
# whether to use core_indexing which maintains the chunks from unstructured or docling, or let vectara chunk further
use_core_indexing: false
# Unstructured document parsing configuration
unstructured_config:
chunking_strategy: none # chunking strategy to use: basic, by_title or none; default none
chunk_size: 1024 # chunk size if using unstructured chunking; default 1024
# Docling document parsing configuration
docling_config:
chunk: false # Whether to use Docling Chunking
crawling:
# type of crawler; valid options are website, docusaurus, notion, jira, rss, mediawiki, discourse, github and others (this continues to evolve as new crawler types are added)
crawler_type: XXX
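For reference, a hypothetical doc_processing block that exercises the new Docling parser might look like the sketch below. The keys mirror the options documented in the README section above; the specific values (Docling parsing, Docling chunks kept via core indexing, table summaries enabled) are illustrative choices rather than recommended defaults.

vectara:
  doc_processing:
    summarize_tables: true     # requires OPENAI_API_KEY under the `general` profile in secrets.toml
    summarize_images: false
    doc_parser: docling        # parse local files with Docling instead of unstructured
    use_core_indexing: true    # keep the chunks produced by Docling rather than letting Vectara re-chunk
    docling_config:
      chunk: true              # enable Docling chunking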
1 change: 0 additions & 1 deletion config/askFeynman.yaml
@@ -11,4 +11,3 @@ website_crawler:
num_per_second: 1
pos_regex: ["https://www.feynmanlectures.caltech.edu/.*"]
pages_source: sitemap # options are: (1) 'sitemap' automatically retrieved from website (2) 'crawl' for recursive crawling
extraction: playwright # pdf or playwright
1 change: 0 additions & 1 deletion config/askHBS.yaml
@@ -11,4 +11,3 @@ website_crawler:
pos_regex: [".*hbs.edu.*"]
num_per_second: 1
pages_source: sitemap
extraction: playwright
1 change: 0 additions & 1 deletion config/legalaid-IL.yaml
@@ -11,4 +11,3 @@ website_crawler:
pos_regex: [".*illinoislegalaid.*"]
num_per_second: 1
pages_source: sitemap # options are: (1) 'sitemap' automatically retrieved from website (2) 'crawl' for recursive crawling
extraction: playwright # pdf or playwright
1 change: 0 additions & 1 deletion config/lethain.yaml
@@ -11,4 +11,3 @@ website_crawler:
["https://infraeng.dev/", "https://staffeng.com/", "https://lethain.com/"]
num_per_second: 1
pages_source: sitemap # options are: (1) 'sitemap' automatically retrieved from website (2) 'crawl' for recursive crawling
extraction: playwright # pdf or playwright
1 change: 0 additions & 1 deletion config/news-bbc.yaml
@@ -19,4 +19,3 @@ rss_crawler:
]
days_past: 30
delay: 1
extraction: playwright # pdf or playwright
1 change: 0 additions & 1 deletion config/news-cnbc.yaml
@@ -19,4 +19,3 @@ rss_crawler:
]
days_past: 30
delay: 1
extraction: playwright # pdf or playwright
1 change: 0 additions & 1 deletion config/news-cnn.yaml
@@ -38,4 +38,3 @@ rss_crawler:
]
days_past: 30
delay: 1
extraction: playwright # pdf or playwright
1 change: 0 additions & 1 deletion config/news-fox.yaml
@@ -16,4 +16,3 @@ rss_crawler:
]
days_past: 30
delay: 1
extraction: playwright # pdf or playwright
1 change: 0 additions & 1 deletion config/news-npr.yaml
@@ -17,4 +17,3 @@ rss_crawler:
]
days_past: 30
delay: 1
extraction: playwright # pdf or playwright
1 change: 0 additions & 1 deletion config/sf.yaml
@@ -14,5 +14,4 @@ website_crawler:
neg_regex: [".*sf.gov/es/.*", ".*sf.gov/fil/.*", ".*sf.gov/zh-hant/.*"]
pages_source: crawl # options are: (1) 'sitemap' automatically retrieved from website (2) 'crawl' for recursive crawling
max_depth: 3
extraction: playwright # pdf or playwright
ray_workers: 0
1 change: 0 additions & 1 deletion config/vectara-website.yaml
@@ -11,5 +11,4 @@ website_crawler:
num_per_second: 1
pages_source: sitemap
pos_regex: [".*"]
extraction: playwright
ray_workers: 0
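Each of these config updates drops the same per-crawler line, `extraction: playwright # pdf or playwright`, which appears to be obsolete now that the PDF-conversion path is removed from the base crawler (see the core/crawler.py diff below). A website crawler config after this change might look roughly like the following sketch, using only keys that appear in the files above; the regex value is a hypothetical placeholder.

website_crawler:
  pos_regex: [".*example.com.*"]   # hypothetical domain, for illustration only
  num_per_second: 1
  pages_source: sitemap            # 'sitemap' (retrieved automatically) or 'crawl' (recursive crawling)
  ray_workers: 0
  # no 'extraction' key is needed any more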
42 changes: 0 additions & 42 deletions core/crawler.py
@@ -5,7 +5,6 @@
import logging
from typing import Set, Optional, List, Any
from core.indexer import Indexer
from core.pdf_convert import PDFConverter
from core.utils import img_extensions, doc_extensions, archive_extensions
from slugify import slugify
from urllib.parse import urlparse
@@ -92,44 +91,3 @@ def __init__(
self.cfg: DictConfig = DictConfig(cfg)
self.indexer = Indexer(cfg, endpoint, customer_id, corpus_id, api_key)
self.verbose = cfg.vectara.get("verbose", False)

def url_to_file(self, url: str, title: str) -> str:
"""
Crawl a single webpage and create a PDF file to reflect its rendered content.
Args:
url (str): URL of the page to crawl.
title (str): Title to use in case HTML does not have its own title.
Returns:
str: Name of the PDF file created.
"""
# first verify the URL is valid
response = requests.get(url, headers=get_headers)
if response.status_code != 200:
if response.status_code == 404:
raise Exception(f"Error 404 - URL not found: {url}")
elif response.status_code == 401:
raise Exception(f"Error 403 - Unauthorized: {url}")
elif response.status_code == 403:
raise Exception(f"Error 403 - Access forbidden: {url}")
elif response.status_code == 405:
raise Exception(f"Error 405 - Method not allowed: {url}")
else:
raise Exception(
f"Invalid URL: {url} (status code={response.status_code}, reason={response.reason})"
)

if title is None or len(title)==0:
soup = BeautifulSoup(response.text, "html.parser")
title = str(soup.title)

# convert to local file (PDF)
filename = slugify(url) + ".pdf"
if not PDFConverter(use_pdfkit=False).from_url(url, filename, title=title):
raise Exception(f"Failed to convert {url} to PDF")

return filename

def crawl(self) -> None:
raise Exception("Not implemented")
