diff --git a/CHANGELOG.md b/CHANGELOG.md
index bbb31a75b4..1ab8853383 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.16.12-dev5
+## 0.16.12-dev6
 
 ### Enhancements
 
@@ -14,6 +14,7 @@
 - **CSV with asserted XLS content-type is correctly identified as CSV.** Resolves a bug where a CSV file with an asserted content-type of `application/vnd.ms-excel` was incorrectly identified as an XLS file.
 - **Improve element-type mapping for Chinese text.** Fixes bug where Chinese text would produce large numbers of false-positive `Title` elements.
 - **Improve element-type mapping for HTML.** Fixes bug where certain non-title elements were classified as `Title`.
+- **Fix NLTK download.** NLTK data is now baked into the Docker image so it no longer needs to be downloaded at runtime.
 
 ## 0.16.11
 
diff --git a/Dockerfile b/Dockerfile
index c6d6e906b6..5ba4e9c3e2 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,4 +1,4 @@
-FROM quay.io/unstructured-io/base-images:wolfi-base-latest as base
+FROM quay.io/unstructured-io/base-images:wolfi-base-latest AS base
 
 USER root
 
@@ -18,12 +18,16 @@ RUN chown -R notebook-user:notebook-user /app && \
 
 USER notebook-user
 
-RUN find requirements/ -type f -name "*.txt" -exec pip3.11 install --no-cache-dir --user -r '{}' ';' && \
-    python3.11 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \
-    python3.11 -c "from unstructured.partition.model_init import initialize; initialize()" && \
-    python3.11 -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"
+RUN find requirements/ -type f -name "*.txt" -exec pip3.11 install --no-cache-dir --user -r '{}' ';'
+
+RUN python3.11 -c "import os; os.makedirs('/home/notebook-user/nltk_data', exist_ok=True)" && \
+    python3.11 -c "from nltk.downloader import download; download('punkt_tab'); download('averaged_perceptron_tagger_eng')"
+
+RUN python3.11 -c "from unstructured.partition.model_init import initialize; initialize()" && \
+    python3.11 -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"
 
 ENV PATH="${PATH}:/home/notebook-user/.local/bin"
 ENV TESSDATA_PREFIX=/usr/local/share/tessdata
+ENV NLTK_DATA=/home/notebook-user/nltk_data
 
 CMD ["/bin/bash"]
diff --git a/test_unstructured/nlp/test_tokenize.py b/test_unstructured/nlp/test_tokenize.py
index f0262484cd..afe8cfe083 100644
--- a/test_unstructured/nlp/test_tokenize.py
+++ b/test_unstructured/nlp/test_tokenize.py
@@ -1,27 +1,14 @@
 from typing import List, Tuple
 from unittest.mock import patch
 
-import nltk
-
 from test_unstructured.nlp.mock_nltk import mock_sent_tokenize, mock_word_tokenize
 from unstructured.nlp import tokenize
 
 
-def test_nltk_packages_download_if_not_present():
-    tokenize._download_nltk_packages_if_not_present.cache_clear()
-    with patch.object(nltk, "find", side_effect=LookupError):
-        with patch.object(tokenize, "download_nltk_packages") as mock_download:
-            tokenize._download_nltk_packages_if_not_present()
-
-    mock_download.assert_called_once()
-
-
-def test_nltk_packages_do_not_download_if():
-    tokenize._download_nltk_packages_if_not_present.cache_clear()
-    with patch.object(nltk, "find"), patch.object(nltk, "download") as mock_download:
-        tokenize._download_nltk_packages_if_not_present()
-
-    mock_download.assert_not_called()
+def test_nltk_assets_validation():
patch("unstructured.nlp.tokenize.validate_nltk_assets") as mock_validate: + tokenize.validate_nltk_assets() + mock_validate.assert_called_once() def mock_pos_tag(tokens: List[str]) -> List[Tuple[str, str]]: diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 07eda39112..c9ccfa3156 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.16.12-dev5" # pragma: no cover +__version__ = "0.16.12-dev6" # pragma: no cover diff --git a/unstructured/nlp/tokenize.py b/unstructured/nlp/tokenize.py index f26770d53f..116dd13996 100644 --- a/unstructured/nlp/tokenize.py +++ b/unstructured/nlp/tokenize.py @@ -11,66 +11,56 @@ CACHE_MAX_SIZE: Final[int] = 128 - -def download_nltk_packages(): - nltk.download("averaged_perceptron_tagger_eng", quiet=True) - nltk.download("punkt_tab", quiet=True) +# Define the NLTK data path based on the Docker image environment +NLTK_DATA_PATH = os.getenv("NLTK_DATA", "/home/notebook-user/nltk_data") +nltk.data.path.append(NLTK_DATA_PATH) def check_for_nltk_package(package_name: str, package_category: str) -> bool: - """Checks to see if the specified NLTK package exists on the file system""" - paths: list[str] = [] - for path in nltk.data.path: - if not path.endswith("nltk_data"): - path = os.path.join(path, "nltk_data") - paths.append(path) - + """Checks to see if the specified NLTK package exists on the file system.""" try: - nltk.find(f"{package_category}/{package_name}", paths=paths) + nltk.find(f"{package_category}/{package_name}") return True except (LookupError, OSError): return False -# We cache this because we do not want to attempt -# downloading the packages multiple times -@lru_cache() -def _download_nltk_packages_if_not_present(): - """If required NLTK packages are not available, download them.""" +# Ensure NLTK data exists in the specified path (pre-baked in Docker) +def validate_nltk_assets(): + """Validate that required NLTK packages are preloaded in the image.""" + required_assets = [ + ("punkt_tab", "tokenizers"), + ("averaged_perceptron_tagger_eng", "taggers"), + ] + for package_name, category in required_assets: + if not check_for_nltk_package(package_name, category): + raise RuntimeError( + f"Required NLTK package '{package_name}' is missing. " + f"Ensure it is baked into the Docker image at '{NLTK_DATA_PATH}'." + ) - tagger_available = check_for_nltk_package( - package_category="taggers", - package_name="averaged_perceptron_tagger_eng", - ) - tokenizer_available = check_for_nltk_package( - package_category="tokenizers", package_name="punkt_tab" - ) - if (not tokenizer_available) or (not tagger_available): - download_nltk_packages() +# Validate NLTK assets at import time +validate_nltk_assets() @lru_cache(maxsize=CACHE_MAX_SIZE) def sent_tokenize(text: str) -> List[str]: """A wrapper around the NLTK sentence tokenizer with LRU caching enabled.""" - _download_nltk_packages_if_not_present() return _sent_tokenize(text) @lru_cache(maxsize=CACHE_MAX_SIZE) def word_tokenize(text: str) -> List[str]: """A wrapper around the NLTK word tokenizer with LRU caching enabled.""" - _download_nltk_packages_if_not_present() return _word_tokenize(text) @lru_cache(maxsize=CACHE_MAX_SIZE) def pos_tag(text: str) -> List[Tuple[str, str]]: """A wrapper around the NLTK POS tagger with LRU caching enabled.""" - _download_nltk_packages_if_not_present() - # NOTE(robinson) - Splitting into sentences before tokenizing. The helps with - # situations like "ITEM 1A. 
PROPERTIES" where "PROPERTIES" can be mistaken - # for a verb because it looks like it's in verb form an "ITEM 1A." looks like the subject. + # NOTE: Splitting into sentences before tokenizing helps with situations + # like "ITEM 1A. PROPERTIES" where tokens can be misinterpreted. sentences = _sent_tokenize(text) parts_of_speech: list[tuple[str, str]] = [] for sentence in sentences: