add ability to contain nltk data on docker image #3851

Closed
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -1,4 +1,4 @@
## 0.16.12-dev5
## 0.16.12-dev6

### Enhancements

@@ -14,6 +14,7 @@
- **CSV with asserted XLS content-type is correctly identified as CSV.** Resolves a bug where a CSV file with an asserted content-type of `application/vnd.ms-excel` was incorrectly identified as an XLS file.
- **Improve element-type mapping for Chinese text.** Fixes bug where Chinese text would produce large numbers of false-positive `Title` elements.
- **Improve element-type mapping for HTML.** Fixes bug where certain non-title elements were classified as `Title`.
- **Fix NLTK download.** NLTK data is now baked into the Docker image rather than downloaded at runtime.

## 0.16.11

14 changes: 9 additions & 5 deletions Dockerfile
@@ -1,4 +1,4 @@
FROM quay.io/unstructured-io/base-images:wolfi-base-latest as base
FROM quay.io/unstructured-io/base-images:wolfi-base-latest AS base

USER root

@@ -18,12 +18,16 @@ RUN chown -R notebook-user:notebook-user /app && \

USER notebook-user

RUN find requirements/ -type f -name "*.txt" -exec pip3.11 install --no-cache-dir --user -r '{}' ';' && \
python3.11 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \
python3.11 -c "from unstructured.partition.model_init import initialize; initialize()" && \
python3.11 -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"
RUN find requirements/ -type f -name "*.txt" -exec pip3.11 install --no-cache-dir --user -r '{}' ';'

RUN python3.11 -c "import os; os.makedirs('/home/notebook-user/nltk_data', exist_ok=True)" && \
python3.11 -c "from nltk.downloader import download; download('punkt_tab'); download('averaged_perceptron_tagger_eng')"

RUN python3.11 -c "from unstructured.partition.model_init import initialize; initialize()" && \
python3.11 -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"

Contributor: is there a reason to move these into 3 separate docker layers (with the 3 different RUNs)? (generally fewer is better)
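
One way to address this comment (a sketch, not the change actually made in this PR, assuming build-cache granularity is not a concern) is to chain the steps into a single RUN so they produce one layer; the commands below are copied from the diff above:

# Sketch: dependencies, NLTK data, and model initialization in one layer
RUN find requirements/ -type f -name "*.txt" -exec pip3.11 install --no-cache-dir --user -r '{}' ';' && \
    python3.11 -c "import os; os.makedirs('/home/notebook-user/nltk_data', exist_ok=True)" && \
    python3.11 -c "from nltk.downloader import download; download('punkt_tab'); download('averaged_perceptron_tagger_eng')" && \
    python3.11 -c "from unstructured.partition.model_init import initialize; initialize()" && \
    python3.11 -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"

The trade-off is build caching: keeping the pip install in its own RUN lets that layer be reused when only the NLTK or model steps change.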

ENV PATH="${PATH}:/home/notebook-user/.local/bin"
ENV TESSDATA_PREFIX=/usr/local/share/tessdata
ENV NLTK_DATA=/home/notebook-user/nltk_data

Contributor: should this be before the relevant RUN commands?
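
A sketch of the ordering this comment suggests (an assumption about intent, not part of the PR): declare NLTK_DATA before the download step and pass it to the downloader explicitly via its download_dir argument, so the build-time download directory and the runtime lookup path cannot drift apart.

# Sketch: set the variable first, then download into it
ENV NLTK_DATA=/home/notebook-user/nltk_data

RUN python3.11 -c "import os; os.makedirs(os.environ['NLTK_DATA'], exist_ok=True)" && \
    python3.11 -c "import os; from nltk.downloader import download; download('punkt_tab', download_dir=os.environ['NLTK_DATA']); download('averaged_perceptron_tagger_eng', download_dir=os.environ['NLTK_DATA'])"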

CMD ["/bin/bash"]
21 changes: 4 additions & 17 deletions test_unstructured/nlp/test_tokenize.py
@@ -1,27 +1,14 @@
from typing import List, Tuple
from unittest.mock import patch

import nltk

from test_unstructured.nlp.mock_nltk import mock_sent_tokenize, mock_word_tokenize
from unstructured.nlp import tokenize


def test_nltk_packages_download_if_not_present():
tokenize._download_nltk_packages_if_not_present.cache_clear()
with patch.object(nltk, "find", side_effect=LookupError):
with patch.object(tokenize, "download_nltk_packages") as mock_download:
tokenize._download_nltk_packages_if_not_present()

mock_download.assert_called_once()


def test_nltk_packages_do_not_download_if():
tokenize._download_nltk_packages_if_not_present.cache_clear()
with patch.object(nltk, "find"), patch.object(nltk, "download") as mock_download:
tokenize._download_nltk_packages_if_not_present()

mock_download.assert_not_called()
def test_nltk_assets_validation():
with patch("unstructured.nlp.tokenize.validate_nltk_assets") as mock_validate:
tokenize.validate_nltk_assets()
mock_validate.assert_called_once()


def mock_pos_tag(tokens: List[str]) -> List[Tuple[str, str]]:
2 changes: 1 addition & 1 deletion unstructured/__version__.py
@@ -1 +1 @@
__version__ = "0.16.12-dev5" # pragma: no cover
__version__ = "0.16.12-dev6" # pragma: no cover
54 changes: 22 additions & 32 deletions unstructured/nlp/tokenize.py
@@ -11,66 +11,56 @@

CACHE_MAX_SIZE: Final[int] = 128


def download_nltk_packages():
nltk.download("averaged_perceptron_tagger_eng", quiet=True)
nltk.download("punkt_tab", quiet=True)
# Define the NLTK data path based on the Docker image environment
NLTK_DATA_PATH = os.getenv("NLTK_DATA", "/home/notebook-user/nltk_data")
nltk.data.path.append(NLTK_DATA_PATH)


def check_for_nltk_package(package_name: str, package_category: str) -> bool:
"""Checks to see if the specified NLTK package exists on the file system"""
paths: list[str] = []
for path in nltk.data.path:
if not path.endswith("nltk_data"):
path = os.path.join(path, "nltk_data")
paths.append(path)

"""Checks to see if the specified NLTK package exists on the file system."""
try:
nltk.find(f"{package_category}/{package_name}", paths=paths)
nltk.find(f"{package_category}/{package_name}")
return True
except (LookupError, OSError):
return False


# We cache this because we do not want to attempt
# downloading the packages multiple times
@lru_cache()
def _download_nltk_packages_if_not_present():
"""If required NLTK packages are not available, download them."""
# Ensure NLTK data exists in the specified path (pre-baked in Docker)
def validate_nltk_assets():
"""Validate that required NLTK packages are preloaded in the image."""
required_assets = [
("punkt_tab", "tokenizers"),
("averaged_perceptron_tagger_eng", "taggers"),
]
for package_name, category in required_assets:
if not check_for_nltk_package(package_name, category):
raise RuntimeError(
f"Required NLTK package '{package_name}' is missing. "
f"Ensure it is baked into the Docker image at '{NLTK_DATA_PATH}'."
)

tagger_available = check_for_nltk_package(
package_category="taggers",
package_name="averaged_perceptron_tagger_eng",
)
tokenizer_available = check_for_nltk_package(
package_category="tokenizers", package_name="punkt_tab"
)

if (not tokenizer_available) or (not tagger_available):
download_nltk_packages()
# Validate NLTK assets at import time
validate_nltk_assets()


@lru_cache(maxsize=CACHE_MAX_SIZE)
def sent_tokenize(text: str) -> List[str]:
"""A wrapper around the NLTK sentence tokenizer with LRU caching enabled."""
_download_nltk_packages_if_not_present()
return _sent_tokenize(text)


@lru_cache(maxsize=CACHE_MAX_SIZE)
def word_tokenize(text: str) -> List[str]:
"""A wrapper around the NLTK word tokenizer with LRU caching enabled."""
_download_nltk_packages_if_not_present()
return _word_tokenize(text)


@lru_cache(maxsize=CACHE_MAX_SIZE)
def pos_tag(text: str) -> List[Tuple[str, str]]:
"""A wrapper around the NLTK POS tagger with LRU caching enabled."""
_download_nltk_packages_if_not_present()
# NOTE(robinson) - Splitting into sentences before tokenizing. The helps with
# situations like "ITEM 1A. PROPERTIES" where "PROPERTIES" can be mistaken
# for a verb because it looks like it's in verb form an "ITEM 1A." looks like the subject.
# NOTE: Splitting into sentences before tokenizing helps with situations
# like "ITEM 1A. PROPERTIES" where tokens can be misinterpreted.
sentences = _sent_tokenize(text)
parts_of_speech: list[tuple[str, str]] = []
for sentence in sentences:
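
For completeness, a minimal usage sketch of the reworked module (assuming the package is installed and the NLTK data was baked in as shown in the Dockerfile above). Because validate_nltk_assets() now runs at import time, the import itself raises RuntimeError when the assets are missing, so NLTK_DATA has to point at the pre-baked directory before the import happens:

import os

# tokenize.py reads NLTK_DATA at import (defaulting to /home/notebook-user/nltk_data)
# and immediately validates punkt_tab and averaged_perceptron_tagger_eng.
os.environ.setdefault("NLTK_DATA", "/home/notebook-user/nltk_data")

from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize

print(sent_tokenize("ITEM 1A. PROPERTIES. The registrant owns several offices."))
print(word_tokenize("Tokenize this sentence."))
print(pos_tag("ITEM 1A. PROPERTIES"))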