add ability to contain nltk data on docker image #3851

Closed
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -1,4 +1,4 @@
## 0.16.12-dev5
## 0.16.12-dev6

### Enhancements

@@ -14,6 +14,7 @@
- **CSV with asserted XLS content-type is correctly identified as CSV.** Resolves a bug where a CSV file with an asserted content-type of `application/vnd.ms-excel` was incorrectly identified as an XLS file.
- **Improve element-type mapping for Chinese text.** Fixes bug where Chinese text would produce large numbers of false-positive `Title` elements.
- **Improve element-type mapping for HTML.** Fixes bug where certain non-title elements were classified as `Title`.
- **Fix NLTK download.** NLTK data is now baked into the Docker image rather than downloaded at runtime.

## 0.16.11

14 changes: 9 additions & 5 deletions Dockerfile
@@ -1,4 +1,4 @@
FROM quay.io/unstructured-io/base-images:wolfi-base-latest as base
FROM quay.io/unstructured-io/base-images:wolfi-base-latest AS base

USER root

@@ -18,12 +18,16 @@ RUN chown -R notebook-user:notebook-user /app && \

USER notebook-user

RUN find requirements/ -type f -name "*.txt" -exec pip3.11 install --no-cache-dir --user -r '{}' ';' && \
python3.11 -c "from unstructured.nlp.tokenize import download_nltk_packages; download_nltk_packages()" && \
python3.11 -c "from unstructured.partition.model_init import initialize; initialize()" && \
python3.11 -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"
RUN find requirements/ -type f -name "*.txt" -exec pip3.11 install --no-cache-dir --user -r '{}' ';'

RUN python3.11 -c "import os; os.makedirs('/home/notebook-user/nltk_data', exist_ok=True)" && \
python3.11 -c "from nltk.downloader import download; download('punkt_tab'); download('averaged_perceptron_tagger_eng')"

RUN python3.11 -c "from unstructured.partition.model_init import initialize; initialize()" && \
python3.11 -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"

Contributor: is there a reason to move these into 3 separate docker layers (with the 3 different RUNs)? (generally fewer is better)
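
One way to address this comment (a sketch, not the change actually made in this PR, assuming build-cache granularity is not a concern) is to chain the steps into a single RUN so they produce one layer; the commands below are copied from the diff above:

# Sketch: dependencies, NLTK data, and model initialization in one layer
RUN find requirements/ -type f -name "*.txt" -exec pip3.11 install --no-cache-dir --user -r '{}' ';' && \
    python3.11 -c "import os; os.makedirs('/home/notebook-user/nltk_data', exist_ok=True)" && \
    python3.11 -c "from nltk.downloader import download; download('punkt_tab'); download('averaged_perceptron_tagger_eng')" && \
    python3.11 -c "from unstructured.partition.model_init import initialize; initialize()" && \
    python3.11 -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"

The trade-off is build caching: keeping the pip install in its own RUN lets that layer be reused when only the NLTK or model steps change.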

ENV PATH="${PATH}:/home/notebook-user/.local/bin"
ENV TESSDATA_PREFIX=/usr/local/share/tessdata
ENV NLTK_DATA=/home/notebook-user/nltk_data

Contributor: should this be before the relevant RUN commands?
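
A sketch of the ordering this comment suggests (an assumption about intent, not part of the PR): declare NLTK_DATA before the download step and pass it to the downloader explicitly via its download_dir argument, so the build-time download directory and the runtime lookup path cannot drift apart.

# Sketch: set the variable first, then download into it
ENV NLTK_DATA=/home/notebook-user/nltk_data

RUN python3.11 -c "import os; os.makedirs(os.environ['NLTK_DATA'], exist_ok=True)" && \
    python3.11 -c "import os; from nltk.downloader import download; download('punkt_tab', download_dir=os.environ['NLTK_DATA']); download('averaged_perceptron_tagger_eng', download_dir=os.environ['NLTK_DATA'])"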

CMD ["/bin/bash"]
21 changes: 4 additions & 17 deletions test_unstructured/nlp/test_tokenize.py
@@ -1,27 +1,14 @@
from typing import List, Tuple
from unittest.mock import patch

import nltk

from test_unstructured.nlp.mock_nltk import mock_sent_tokenize, mock_word_tokenize
from unstructured.nlp import tokenize


def test_nltk_packages_download_if_not_present():
tokenize._download_nltk_packages_if_not_present.cache_clear()
with patch.object(nltk, "find", side_effect=LookupError):
with patch.object(tokenize, "download_nltk_packages") as mock_download:
tokenize._download_nltk_packages_if_not_present()

mock_download.assert_called_once()


def test_nltk_packages_do_not_download_if():
tokenize._download_nltk_packages_if_not_present.cache_clear()
with patch.object(nltk, "find"), patch.object(nltk, "download") as mock_download:
tokenize._download_nltk_packages_if_not_present()

mock_download.assert_not_called()
def test_nltk_assets_validation():
with patch("unstructured.nlp.tokenize.validate_nltk_assets") as mock_validate:
tokenize.validate_nltk_assets()
mock_validate.assert_called_once()


def mock_pos_tag(tokens: List[str]) -> List[Tuple[str, str]]:
2 changes: 1 addition & 1 deletion unstructured/__version__.py
@@ -1 +1 @@
__version__ = "0.16.12-dev5" # pragma: no cover
__version__ = "0.16.12-dev6" # pragma: no cover
54 changes: 22 additions & 32 deletions unstructured/nlp/tokenize.py
@@ -11,66 +11,56 @@

CACHE_MAX_SIZE: Final[int] = 128


def download_nltk_packages():
nltk.download("averaged_perceptron_tagger_eng", quiet=True)
nltk.download("punkt_tab", quiet=True)
# Define the NLTK data path based on the Docker image environment
NLTK_DATA_PATH = os.getenv("NLTK_DATA", "/home/notebook-user/nltk_data")
nltk.data.path.append(NLTK_DATA_PATH)


def check_for_nltk_package(package_name: str, package_category: str) -> bool:
"""Checks to see if the specified NLTK package exists on the file system"""
paths: list[str] = []
for path in nltk.data.path:
if not path.endswith("nltk_data"):
path = os.path.join(path, "nltk_data")
paths.append(path)

"""Checks to see if the specified NLTK package exists on the file system."""
try:
nltk.find(f"{package_category}/{package_name}", paths=paths)
nltk.find(f"{package_category}/{package_name}")
return True
except (LookupError, OSError):
return False


# We cache this because we do not want to attempt
# downloading the packages multiple times
@lru_cache()
def _download_nltk_packages_if_not_present():
"""If required NLTK packages are not available, download them."""
# Ensure NLTK data exists in the specified path (pre-baked in Docker)
def validate_nltk_assets():
"""Validate that required NLTK packages are preloaded in the image."""
required_assets = [
("punkt_tab", "tokenizers"),
("averaged_perceptron_tagger_eng", "taggers"),
]
for package_name, category in required_assets:
if not check_for_nltk_package(package_name, category):
raise RuntimeError(
f"Required NLTK package '{package_name}' is missing. "
f"Ensure it is baked into the Docker image at '{NLTK_DATA_PATH}'."
)

tagger_available = check_for_nltk_package(
package_category="taggers",
package_name="averaged_perceptron_tagger_eng",
)
tokenizer_available = check_for_nltk_package(
package_category="tokenizers", package_name="punkt_tab"
)

if (not tokenizer_available) or (not tagger_available):
download_nltk_packages()
# Validate NLTK assets at import time
validate_nltk_assets()


@lru_cache(maxsize=CACHE_MAX_SIZE)
def sent_tokenize(text: str) -> List[str]:
"""A wrapper around the NLTK sentence tokenizer with LRU caching enabled."""
_download_nltk_packages_if_not_present()
return _sent_tokenize(text)


@lru_cache(maxsize=CACHE_MAX_SIZE)
def word_tokenize(text: str) -> List[str]:
"""A wrapper around the NLTK word tokenizer with LRU caching enabled."""
_download_nltk_packages_if_not_present()
return _word_tokenize(text)


@lru_cache(maxsize=CACHE_MAX_SIZE)
def pos_tag(text: str) -> List[Tuple[str, str]]:
"""A wrapper around the NLTK POS tagger with LRU caching enabled."""
_download_nltk_packages_if_not_present()
# NOTE(robinson) - Splitting into sentences before tokenizing. The helps with
# situations like "ITEM 1A. PROPERTIES" where "PROPERTIES" can be mistaken
# for a verb because it looks like it's in verb form an "ITEM 1A." looks like the subject.
# NOTE: Splitting into sentences before tokenizing helps with situations
# like "ITEM 1A. PROPERTIES" where tokens can be misinterpreted.
sentences = _sent_tokenize(text)
parts_of_speech: list[tuple[str, str]] = []
for sentence in sentences:
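
For completeness, a minimal usage sketch of the reworked module (assuming the package is installed and the NLTK data was baked in as shown in the Dockerfile above). Because validate_nltk_assets() now runs at import time, the import itself raises RuntimeError when the assets are missing, so NLTK_DATA has to point at the pre-baked directory before the import happens:

import os

# tokenize.py reads NLTK_DATA at import (defaulting to /home/notebook-user/nltk_data)
# and immediately validates punkt_tab and averaged_perceptron_tagger_eng.
os.environ.setdefault("NLTK_DATA", "/home/notebook-user/nltk_data")

from unstructured.nlp.tokenize import pos_tag, sent_tokenize, word_tokenize

print(sent_tokenize("ITEM 1A. PROPERTIES. The registrant owns several offices."))
print(word_tokenize("Tokenize this sentence."))
print(pos_tag("ITEM 1A. PROPERTIES"))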