From 1f8030dd0ee76389c619603197fc5f7f4e546d09 Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Mon, 19 Aug 2024 15:59:36 -0500 Subject: [PATCH] fix(CVE-2024-39705): bump to `nltk` 3.9.1; correct model download issues (#3541) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Summary Bumps to `nltk==3.9.1` and resolves [CVE-2024-39705](https://nvd.nist.gov/vuln/detail/CVE-2024-39705). An NLTK version bump was originally introduced in #3512 and rolled back in #3527 because `nltk==3.8.2` was yanked from PyPI, and also because we observed significant slowdowns in processing time after bumping to `nltk==3.8.2`. The processing time regression does not appear in `nltk==3.9.1`. ### Testing After the bump, CI should pass. Additionally, we verified locally that file processing takes about as long as we would expect for a long `.docx` file. ```python In [1]: from unstructured.partition.auto import partition In [2]: filename = "test-doc.docx" In [3]: %timeit partition(filename=filename) 3.92 s ± 73 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each) ``` --- CHANGELOG.md | 3 ++- requirements/base.txt | 6 +++--- requirements/deps/constraints.txt | 3 +++ requirements/dev.txt | 4 ++-- requirements/extra-markdown.txt | 2 +- requirements/extra-paddleocr.txt | 8 ++++---- requirements/extra-pdf-image.txt | 17 +++++++++-------- requirements/huggingface.txt | 2 +- requirements/ingest/azure.txt | 4 ++-- requirements/ingest/biomed.txt | 2 +- requirements/ingest/chroma.txt | 21 +++++++++++---------- requirements/ingest/clarifai.txt | 8 +++++--- requirements/ingest/confluence.txt | 2 +- requirements/ingest/databricks-volumes.txt | 6 +++--- requirements/ingest/delta-table.txt | 4 +--- requirements/ingest/discord.txt | 4 ++-- requirements/ingest/elasticsearch.txt | 6 +++--- requirements/ingest/embed-aws-bedrock.txt | 10 +++++----- requirements/ingest/embed-huggingface.txt | 4 ++-- requirements/ingest/embed-octoai.txt | 2 +- requirements/ingest/embed-openai.txt | 6 +++--- requirements/ingest/embed-vertexai.txt | 19 ++++++++++--------- requirements/ingest/embed-voyageai.txt | 8 ++++---- requirements/ingest/gcs.txt | 10 +++++----- requirements/ingest/google-drive.txt | 6 +++--- requirements/ingest/jira.txt | 2 +- requirements/ingest/onedrive.txt | 2 +- requirements/ingest/qdrant.txt | 5 +++-- requirements/ingest/s3.txt | 4 ++-- requirements/ingest/singlestore.txt | 2 +- requirements/ingest/weaviate.txt | 3 ++- requirements/ingest/wikipedia.txt | 2 +- requirements/test.txt | 14 ++++++++------ unstructured/__version__.py | 2 +- unstructured/nlp/tokenize.py | 10 +++++----- 35 files changed, 112 insertions(+), 101 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3a1017bdb1..21c86340d1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.15.6-dev1 +## 0.15.6 ### Enhancements @@ -6,6 +6,7 @@ ### Fixes +* **Bump to NLTK 3.9.x** Bumps to the latest `nltk` version to resolve CVE. 
* **Update CI for `ingest-test-fixture-update-pr` to resolve NLTK model download errors.** * **Synchronized text and html on `TableChunk` splits.** When a `Table` element is divided during chunking to fit the chunking window, `TableChunk.text` corresponds exactly with the table text in `TableChunk.metadata.text_as_html`, `.text_as_html` is always parseable HTML, and the table is split on even row boundaries whenever possible. diff --git a/requirements/base.txt b/requirements/base.txt index 7fa88148cd..cc53655884 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -69,7 +69,7 @@ mypy-extensions==1.0.0 # unstructured-client nest-asyncio==1.6.0 # via unstructured-client -nltk==3.8.1 +nltk==3.9.1 # via -r ./base.in numpy==1.26.4 # via -r ./base.in @@ -110,7 +110,7 @@ sniffio==1.3.1 # via # anyio # httpx -soupsieve==2.5 +soupsieve==2.6 # via beautifulsoup4 tabulate==0.9.0 # via -r ./base.in @@ -129,7 +129,7 @@ typing-inspect==0.9.0 # via # dataclasses-json # unstructured-client -unstructured-client==0.25.4 +unstructured-client==0.25.5 # via # -c ././deps/constraints.txt # -r ./base.in diff --git a/requirements/deps/constraints.txt b/requirements/deps/constraints.txt index 71c5097f66..6bd153fb89 100644 --- a/requirements/deps/constraints.txt +++ b/requirements/deps/constraints.txt @@ -56,3 +56,6 @@ fsspec==2024.5.0 wrapt>=1.14.0 langchain-community>=0.2.5 + +grpcio==1.64.3 +label-studio-sdk==0.0.34 diff --git a/requirements/dev.txt b/requirements/dev.txt index aed5776aea..835470e220 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -310,7 +310,7 @@ pyyaml==6.0.2 # -c ./test.txt # jupyter-events # pre-commit -pyzmq==26.1.0 +pyzmq==26.1.1 # via # ipykernel # jupyter-client @@ -360,7 +360,7 @@ sniffio==1.3.1 # -c ./base.txt # anyio # httpx -soupsieve==2.5 +soupsieve==2.6 # via # -c ./base.txt # beautifulsoup4 diff --git a/requirements/extra-markdown.txt b/requirements/extra-markdown.txt index ba1cd8fd59..1f36f6c1b7 100644 --- 
a/requirements/extra-markdown.txt +++ b/requirements/extra-markdown.txt @@ -6,7 +6,7 @@ # importlib-metadata==8.2.0 # via markdown -markdown==3.6 +markdown==3.7 # via -r ./extra-markdown.in zipp==3.20.0 # via importlib-metadata diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index 6b04daff2f..b2159d1c2f 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -13,7 +13,7 @@ astor==0.8.1 # via paddlepaddle attrdict==2.0.1 # via unstructured-paddleocr -cachetools==5.4.0 +cachetools==5.5.0 # via premailer certifi==2024.7.4 # via @@ -64,13 +64,13 @@ idna==3.7 # anyio # httpx # requests -imageio==2.34.2 +imageio==2.35.1 # via # imgaug # scikit-image imgaug==0.4.0 # via unstructured-paddleocr -importlib-resources==6.4.0 +importlib-resources==6.4.3 # via matplotlib kiwisolver==1.4.5 # via matplotlib @@ -83,7 +83,7 @@ lxml==5.3.0 # -c ./base.txt # premailer # unstructured-paddleocr -matplotlib==3.9.1.post1 +matplotlib==3.9.2 # via imgaug more-itertools==10.4.0 # via cssutils diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 264e3976a9..e80f4762c2 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -6,7 +6,7 @@ # antlr4-python3-runtime==4.9.3 # via omegaconf -cachetools==5.4.0 +cachetools==5.5.0 # via google-auth certifi==2024.7.4 # via @@ -48,7 +48,7 @@ fsspec==2024.5.0 # torch google-api-core[grpc]==2.19.1 # via google-cloud-vision -google-auth==2.33.0 +google-auth==2.34.0 # via # google-api-core # google-cloud-vision @@ -58,13 +58,14 @@ googleapis-common-protos==1.63.2 # via # google-api-core # grpcio-status -grpcio==1.65.4 +grpcio==1.64.3 # via + # -c ././deps/constraints.txt # google-api-core # grpcio-status grpcio-status==1.62.3 # via google-api-core -huggingface-hub==0.24.5 +huggingface-hub==0.24.6 # via # timm # tokenizers @@ -76,7 +77,7 @@ idna==3.7 # via # -c ./base.txt # requests -importlib-resources==6.4.0 
+importlib-resources==6.4.3 # via matplotlib iopath==0.1.10 # via layoutparser @@ -92,7 +93,7 @@ lxml==5.3.0 # pikepdf markupsafe==2.1.5 # via jinja2 -matplotlib==3.9.1.post1 +matplotlib==3.9.2 # via # pycocotools # unstructured-inference @@ -120,7 +121,7 @@ onnx==1.16.2 # via # -r ./extra-pdf-image.in # unstructured-inference -onnxruntime==1.18.1 +onnxruntime==1.19.0 # via unstructured-inference opencv-python==4.8.0.76 # via @@ -147,7 +148,7 @@ pdfminer-six==20231228 # via # -r ./extra-pdf-image.in # pdfplumber -pdfplumber==0.11.3 +pdfplumber==0.11.4 # via layoutparser pikepdf==9.1.1 # via -r ./extra-pdf-image.in diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index 4f8ce4b1f7..b0a8ede171 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -27,7 +27,7 @@ fsspec==2024.5.0 # -c ././deps/constraints.txt # huggingface-hub # torch -huggingface-hub==0.24.5 +huggingface-hub==0.24.6 # via # tokenizers # transformers diff --git a/requirements/ingest/azure.txt b/requirements/ingest/azure.txt index 4d7f4b2815..336733ff59 100644 --- a/requirements/ingest/azure.txt +++ b/requirements/ingest/azure.txt @@ -6,9 +6,9 @@ # adlfs==2024.7.0 # via -r ./ingest/azure.in -aiohappyeyeballs==2.3.5 +aiohappyeyeballs==2.3.7 # via aiohttp -aiohttp==3.10.3 +aiohttp==3.10.4 # via adlfs aiosignal==1.3.1 # via aiohttp diff --git a/requirements/ingest/biomed.txt b/requirements/ingest/biomed.txt index 408202e06c..770ec68a40 100644 --- a/requirements/ingest/biomed.txt +++ b/requirements/ingest/biomed.txt @@ -10,7 +10,7 @@ beautifulsoup4==4.12.3 # bs4 bs4==0.0.2 # via -r ./ingest/biomed.in -soupsieve==2.5 +soupsieve==2.6 # via # -c ./ingest/../base.txt # beautifulsoup4 diff --git a/requirements/ingest/chroma.txt b/requirements/ingest/chroma.txt index 0dac79f285..3bf8d33bd9 100644 --- a/requirements/ingest/chroma.txt +++ b/requirements/ingest/chroma.txt @@ -20,7 +20,7 @@ backoff==2.2.1 # posthog bcrypt==4.2.0 # via chromadb -cachetools==5.4.0 
+cachetools==5.5.0 # via google-auth certifi==2024.7.4 # via @@ -51,7 +51,7 @@ exceptiongroup==1.2.2 # via # -c ./ingest/../base.txt # anyio -fastapi==0.112.0 +fastapi==0.112.1 # via chromadb filelock==3.15.4 # via huggingface-hub @@ -61,12 +61,13 @@ fsspec==2024.5.0 # via # -c ./ingest/../deps/constraints.txt # huggingface-hub -google-auth==2.33.0 +google-auth==2.34.0 # via kubernetes googleapis-common-protos==1.63.2 # via opentelemetry-exporter-otlp-proto-grpc -grpcio==1.65.4 +grpcio==1.64.3 # via + # -c ./ingest/../deps/constraints.txt # chromadb # opentelemetry-exporter-otlp-proto-grpc h11==0.14.0 @@ -76,7 +77,7 @@ h11==0.14.0 # uvicorn httptools==0.6.1 # via uvicorn -huggingface-hub==0.24.5 +huggingface-hub==0.24.6 # via tokenizers humanfriendly==10.0 # via coloredlogs @@ -88,7 +89,7 @@ idna==3.7 # requests importlib-metadata==8.2.0 # via -r ./ingest/chroma.in -importlib-resources==6.4.0 +importlib-resources==6.4.3 # via chromadb kubernetes==30.1.0 # via chromadb @@ -106,7 +107,7 @@ oauthlib==3.2.2 # via # kubernetes # requests-oauthlib -onnxruntime==1.18.1 +onnxruntime==1.19.0 # via chromadb opentelemetry-api==1.16.0 # via @@ -192,7 +193,7 @@ sniffio==1.3.1 # -c ./ingest/../base.txt # anyio # httpx -starlette==0.37.2 +starlette==0.38.2 # via fastapi sympy==1.13.2 # via onnxruntime @@ -231,9 +232,9 @@ urllib3==1.26.19 # -c ./ingest/../deps/constraints.txt # kubernetes # requests -uvicorn[standard]==0.30.5 +uvicorn[standard]==0.30.6 # via chromadb -uvloop==0.19.0 +uvloop==0.20.0 # via uvicorn watchfiles==0.23.0 # via uvicorn diff --git a/requirements/ingest/clarifai.txt b/requirements/ingest/clarifai.txt index a5f625bc71..7ec568b99b 100644 --- a/requirements/ingest/clarifai.txt +++ b/requirements/ingest/clarifai.txt @@ -15,14 +15,16 @@ charset-normalizer==3.3.2 # requests clarifai==10.7.0 # via -r ./ingest/clarifai.in -clarifai-grpc==10.7.1 +clarifai-grpc==10.7.2 # via clarifai contextlib2==21.6.0 # via schema googleapis-common-protos==1.63.2 # via 
clarifai-grpc -grpcio==1.65.4 - # via clarifai-grpc +grpcio==1.64.3 + # via + # -c ./ingest/../deps/constraints.txt + # clarifai-grpc idna==3.7 # via # -c ./ingest/../base.txt diff --git a/requirements/ingest/confluence.txt b/requirements/ingest/confluence.txt index a1787851d1..a3739bc1a2 100644 --- a/requirements/ingest/confluence.txt +++ b/requirements/ingest/confluence.txt @@ -42,7 +42,7 @@ six==1.16.0 # via # -c ./ingest/../base.txt # atlassian-python-api -soupsieve==2.5 +soupsieve==2.6 # via # -c ./ingest/../base.txt # beautifulsoup4 diff --git a/requirements/ingest/databricks-volumes.txt b/requirements/ingest/databricks-volumes.txt index 70807f674f..e9c51e790b 100644 --- a/requirements/ingest/databricks-volumes.txt +++ b/requirements/ingest/databricks-volumes.txt @@ -4,7 +4,7 @@ # # pip-compile ./ingest/databricks-volumes.in # -cachetools==5.4.0 +cachetools==5.5.0 # via google-auth certifi==2024.7.4 # via @@ -15,9 +15,9 @@ charset-normalizer==3.3.2 # via # -c ./ingest/../base.txt # requests -databricks-sdk==0.29.0 +databricks-sdk==0.30.0 # via -r ./ingest/databricks-volumes.in -google-auth==2.33.0 +google-auth==2.34.0 # via databricks-sdk idna==3.7 # via diff --git a/requirements/ingest/delta-table.txt b/requirements/ingest/delta-table.txt index aeea02966b..84bfb7cdff 100644 --- a/requirements/ingest/delta-table.txt +++ b/requirements/ingest/delta-table.txt @@ -4,7 +4,7 @@ # # pip-compile ./ingest/delta-table.in # -deltalake==0.18.2 +deltalake==0.19.0 # via -r ./ingest/delta-table.in fsspec==2024.5.0 # via @@ -16,5 +16,3 @@ numpy==1.26.4 # pyarrow pyarrow==17.0.0 # via deltalake -pyarrow-hotfix==0.6 - # via deltalake diff --git a/requirements/ingest/discord.txt b/requirements/ingest/discord.txt index ce1e81cdb8..daccb6e9f3 100644 --- a/requirements/ingest/discord.txt +++ b/requirements/ingest/discord.txt @@ -4,9 +4,9 @@ # # pip-compile ./ingest/discord.in # -aiohappyeyeballs==2.3.5 +aiohappyeyeballs==2.3.7 # via aiohttp -aiohttp==3.10.3 +aiohttp==3.10.4 # via 
discord-py aiosignal==1.3.1 # via aiohttp diff --git a/requirements/ingest/elasticsearch.txt b/requirements/ingest/elasticsearch.txt index 452f79d486..ff3055c61a 100644 --- a/requirements/ingest/elasticsearch.txt +++ b/requirements/ingest/elasticsearch.txt @@ -4,9 +4,9 @@ # # pip-compile ./ingest/elasticsearch.in # -aiohappyeyeballs==2.3.5 +aiohappyeyeballs==2.3.7 # via aiohttp -aiohttp==3.10.3 +aiohttp==3.10.4 # via elasticsearch aiosignal==1.3.1 # via aiohttp @@ -21,7 +21,7 @@ certifi==2024.7.4 # elastic-transport elastic-transport==8.15.0 # via elasticsearch -elasticsearch[async]==8.14.0 +elasticsearch[async]==8.15.0 # via -r ./ingest/elasticsearch.in frozenlist==1.4.1 # via diff --git a/requirements/ingest/embed-aws-bedrock.txt b/requirements/ingest/embed-aws-bedrock.txt index 1a32b2e791..9efe196583 100644 --- a/requirements/ingest/embed-aws-bedrock.txt +++ b/requirements/ingest/embed-aws-bedrock.txt @@ -4,9 +4,9 @@ # # pip-compile ./ingest/embed-aws-bedrock.in # -aiohappyeyeballs==2.3.5 +aiohappyeyeballs==2.3.7 # via aiohttp -aiohttp==3.10.3 +aiohttp==3.10.4 # via # langchain # langchain-community @@ -57,13 +57,13 @@ jsonpatch==1.33 # via langchain-core jsonpointer==3.0.0 # via jsonpatch -langchain==0.2.12 +langchain==0.2.14 # via langchain-community -langchain-community==0.2.11 +langchain-community==0.2.12 # via # -c ./ingest/../deps/constraints.txt # -r ./ingest/embed-aws-bedrock.in -langchain-core==0.2.29 +langchain-core==0.2.33 # via # langchain # langchain-community diff --git a/requirements/ingest/embed-huggingface.txt b/requirements/ingest/embed-huggingface.txt index 3e28584081..b91a513346 100644 --- a/requirements/ingest/embed-huggingface.txt +++ b/requirements/ingest/embed-huggingface.txt @@ -25,7 +25,7 @@ fsspec==2024.5.0 # -c ./ingest/../deps/constraints.txt # huggingface-hub # torch -huggingface-hub==0.24.5 +huggingface-hub==0.24.6 # via # langchain-huggingface # sentence-transformers @@ -45,7 +45,7 @@ jsonpatch==1.33 # via langchain-core 
jsonpointer==3.0.0 # via jsonpatch -langchain-core==0.2.29 +langchain-core==0.2.33 # via langchain-huggingface langchain-huggingface==0.0.3 # via -r ./ingest/embed-huggingface.in diff --git a/requirements/ingest/embed-octoai.txt b/requirements/ingest/embed-octoai.txt index d3d902d215..cac4f297ba 100644 --- a/requirements/ingest/embed-octoai.txt +++ b/requirements/ingest/embed-octoai.txt @@ -49,7 +49,7 @@ idna==3.7 # requests jiter==0.5.0 # via openai -openai==1.40.3 +openai==1.41.0 # via -r ./ingest/embed-octoai.in pydantic==2.8.2 # via openai diff --git a/requirements/ingest/embed-openai.txt b/requirements/ingest/embed-openai.txt index 9a59602469..619c83976e 100644 --- a/requirements/ingest/embed-openai.txt +++ b/requirements/ingest/embed-openai.txt @@ -53,13 +53,13 @@ jsonpatch==1.33 # via langchain-core jsonpointer==3.0.0 # via jsonpatch -langchain-core==0.2.29 +langchain-core==0.2.33 # via langchain-openai -langchain-openai==0.1.21 +langchain-openai==0.1.22 # via -r ./ingest/embed-openai.in langsmith==0.1.99 # via langchain-core -openai==1.40.3 +openai==1.41.0 # via langchain-openai orjson==3.10.7 # via langsmith diff --git a/requirements/ingest/embed-vertexai.txt b/requirements/ingest/embed-vertexai.txt index b01a62b834..e58a614bc9 100644 --- a/requirements/ingest/embed-vertexai.txt +++ b/requirements/ingest/embed-vertexai.txt @@ -4,9 +4,9 @@ # # pip-compile ./ingest/embed-vertexai.in # -aiohappyeyeballs==2.3.5 +aiohappyeyeballs==2.3.7 # via aiohttp -aiohttp==3.10.3 +aiohttp==3.10.4 # via # langchain # langchain-community @@ -20,7 +20,7 @@ async-timeout==4.0.3 # langchain attrs==24.2.0 # via aiohttp -cachetools==5.4.0 +cachetools==5.5.0 # via google-auth certifi==2024.7.4 # via @@ -48,7 +48,7 @@ google-api-core[grpc]==2.19.1 # google-cloud-core # google-cloud-resource-manager # google-cloud-storage -google-auth==2.33.0 +google-auth==2.34.0 # via # google-api-core # google-cloud-aiplatform @@ -56,7 +56,7 @@ google-auth==2.33.0 # google-cloud-core # 
google-cloud-resource-manager # google-cloud-storage -google-cloud-aiplatform==1.61.0 +google-cloud-aiplatform==1.62.0 # via langchain-google-vertexai google-cloud-bigquery==3.25.0 # via google-cloud-aiplatform @@ -85,8 +85,9 @@ googleapis-common-protos[grpc]==1.63.2 # grpcio-status grpc-google-iam-v1==0.13.1 # via google-cloud-resource-manager -grpcio==1.65.4 +grpcio==1.64.3 # via + # -c ./ingest/../deps/constraints.txt # google-api-core # googleapis-common-protos # grpc-google-iam-v1 @@ -102,15 +103,15 @@ jsonpatch==1.33 # via langchain-core jsonpointer==3.0.0 # via jsonpatch -langchain==0.2.12 +langchain==0.2.14 # via # -r ./ingest/embed-vertexai.in # langchain-community -langchain-community==0.2.11 +langchain-community==0.2.12 # via # -c ./ingest/../deps/constraints.txt # -r ./ingest/embed-vertexai.in -langchain-core==0.2.29 +langchain-core==0.2.33 # via # langchain # langchain-community diff --git a/requirements/ingest/embed-voyageai.txt b/requirements/ingest/embed-voyageai.txt index 50b557367f..191f807ba6 100644 --- a/requirements/ingest/embed-voyageai.txt +++ b/requirements/ingest/embed-voyageai.txt @@ -4,9 +4,9 @@ # # pip-compile ./ingest/embed-voyageai.in # -aiohappyeyeballs==2.3.5 +aiohappyeyeballs==2.3.7 # via aiohttp -aiohttp==3.10.3 +aiohttp==3.10.4 # via # langchain # voyageai @@ -44,9 +44,9 @@ jsonpatch==1.33 # via langchain-core jsonpointer==3.0.0 # via jsonpatch -langchain==0.2.12 +langchain==0.2.14 # via -r ./ingest/embed-voyageai.in -langchain-core==0.2.29 +langchain-core==0.2.33 # via # langchain # langchain-text-splitters diff --git a/requirements/ingest/gcs.txt b/requirements/ingest/gcs.txt index 07ac93d7f6..ef9e619359 100644 --- a/requirements/ingest/gcs.txt +++ b/requirements/ingest/gcs.txt @@ -4,9 +4,9 @@ # # pip-compile ./ingest/gcs.in # -aiohappyeyeballs==2.3.5 +aiohappyeyeballs==2.3.7 # via aiohttp -aiohttp==3.10.3 +aiohttp==3.10.4 # via gcsfs aiosignal==1.3.1 # via aiohttp @@ -20,7 +20,7 @@ beautifulsoup4==4.12.3 # bs4 bs4==0.0.2 # via 
-r ./ingest/gcs.in -cachetools==5.4.0 +cachetools==5.5.0 # via google-auth certifi==2024.7.4 # via @@ -48,7 +48,7 @@ google-api-core==2.19.1 # via # google-cloud-core # google-cloud-storage -google-auth==2.33.0 +google-auth==2.34.0 # via # gcsfs # google-api-core @@ -105,7 +105,7 @@ requests-oauthlib==2.0.0 # via google-auth-oauthlib rsa==4.9 # via google-auth -soupsieve==2.5 +soupsieve==2.6 # via # -c ./ingest/../base.txt # beautifulsoup4 diff --git a/requirements/ingest/google-drive.txt b/requirements/ingest/google-drive.txt index 4193eecb2f..774bef8251 100644 --- a/requirements/ingest/google-drive.txt +++ b/requirements/ingest/google-drive.txt @@ -4,7 +4,7 @@ # # pip-compile ./ingest/google-drive.in # -cachetools==5.4.0 +cachetools==5.5.0 # via google-auth certifi==2024.7.4 # via @@ -17,9 +17,9 @@ charset-normalizer==3.3.2 # requests google-api-core==2.19.1 # via google-api-python-client -google-api-python-client==2.140.0 +google-api-python-client==2.141.0 # via -r ./ingest/google-drive.in -google-auth==2.33.0 +google-auth==2.34.0 # via # google-api-core # google-api-python-client diff --git a/requirements/ingest/jira.txt b/requirements/ingest/jira.txt index b98dab7bf0..336933b8eb 100644 --- a/requirements/ingest/jira.txt +++ b/requirements/ingest/jira.txt @@ -42,7 +42,7 @@ six==1.16.0 # via # -c ./ingest/../base.txt # atlassian-python-api -soupsieve==2.5 +soupsieve==2.6 # via # -c ./ingest/../base.txt # beautifulsoup4 diff --git a/requirements/ingest/onedrive.txt b/requirements/ingest/onedrive.txt index 0fd9b13df1..3917318566 100644 --- a/requirements/ingest/onedrive.txt +++ b/requirements/ingest/onedrive.txt @@ -48,7 +48,7 @@ requests==2.32.3 # -c ./ingest/../base.txt # msal # office365-rest-python-client -soupsieve==2.5 +soupsieve==2.6 # via # -c ./ingest/../base.txt # beautifulsoup4 diff --git a/requirements/ingest/qdrant.txt b/requirements/ingest/qdrant.txt index 4fd229cc80..0068b190f7 100644 --- a/requirements/ingest/qdrant.txt +++ 
b/requirements/ingest/qdrant.txt @@ -21,8 +21,9 @@ exceptiongroup==1.2.2 # via # -c ./ingest/../base.txt # anyio -grpcio==1.65.4 +grpcio==1.64.3 # via + # -c ./ingest/../deps/constraints.txt # grpcio-tools # qdrant-client grpcio-tools==1.62.3 @@ -64,7 +65,7 @@ pydantic==2.8.2 # via qdrant-client pydantic-core==2.20.1 # via pydantic -qdrant-client==1.10.1 +qdrant-client==1.11.0 # via -r ./ingest/qdrant.in sniffio==1.3.1 # via diff --git a/requirements/ingest/s3.txt b/requirements/ingest/s3.txt index ed0380e0fb..fe7c697335 100644 --- a/requirements/ingest/s3.txt +++ b/requirements/ingest/s3.txt @@ -6,9 +6,9 @@ # aiobotocore==2.13.2 # via s3fs -aiohappyeyeballs==2.3.5 +aiohappyeyeballs==2.3.7 # via aiohttp -aiohttp==3.10.3 +aiohttp==3.10.4 # via # aiobotocore # s3fs diff --git a/requirements/ingest/singlestore.txt b/requirements/ingest/singlestore.txt index 6d51e2e451..b1278f87f6 100644 --- a/requirements/ingest/singlestore.txt +++ b/requirements/ingest/singlestore.txt @@ -42,7 +42,7 @@ requests==2.32.3 # singlestoredb singlestoredb==1.6.2 # via -r ./ingest/singlestore.in -sqlparams==6.0.1 +sqlparams==6.1.0 # via singlestoredb tomli==2.0.1 # via diff --git a/requirements/ingest/weaviate.txt b/requirements/ingest/weaviate.txt index 8d1899bb60..2b1df81622 100644 --- a/requirements/ingest/weaviate.txt +++ b/requirements/ingest/weaviate.txt @@ -32,8 +32,9 @@ exceptiongroup==1.2.2 # via # -c ./ingest/../base.txt # anyio -grpcio==1.65.4 +grpcio==1.64.3 # via + # -c ./ingest/../deps/constraints.txt # grpcio-health-checking # grpcio-tools # weaviate-client diff --git a/requirements/ingest/wikipedia.txt b/requirements/ingest/wikipedia.txt index 108838a5c7..7dea12ae32 100644 --- a/requirements/ingest/wikipedia.txt +++ b/requirements/ingest/wikipedia.txt @@ -25,7 +25,7 @@ requests==2.32.3 # via # -c ./ingest/../base.txt # wikipedia -soupsieve==2.5 +soupsieve==2.6 # via # -c ./ingest/../base.txt # beautifulsoup4 diff --git a/requirements/test.txt b/requirements/test.txt index 
a34a19e147..f0d68496f5 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -7,9 +7,7 @@ annotated-types==0.7.0 # via pydantic appdirs==1.4.4 - # via - # label-studio-sdk - # label-studio-tools + # via label-studio-tools attrs==24.2.0 # via # jsonschema @@ -47,8 +45,10 @@ flake8-print==5.0.0 # via -r ./test.in freezegun==1.5.1 # via -r ./test.in -grpcio==1.65.4 - # via -r ./test.in +grpcio==1.64.3 + # via + # -c ././deps/constraints.txt + # -r ./test.in idna==3.7 # via # -c ./base.txt @@ -61,7 +61,9 @@ jsonschema==4.23.0 jsonschema-specifications==2023.12.1 # via jsonschema label-studio-sdk==0.0.34 - # via -r ./test.in + # via + # -c ././deps/constraints.txt + # -r ./test.in label-studio-tools==0.0.4 # via label-studio-sdk liccheck==0.9.2 diff --git a/unstructured/__version__.py b/unstructured/__version__.py index a606608a74..986e0018a9 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.15.6-dev1" # pragma: no cover +__version__ = "0.15.6" # pragma: no cover diff --git a/unstructured/nlp/tokenize.py b/unstructured/nlp/tokenize.py index edbb276c62..fe39e3a77d 100644 --- a/unstructured/nlp/tokenize.py +++ b/unstructured/nlp/tokenize.py @@ -16,9 +16,9 @@ CACHE_MAX_SIZE: Final[int] = 128 -NLTK_DATA_FILENAME = "nltk_data.tgz" +NLTK_DATA_FILENAME = "nltk_data_3.8.2.tar.gz" NLTK_DATA_URL = f"https://utic-public-cf.s3.amazonaws.com/{NLTK_DATA_FILENAME}" -NLTK_DATA_SHA256 = "126faf671cd255a062c436b3d0f2d311dfeefcd92ffa43f7c3ab677309404d61" +NLTK_DATA_SHA256 = "ba2ca627c8fb1f1458c15d5a476377a5b664c19deeb99fd088ebf83e140c1663" # NOTE(robinson) - mimic default dir logic from NLTK @@ -114,13 +114,13 @@ def _download_nltk_packages_if_not_present(): tagger_available = check_for_nltk_package( package_category="taggers", - package_name="averaged_perceptron_tagger", + package_name="averaged_perceptron_tagger_eng", ) tokenizer_available = check_for_nltk_package( - package_category="tokenizers", package_name="punkt" + 
package_category="tokenizers", package_name="punkt_tab" ) - if not (tokenizer_available and tagger_available): + if (not tokenizer_available) or (not tagger_available): download_nltk_packages()