From a51be9ef16f1b55ea80f5776c201edde3722747e Mon Sep 17 00:00:00 2001 From: John <43506685+Coniferish@users.noreply.github.com> Date: Tue, 25 Jun 2024 16:40:16 -0500 Subject: [PATCH 01/13] revert unstructured-client pin and make pip-compile --- requirements/base.txt | 38 ++++++++++++++++++++++++---- requirements/deps/constraints.txt | 2 +- requirements/dev.txt | 21 +++++++++++---- requirements/extra-pdf-image.txt | 4 ++- requirements/ingest/astra.txt | 18 ++++++++++--- requirements/ingest/box.txt | 4 ++- requirements/ingest/chroma.txt | 15 ++++++++--- requirements/ingest/embed-octoai.txt | 18 ++++++++++--- requirements/ingest/embed-openai.txt | 18 ++++++++++--- requirements/ingest/gitlab.txt | 4 ++- requirements/ingest/notion.txt | 18 ++++++++++--- requirements/ingest/qdrant.txt | 18 ++++++++++--- requirements/ingest/salesforce.txt | 4 ++- requirements/ingest/weaviate.txt | 18 ++++++++++--- requirements/test.txt | 4 ++- 15 files changed, 161 insertions(+), 43 deletions(-) diff --git a/requirements/base.txt b/requirements/base.txt index 9db7f2d124..71609801cd 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -4,6 +4,10 @@ # # pip-compile ./base.in # +anyio==3.7.1 + # via + # -c ././deps/constraints.txt + # httpx backoff==2.2.1 # via -r ./base.in beautifulsoup4==4.12.3 @@ -11,6 +15,8 @@ beautifulsoup4==4.12.3 certifi==2024.6.2 # via # -c ././deps/constraints.txt + # httpcore + # httpx # requests # unstructured-client chardet==5.2.0 @@ -22,15 +28,27 @@ charset-normalizer==3.3.2 click==8.1.7 # via nltk dataclasses-json==0.6.7 - # via -r ./base.in -dataclasses-json-speakeasy==0.5.11 + # via + # -r ./base.in + # unstructured-client +deepdiff==7.0.1 # via unstructured-client emoji==2.12.1 # via -r ./base.in +exceptiongroup==1.2.1 + # via anyio filetype==1.2.0 # via -r ./base.in +h11==0.14.0 + # via httpcore +httpcore==1.0.5 + # via httpx +httpx==0.27.0 + # via unstructured-client idna==3.7 # via + # anyio + # httpx # requests # unstructured-client joblib==1.4.2 @@ -44,12 +62,13 @@ lxml==5.2.2 marshmallow==3.21.3 # via # dataclasses-json - # dataclasses-json-speakeasy # unstructured-client mypy-extensions==1.0.0 # via # typing-inspect # unstructured-client +nest-asyncio==1.6.0 + # via unstructured-client nltk==3.8.1 # via -r ./base.in numpy==1.26.4 @@ -59,6 +78,8 @@ packaging==23.2 # -c ././deps/constraints.txt # marshmallow # unstructured-client +pypdf==4.2.0 + # via unstructured-client python-dateutil==2.9.0.post0 # via unstructured-client python-iso639==2024.4.27 @@ -72,12 +93,19 @@ regex==2024.5.15 requests==2.32.3 # via # -r ./base.in + # requests-toolbelt # unstructured-client +requests-toolbelt==1.0.0 + # via unstructured-client six==1.16.0 # via # langdetect # python-dateutil # unstructured-client +sniffio==1.3.1 + # via + # anyio + # httpx soupsieve==2.5 # via beautifulsoup4 tabulate==0.9.0 @@ -90,14 +118,14 @@ typing-extensions==4.12.2 # via # -r ./base.in # emoji + # pypdf # typing-inspect # unstructured-client typing-inspect==0.9.0 # via # dataclasses-json - # dataclasses-json-speakeasy # unstructured-client -unstructured-client==0.18.0 +unstructured-client==0.23.7 # via # -c ././deps/constraints.txt # -r ./base.in diff --git a/requirements/deps/constraints.txt b/requirements/deps/constraints.txt index 80b113046b..430f7211c8 100644 --- a/requirements/deps/constraints.txt +++ b/requirements/deps/constraints.txt @@ -49,7 +49,7 @@ urllib3<1.27 botocore<1.34.52 # NOTE(jennings): pinned due to later versions not supporting api_key_auth in UnstructuredClient -unstructured-client<=0.18.0 +unstructured-client>=0.15.1 fsspec==2024.5.0 diff --git a/requirements/dev.txt b/requirements/dev.txt index c6ded47394..72c046821d 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -7,6 +7,7 @@ anyio==3.7.1 # via # -c ././deps/constraints.txt + # -c ./base.txt # httpx # jupyter-server appnope==0.1.4 @@ -66,7 +67,7 @@ comm==0.2.2 # via # ipykernel # ipywidgets -debugpy==1.8.1 +debugpy==1.8.2 # via ipykernel decorator==5.1.1 # via ipython @@ -76,6 +77,7 @@ distlib==0.3.8 # via virtualenv exceptiongroup==1.2.1 # via + # -c ./base.txt # -c ./test.txt # anyio executing==2.0.1 @@ -87,11 +89,17 @@ filelock==3.15.4 fqdn==1.5.1 # via jsonschema h11==0.14.0 - # via httpcore + # via + # -c ./base.txt + # httpcore httpcore==1.0.5 - # via httpx + # via + # -c ./base.txt + # httpx httpx==0.27.0 - # via jupyterlab + # via + # -c ./base.txt + # jupyterlab identify==2.5.36 # via pre-commit idna==3.7 @@ -218,7 +226,9 @@ nbformat==5.10.4 # nbclient # nbconvert nest-asyncio==1.6.0 - # via ipykernel + # via + # -c ./base.txt + # ipykernel nodeenv==1.9.1 # via pre-commit notebook==7.2.1 @@ -348,6 +358,7 @@ six==1.16.0 # rfc3339-validator sniffio==1.3.1 # via + # -c ./base.txt # anyio # httpx soupsieve==2.5 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 709de44651..2b5d3e2958 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -199,7 +199,9 @@ pyparsing==3.0.9 # -c ././deps/constraints.txt # matplotlib pypdf==4.2.0 - # via -r ./extra-pdf-image.in + # via + # -c ./base.txt + # -r ./extra-pdf-image.in pypdfium2==4.30.0 # via pdfplumber pytesseract==0.3.10 diff --git a/requirements/ingest/astra.txt b/requirements/ingest/astra.txt index ed13e68542..f62a4faa4b 100644 --- a/requirements/ingest/astra.txt +++ b/requirements/ingest/astra.txt @@ -6,6 +6,7 @@ # anyio==3.7.1 # via + # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt # httpx astrapy==1.3.0 @@ -34,19 +35,27 @@ click==8.1.7 deprecation==2.1.0 # via astrapy exceptiongroup==1.2.1 - # via anyio + # via + # -c ./ingest/../base.txt + # anyio geomet==0.2.1.post1 # via cassandra-driver h11==0.14.0 - # via httpcore + # via + # -c ./ingest/../base.txt + # httpcore h2==4.1.0 # via httpx hpack==4.0.0 # via h2 httpcore==1.0.5 - # via httpx + # via + # -c ./ingest/../base.txt + # httpx httpx[http2]==0.27.0 - # via astrapy + # via + # -c ./ingest/../base.txt + # astrapy hyperframe==6.0.1 # via h2 idna==3.7 @@ -80,6 +89,7 @@ six==1.16.0 # python-dateutil sniffio==1.3.1 # via + # -c ./ingest/../base.txt # anyio # httpx toml==0.10.2 diff --git a/requirements/ingest/box.txt b/requirements/ingest/box.txt index 9f77102205..b7993ed989 100644 --- a/requirements/ingest/box.txt +++ b/requirements/ingest/box.txt @@ -46,7 +46,9 @@ requests==2.32.3 # boxsdk # requests-toolbelt requests-toolbelt==1.0.0 - # via boxsdk + # via + # -c ./ingest/../base.txt + # boxsdk six==1.16.0 # via # -c ./ingest/../base.txt diff --git a/requirements/ingest/chroma.txt b/requirements/ingest/chroma.txt index 583ed9ce04..3711a3efaa 100644 --- a/requirements/ingest/chroma.txt +++ b/requirements/ingest/chroma.txt @@ -8,6 +8,7 @@ annotated-types==0.7.0 # via pydantic anyio==3.7.1 # via + # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt # httpx # starlette @@ -52,7 +53,9 @@ deprecated==1.2.14 # opentelemetry-api # opentelemetry-exporter-otlp-proto-grpc exceptiongroup==1.2.1 - # via anyio + # via + # -c ./ingest/../base.txt + # anyio fastapi==0.110.3 # via chromadb filelock==3.15.4 @@ -73,14 +76,19 @@ grpcio==1.64.1 # opentelemetry-exporter-otlp-proto-grpc h11==0.14.0 # via + # -c ./ingest/../base.txt # httpcore # uvicorn httpcore==1.0.5 - # via httpx + # via + # -c ./ingest/../base.txt + # httpx httptools==0.6.1 # via uvicorn httpx==0.27.0 - # via chromadb + # via + # -c ./ingest/../base.txt + # chromadb huggingface-hub==0.23.4 # via tokenizers humanfriendly==10.0 @@ -225,6 +233,7 @@ six==1.16.0 # python-dateutil sniffio==1.3.1 # via + # -c ./ingest/../base.txt # anyio # httpx starlette==0.37.2 diff --git a/requirements/ingest/embed-octoai.txt b/requirements/ingest/embed-octoai.txt index 5f18e2571d..4d1dfa364d 100644 --- a/requirements/ingest/embed-octoai.txt +++ b/requirements/ingest/embed-octoai.txt @@ -8,6 +8,7 @@ annotated-types==0.7.0 # via pydantic anyio==3.7.1 # via + # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt # httpx # openai @@ -25,13 +26,21 @@ charset-normalizer==3.3.2 distro==1.9.0 # via openai exceptiongroup==1.2.1 - # via anyio + # via + # -c ./ingest/../base.txt + # anyio h11==0.14.0 - # via httpcore + # via + # -c ./ingest/../base.txt + # httpcore httpcore==1.0.5 - # via httpx + # via + # -c ./ingest/../base.txt + # httpx httpx==0.27.0 - # via openai + # via + # -c ./ingest/../base.txt + # openai idna==3.7 # via # -c ./ingest/../base.txt @@ -54,6 +63,7 @@ requests==2.32.3 # tiktoken sniffio==1.3.1 # via + # -c ./ingest/../base.txt # anyio # httpx # openai diff --git a/requirements/ingest/embed-openai.txt b/requirements/ingest/embed-openai.txt index 9b447de678..ed55a592aa 100644 --- a/requirements/ingest/embed-openai.txt +++ b/requirements/ingest/embed-openai.txt @@ -14,6 +14,7 @@ annotated-types==0.7.0 # via pydantic anyio==3.7.1 # via + # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt # httpx # openai @@ -41,17 +42,25 @@ dataclasses-json==0.6.7 distro==1.9.0 # via openai exceptiongroup==1.2.1 - # via anyio + # via + # -c ./ingest/../base.txt + # anyio frozenlist==1.4.1 # via # aiohttp # aiosignal h11==0.14.0 - # via httpcore + # via + # -c ./ingest/../base.txt + # httpcore httpcore==1.0.5 - # via httpx + # via + # -c ./ingest/../base.txt + # httpx httpx==0.27.0 - # via openai + # via + # -c ./ingest/../base.txt + # openai idna==3.7 # via # -c ./ingest/../base.txt @@ -133,6 +142,7 @@ requests==2.32.3 # tiktoken sniffio==1.3.1 # via + # -c ./ingest/../base.txt # anyio # httpx # openai diff --git a/requirements/ingest/gitlab.txt b/requirements/ingest/gitlab.txt index 67a0e4bcc4..2bca710ef0 100644 --- a/requirements/ingest/gitlab.txt +++ b/requirements/ingest/gitlab.txt @@ -25,7 +25,9 @@ requests==2.32.3 # python-gitlab # requests-toolbelt requests-toolbelt==1.0.0 - # via python-gitlab + # via + # -c ./ingest/../base.txt + # python-gitlab urllib3==1.26.19 # via # -c ./ingest/../base.txt diff --git a/requirements/ingest/notion.txt b/requirements/ingest/notion.txt index 5bf0237df7..daea2e4f04 100644 --- a/requirements/ingest/notion.txt +++ b/requirements/ingest/notion.txt @@ -6,6 +6,7 @@ # anyio==3.7.1 # via + # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt # httpx certifi==2024.6.2 @@ -15,15 +16,23 @@ certifi==2024.6.2 # httpcore # httpx exceptiongroup==1.2.1 - # via anyio + # via + # -c ./ingest/../base.txt + # anyio h11==0.14.0 - # via httpcore + # via + # -c ./ingest/../base.txt + # httpcore htmlbuilder==1.0.0 # via -r ./ingest/notion.in httpcore==1.0.5 - # via httpx + # via + # -c ./ingest/../base.txt + # httpx httpx==0.27.0 - # via notion-client + # via + # -c ./ingest/../base.txt + # notion-client idna==3.7 # via # -c ./ingest/../base.txt @@ -33,5 +42,6 @@ notion-client==2.2.1 # via -r ./ingest/notion.in sniffio==1.3.1 # via + # -c ./ingest/../base.txt # anyio # httpx diff --git a/requirements/ingest/qdrant.txt b/requirements/ingest/qdrant.txt index 1f77c24e83..a038d352ba 100644 --- a/requirements/ingest/qdrant.txt +++ b/requirements/ingest/qdrant.txt @@ -8,6 +8,7 @@ annotated-types==0.7.0 # via pydantic anyio==3.7.1 # via + # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt # httpx certifi==2024.6.2 @@ -17,7 +18,9 @@ certifi==2024.6.2 # httpcore # httpx exceptiongroup==1.2.1 - # via anyio + # via + # -c ./ingest/../base.txt + # anyio grpcio==1.64.1 # via # grpcio-tools @@ -25,15 +28,21 @@ grpcio==1.64.1 grpcio-tools==1.62.2 # via qdrant-client h11==0.14.0 - # via httpcore + # via + # -c ./ingest/../base.txt + # httpcore h2==4.1.0 # via httpx hpack==4.0.0 # via h2 httpcore==1.0.5 - # via httpx + # via + # -c ./ingest/../base.txt + # httpx httpx[http2]==0.27.0 - # via qdrant-client + # via + # -c ./ingest/../base.txt + # qdrant-client hyperframe==6.0.1 # via h2 idna==3.7 @@ -59,6 +68,7 @@ qdrant-client==1.9.2 # via -r ./ingest/qdrant.in sniffio==1.3.1 # via + # -c ./ingest/../base.txt # anyio # httpx typing-extensions==4.12.2 diff --git a/requirements/ingest/salesforce.txt b/requirements/ingest/salesforce.txt index 96dcbc5b8c..2933c3fbb9 100644 --- a/requirements/ingest/salesforce.txt +++ b/requirements/ingest/salesforce.txt @@ -51,7 +51,9 @@ requests==2.32.3 requests-file==2.1.0 # via zeep requests-toolbelt==1.0.0 - # via zeep + # via + # -c ./ingest/../base.txt + # zeep simple-salesforce==1.12.6 # via -r ./ingest/salesforce.in six==1.16.0 diff --git a/requirements/ingest/weaviate.txt b/requirements/ingest/weaviate.txt index d1489574ff..47e12cc510 100644 --- a/requirements/ingest/weaviate.txt +++ b/requirements/ingest/weaviate.txt @@ -8,6 +8,7 @@ annotated-types==0.7.0 # via pydantic anyio==3.7.1 # via + # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt # httpx authlib==1.3.1 @@ -28,7 +29,9 @@ charset-normalizer==3.3.2 cryptography==42.0.8 # via authlib exceptiongroup==1.2.1 - # via anyio + # via + # -c ./ingest/../base.txt + # anyio grpcio==1.64.1 # via # grpcio-health-checking @@ -39,11 +42,17 @@ grpcio-health-checking==1.62.2 grpcio-tools==1.62.2 # via weaviate-client h11==0.14.0 - # via httpcore + # via + # -c ./ingest/../base.txt + # httpcore httpcore==1.0.5 - # via httpx + # via + # -c ./ingest/../base.txt + # httpx httpx==0.27.0 - # via weaviate-client + # via + # -c ./ingest/../base.txt + # weaviate-client idna==3.7 # via # -c ./ingest/../base.txt @@ -67,6 +76,7 @@ requests==2.32.3 # weaviate-client sniffio==1.3.1 # via + # -c ./ingest/../base.txt # anyio # httpx typing-extensions==4.12.2 diff --git a/requirements/test.txt b/requirements/test.txt index c5baa15679..af0d82b850 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -36,7 +36,9 @@ coverage[toml]==7.5.4 # -r ./test.in # pytest-cov exceptiongroup==1.2.1 - # via pytest + # via + # -c ./base.txt + # pytest flake8==7.1.0 # via # -r ./test.in From 874642f5c9180700bb702aa763d6f297bb89a6a5 Mon Sep 17 00:00:00 2001 From: John <43506685+Coniferish@users.noreply.github.com> Date: Wed, 26 Jun 2024 09:23:08 -0500 Subject: [PATCH 02/13] resolve conflict with main branch and run pip-compile again --- requirements/base.txt | 2 ++ requirements/ingest/embed-octoai.txt | 2 +- requirements/ingest/embed-openai.txt | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/requirements/base.txt b/requirements/base.txt index 71609801cd..3ead62cfe2 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -73,6 +73,8 @@ nltk==3.8.1 # via -r ./base.in numpy==1.26.4 # via -r ./base.in +ordered-set==4.1.0 + # via deepdiff packaging==23.2 # via # -c ././deps/constraints.txt diff --git a/requirements/ingest/embed-octoai.txt b/requirements/ingest/embed-octoai.txt index 4d1dfa364d..1f36e41a50 100644 --- a/requirements/ingest/embed-octoai.txt +++ b/requirements/ingest/embed-octoai.txt @@ -47,7 +47,7 @@ idna==3.7 # anyio # httpx # requests -openai==1.35.3 +openai==1.35.4 # via -r ./ingest/embed-octoai.in pydantic==2.7.4 # via openai diff --git a/requirements/ingest/embed-openai.txt b/requirements/ingest/embed-openai.txt index ed55a592aa..fbf6072ac7 100644 --- a/requirements/ingest/embed-openai.txt +++ b/requirements/ingest/embed-openai.txt @@ -106,7 +106,7 @@ numpy==1.26.4 # -c ./ingest/../base.txt # langchain # langchain-community -openai==1.35.3 +openai==1.35.4 # via -r ./ingest/embed-openai.in orjson==3.10.5 # via langsmith From cd0dabbbba515ea1b0c9c352b0b554a9fa9b4abc Mon Sep 17 00:00:00 2001 From: John <43506685+Coniferish@users.noreply.github.com> Date: Wed, 26 Jun 2024 10:36:42 -0500 Subject: [PATCH 03/13] Remove test --- test_unstructured/partition/test_api.py | 28 ------------------------- 1 file changed, 28 deletions(-) diff --git a/test_unstructured/partition/test_api.py b/test_unstructured/partition/test_api.py index 05dbe7fa7b..9477b19021 100644 --- a/test_unstructured/partition/test_api.py +++ b/test_unstructured/partition/test_api.py @@ -73,34 +73,6 @@ def test_partition_via_api_from_filename(monkeypatch): assert elements[0].metadata.filetype == "message/rfc822" -def test_partition_via_api_custom_url(monkeypatch): - """ - Assert that we can specify api_url and requests are sent to the right place - """ - mock_request = Mock(return_value=MockResponse(status_code=200)) - - monkeypatch.setattr(requests.Session, "request", mock_request) - filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE) - custom_url = "http://localhost:8000/general/v0/general" - - with open(filename, "rb") as f: - partition_via_api(file=f, api_url=custom_url, metadata_filename=filename) - - mock_request.assert_called_with( - "POST", custom_url, data=ANY, files=ANY, headers=ANY, params=ANY - ) - - # The sdk uses the server url, so we should be able to pass that as well - base_url = "http://localhost:8000" - - with open(filename, "rb") as f: - partition_via_api(file=f, api_url=base_url, metadata_filename=filename) - - mock_request.assert_called_with( - "POST", custom_url, data=ANY, files=ANY, headers=ANY, params=ANY - ) - - def test_partition_via_api_from_file(monkeypatch): monkeypatch.setattr( General, From 07b0915897affefabc2f1b3b0e98a85d78b2c0e3 Mon Sep 17 00:00:00 2001 From: John <43506685+Coniferish@users.noreply.github.com> Date: Thu, 27 Jun 2024 12:26:53 -0500 Subject: [PATCH 04/13] remove integration test --- test_unstructured/partition/test_api.py | 33 ------------------------- 1 file changed, 33 deletions(-) diff --git a/test_unstructured/partition/test_api.py b/test_unstructured/partition/test_api.py index 9477b19021..c07d564b2f 100644 --- a/test_unstructured/partition/test_api.py +++ b/test_unstructured/partition/test_api.py @@ -3,7 +3,6 @@ import json import os import pathlib -from unittest.mock import ANY, Mock import pytest import requests @@ -202,38 +201,6 @@ def test_partition_via_api_image_block_extraction(): assert isinstance(image_data, bytes) -def test_partition_via_api_pass_list_type_parameters(monkeypatch): - mock_request = Mock(return_value=MockResponse(status_code=200)) - monkeypatch.setattr(requests.Session, "request", mock_request) - - filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "embedded-images-tables.pdf") - - partition_via_api( - filename=filename, - strategy="hi_res", - extract_image_block_types=["image", "table"], - skip_infer_table_types=["pdf", "docx"], - languages=["eng"], - ) - - mock_request.assert_called_with( - "POST", - ANY, - data=ANY, - files=[ - ["extract_image_block_types[]", [None, "image"]], - ["extract_image_block_types[]", [None, "table"]], - ["files", ANY], - ["languages[]", [None, "eng"]], - ["skip_infer_table_types[]", [None, "pdf"]], - ["skip_infer_table_types[]", [None, "docx"]], - ["strategy", [None, "hi_res"]], - ], - headers=ANY, - params=ANY, - ) - - # Note(austin) - This test is way too noisy against the hosted api # def test_partition_via_api_invalid_request_data_kwargs(): # filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.pdf") From d3ffab92674b7a428ecefa6ca9545d7a1150d25d Mon Sep 17 00:00:00 2001 From: John <43506685+Coniferish@users.noreply.github.com> Date: Thu, 27 Jun 2024 12:32:34 -0500 Subject: [PATCH 05/13] fix server url in test --- test_unstructured/partition/test_api.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test_unstructured/partition/test_api.py b/test_unstructured/partition/test_api.py index c07d564b2f..b6334bb7df 100644 --- a/test_unstructured/partition/test_api.py +++ b/test_unstructured/partition/test_api.py @@ -192,6 +192,8 @@ def test_partition_via_api_image_block_extraction(): strategy="hi_res", extract_image_block_types=["image", "table"], api_key=get_api_key(), + # The url has changed since the 06/24 API release while the sdk defaults to the old url + api_url="https://api.unstructuredapp.io/general/v0/general", ) image_elements = [el for el in elements if el.category == ElementType.IMAGE] for el in image_elements: From b97d8891dbeb9503f89c6ea8ef86da2edddd3570 Mon Sep 17 00:00:00 2001 From: John <43506685+Coniferish@users.noreply.github.com> Date: Thu, 27 Jun 2024 14:26:02 -0500 Subject: [PATCH 06/13] add decorators for test that requires api key --- test_unstructured/partition/test_api.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test_unstructured/partition/test_api.py b/test_unstructured/partition/test_api.py index b6334bb7df..e53ead61c5 100644 --- a/test_unstructured/partition/test_api.py +++ b/test_unstructured/partition/test_api.py @@ -185,6 +185,8 @@ def test_partition_via_api_valid_request_data_kwargs(): assert isinstance(elements, list) +@pytest.mark.skipif(skip_outside_ci, reason="Skipping test run outside of CI") +@pytest.mark.skipif(skip_not_on_main, reason="Skipping test run outside of main branch") def test_partition_via_api_image_block_extraction(): filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "embedded-images-tables.pdf") elements = partition_via_api( From 8c5a8428acc90cf5c1fb8977639a8fdda911a8d8 Mon Sep 17 00:00:00 2001 From: John <43506685+Coniferish@users.noreply.github.com> Date: Fri, 28 Jun 2024 11:33:21 -0500 Subject: [PATCH 07/13] Linting, fix type hints, etc. --- test_unstructured/partition/test_api.py | 205 ++++++++++++++---------- test_unstructured/unit_utils.py | 16 +- unstructured/partition/api.py | 8 +- 3 files changed, 130 insertions(+), 99 deletions(-) diff --git a/test_unstructured/partition/test_api.py b/test_unstructured/partition/test_api.py index e53ead61c5..1c2b7740a9 100644 --- a/test_unstructured/partition/test_api.py +++ b/test_unstructured/partition/test_api.py @@ -7,10 +7,14 @@ import pytest import requests from unstructured_client.general import General +from unstructured_client.models import shared +from unstructured_client.models.shared import PartitionParameters from unstructured.documents.elements import ElementType, NarrativeText from unstructured.partition.api import partition_multiple_via_api, partition_via_api +from ..unit_utils import ANY, FixtureRequest, example_doc_path, method_mock + DIRECTORY = pathlib.Path(__file__).parent.resolve() EML_TEST_FILE = "eml/fake-email.eml" @@ -38,13 +42,13 @@ ]""" -class MockResponse: - def __init__(self, status_code): +class FakeResponse: + def __init__(self, status_code: int): self.status_code = status_code - # string representation of partitioned elements is nested in an additional + # The string representation of partitioned elements is nested in an additional # layer in the new unstructured-client: # `elements_from_json(text=response.raw_response.text)` - self.raw_response = MockRawResponse() + self.raw_response = FakeRawResponse() self.headers = {"Content-Type": "application/json"} def json(self): @@ -55,101 +59,129 @@ def text(self): return MOCK_TEXT -class MockRawResponse: +class FakeRawResponse: def __init__(self): self.text = MOCK_TEXT -def test_partition_via_api_from_filename(monkeypatch): - monkeypatch.setattr( - General, - "partition", - lambda *args, **kwargs: MockResponse(status_code=200), +def test_partition_via_api_from_filename(request: FixtureRequest): + partition_ = method_mock( + request, General, "partition", return_value=FakeResponse(status_code=200) ) - filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE) - elements = partition_via_api(filename=filename) + elements = partition_via_api(example_doc_path(EML_TEST_FILE)) + with open(example_doc_path(EML_TEST_FILE), "rb") as f: + file_bytes = f.read() + + partition_.assert_called_once_with( + ANY, + PartitionParameters( + files=shared.Files( + content=file_bytes, + file_name="/Users/johnjennings/src/unstructured/example-docs/eml/fake-email.eml", + ), + chunking_strategy=None, + combine_under_n_chars=None, + coordinates=False, + encoding=None, + extract_image_block_types=None, + gz_uncompressed_content_type=None, + hi_res_model_name=None, + include_orig_elements=None, + include_page_breaks=False, + languages=None, + max_characters=None, + multipage_sections=True, + new_after_n_chars=None, + ocr_languages=None, + output_format=shared.OutputFormat.APPLICATION_JSON, + overlap=0, + overlap_all=False, + pdf_infer_table_structure=True, + similarity_threshold=None, + skip_infer_table_types=None, + split_pdf_concurrency_level=5, + split_pdf_page=True, + starting_page_number=None, + strategy=shared.Strategy.AUTO, + unique_element_ids=False, + xml_keep_tags=False, + ), + ) + assert isinstance(partition_.call_args_list[0].args[0], General) + assert len(elements) == 1 assert elements[0] == NarrativeText("This is a test email to use for unit tests.") assert elements[0].metadata.filetype == "message/rfc822" -def test_partition_via_api_from_file(monkeypatch): +def test_partition_via_api_from_file(monkeypatch: pytest.MonkeyPatch): monkeypatch.setattr( - General, - "partition", - lambda *args, **kwargs: MockResponse(status_code=200), + General, "partition", lambda *args, **kwargs: FakeResponse(status_code=200) # type: ignore ) - filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE) - with open(filename, "rb") as f: - elements = partition_via_api(file=f, metadata_filename=filename) + with open(example_doc_path(EML_TEST_FILE), "rb") as f: + elements = partition_via_api(file=f, metadata_filename=example_doc_path(EML_TEST_FILE)) assert elements[0] == NarrativeText("This is a test email to use for unit tests.") assert elements[0].metadata.filetype == "message/rfc822" -def test_partition_via_api_from_file_warns_with_file_filename(monkeypatch, caplog): +def test_partition_via_api_from_file_warns_with_file_filename( + monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture +): monkeypatch.setattr( - General, - "partition", - lambda *args, **kwargs: MockResponse(status_code=200), + General, "partition", lambda *args, **kwargs: FakeResponse(status_code=200) # type: ignore ) - filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE) - with open(filename, "rb") as f: - partition_via_api(file=f, file_filename=filename) + with open(example_doc_path(EML_TEST_FILE), "rb") as f: + partition_via_api(file=f, file_filename=example_doc_path(EML_TEST_FILE)) assert "WARNING" in caplog.text assert "The file_filename kwarg will be deprecated" in caplog.text -def test_partition_via_api_from_file_raises_with_metadata_and_file_filename(monkeypatch): +def test_partition_via_api_from_file_raises_with_metadata_and_file_filename( + monkeypatch: pytest.MonkeyPatch, +): monkeypatch.setattr( - General, - "partition", - lambda *args, **kwargs: MockResponse(status_code=200), + General, "partition", lambda *args, **kwargs: FakeResponse(status_code=200) # type: ignore ) - filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE) + filename = example_doc_path(EML_TEST_FILE) with open(filename, "rb") as f, pytest.raises(ValueError): partition_via_api(file=f, file_filename=filename, metadata_filename=filename) -def test_partition_via_api_from_file_raises_without_filename(monkeypatch): +def test_partition_via_api_from_file_raises_without_filename(monkeypatch: pytest.MonkeyPatch): monkeypatch.setattr( - General, - "partition", - lambda *args, **kwargs: MockResponse(status_code=200), + General, "partition", lambda *args, **kwargs: FakeResponse(status_code=200) # type: ignore ) - filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE) - with open(filename, "rb") as f, pytest.raises(ValueError): + with open(example_doc_path(EML_TEST_FILE), "rb") as f, pytest.raises(ValueError): partition_via_api(file=f) -def test_partition_via_api_raises_with_bad_response(monkeypatch): +def test_partition_via_api_raises_with_bad_response(monkeypatch: pytest.MonkeyPatch): monkeypatch.setattr( - General, - "partition", - lambda *args, **kwargs: MockResponse(status_code=500), + General, "partition", lambda *args, **kwargs: FakeResponse(status_code=500) # type: ignore ) - filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE) with pytest.raises(ValueError): - partition_via_api(filename=filename) + partition_via_api(filename=example_doc_path(EML_TEST_FILE)) @pytest.mark.skipif(skip_outside_ci, reason="Skipping test run outside of CI") @pytest.mark.skipif(skip_not_on_main, reason="Skipping test run outside of main branch") def test_partition_via_api_with_no_strategy(): - filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.pdf") - elements_no_strategy = partition_via_api( - filename=filename, + filename=example_doc_path("layout-parser-paper-fast.pdf"), strategy="auto", api_key=get_api_key(), skip_infer_table_types=["pdf"], ) elements_hi_res = partition_via_api( - filename=filename, strategy="hi_res", api_key=get_api_key(), skip_infer_table_types=["pdf"] + filename=example_doc_path("layout-parser-paper-fast.pdf"), + strategy="hi_res", + api_key=get_api_key(), + skip_infer_table_types=["pdf"], ) # confirm that hi_res strategy was not passed as default to partition by comparing outputs @@ -162,11 +194,9 @@ def test_partition_via_api_with_no_strategy(): @pytest.mark.skipif(skip_outside_ci, reason="Skipping test run outside of CI") @pytest.mark.skipif(skip_not_on_main, reason="Skipping test run outside of main branch") def test_partition_via_api_with_image_hi_res_strategy_includes_coordinates(): - filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.jpg") - # coordinates not included by default to limit payload size elements = partition_via_api( - filename=filename, + filename=example_doc_path("layout-parser-paper-fast.pdf"), strategy="hi_res", coordinates="true", api_key=get_api_key(), @@ -178,9 +208,11 @@ def test_partition_via_api_with_image_hi_res_strategy_includes_coordinates(): @pytest.mark.skipif(skip_outside_ci, reason="Skipping test run outside of CI") @pytest.mark.skipif(skip_not_on_main, reason="Skipping test run outside of main branch") def test_partition_via_api_valid_request_data_kwargs(): - filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.pdf") - - elements = partition_via_api(filename=filename, strategy="fast", api_key=get_api_key()) + elements = partition_via_api( + filename=example_doc_path("layout-parser-paper-fast.pdf"), + strategy="fast", + api_key=get_api_key(), + ) assert isinstance(elements, list) @@ -188,9 +220,8 @@ def test_partition_via_api_valid_request_data_kwargs(): @pytest.mark.skipif(skip_outside_ci, reason="Skipping test run outside of CI") @pytest.mark.skipif(skip_not_on_main, reason="Skipping test run outside of main branch") def test_partition_via_api_image_block_extraction(): - filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "embedded-images-tables.pdf") elements = partition_via_api( - filename=filename, + filename=example_doc_path("embedded-images-tables.pdf"), strategy="hi_res", extract_image_block_types=["image", "table"], api_key=get_api_key(), @@ -212,8 +243,8 @@ def test_partition_via_api_image_block_extraction(): # partition_via_api(filename=filename, strategy="not_a_strategy") -class MockMultipleResponse: - def __init__(self, status_code): +class FakeMultipleResponse: + def __init__(self, status_code: int): self.status_code = status_code def json(self): @@ -261,11 +292,9 @@ def text(self): ]""" -def test_partition_multiple_via_api_with_single_filename(monkeypatch): +def test_partition_multiple_via_api_with_single_filename(monkeypatch: pytest.MonkeyPatch): monkeypatch.setattr( - requests, - "post", - lambda *args, **kwargs: MockResponse(status_code=200), + requests, "post", lambda *args, **kwargs: FakeResponse(status_code=200) # type: ignore ) filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE) @@ -274,31 +303,30 @@ def test_partition_multiple_via_api_with_single_filename(monkeypatch): assert elements[0][0].metadata.filetype == "message/rfc822" -def test_partition_multiple_via_api_from_filenames(monkeypatch): +def test_partition_multiple_via_api_from_filenames(monkeypatch: pytest.MonkeyPatch): monkeypatch.setattr( requests, "post", - lambda *args, **kwargs: MockMultipleResponse(status_code=200), + lambda *args, **kwargs: FakeMultipleResponse(status_code=200), # type: ignore ) - filenames = [ os.path.join(DIRECTORY, "..", "..", "example-docs", "eml/fake-email.eml"), os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"), ] elements = partition_multiple_via_api(filenames=filenames) + assert len(elements) == 2 assert elements[0][0] == NarrativeText("This is a test email to use for unit tests.") assert elements[0][0].metadata.filetype == "message/rfc822" -def test_partition_multiple_via_api_from_files(monkeypatch): +def test_partition_multiple_via_api_from_files(monkeypatch: pytest.MonkeyPatch): monkeypatch.setattr( requests, "post", - lambda *args, **kwargs: MockMultipleResponse(status_code=200), + lambda *args, **kwargs: FakeMultipleResponse(status_code=200), # type: ignore ) - filenames = [ os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE), os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"), @@ -315,13 +343,14 @@ def test_partition_multiple_via_api_from_files(monkeypatch): assert elements[0][0].metadata.filetype == "message/rfc822" -def test_partition_multiple_via_api_warns_with_file_filename(monkeypatch, caplog): +def test_partition_multiple_via_api_warns_with_file_filename( + monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture +): monkeypatch.setattr( requests, "post", - lambda *args, **kwargs: MockMultipleResponse(status_code=200), + lambda *args, **kwargs: FakeMultipleResponse(status_code=200), # type: ignore ) - filenames = [ os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE), os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"), @@ -337,11 +366,11 @@ def test_partition_multiple_via_api_warns_with_file_filename(monkeypatch, caplog assert "The file_filenames kwarg will be deprecated" in caplog.text -def test_partition_multiple_via_api_warns_with_file_and_metadata_filename(monkeypatch): +def test_partition_multiple_via_api_warns_with_file_and_metadata_filename( + monkeypatch: pytest.MonkeyPatch, +): monkeypatch.setattr( - requests, - "post", - lambda *args, **kwargs: MockMultipleResponse(status_code=200), + General, "partition", lambda *args, **kwargs: FakeResponse(status_code=200) # type: ignore ) filenames = [ @@ -359,13 +388,10 @@ def test_partition_multiple_via_api_warns_with_file_and_metadata_filename(monkey ) -def test_partition_multiple_via_api_raises_with_bad_response(monkeypatch): +def test_partition_multiple_via_api_raises_with_bad_response(monkeypatch: pytest.MonkeyPatch): monkeypatch.setattr( - requests, - "post", - lambda *args, **kwargs: MockMultipleResponse(status_code=500), + General, "partition", lambda *args, **kwargs: FakeResponse(status_code=500) # type: ignore ) - filenames = [ os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE), os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"), @@ -375,13 +401,12 @@ def test_partition_multiple_via_api_raises_with_bad_response(monkeypatch): partition_multiple_via_api(filenames=filenames) -def test_partition_multiple_via_api_raises_with_content_types_size_mismatch(monkeypatch): +def test_partition_multiple_via_api_raises_with_content_types_size_mismatch( + monkeypatch: pytest.MonkeyPatch, +): monkeypatch.setattr( - requests, - "post", - lambda *args, **kwargs: MockMultipleResponse(status_code=500), + General, "partition", lambda *args, **kwargs: FakeResponse(status_code=500) # type: ignore ) - filenames = [ os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml"), os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"), @@ -394,11 +419,13 @@ def test_partition_multiple_via_api_raises_with_content_types_size_mismatch(monk ) -def test_partition_multiple_via_api_from_files_raises_with_size_mismatch(monkeypatch): +def test_partition_multiple_via_api_from_files_raises_with_size_mismatch( + monkeypatch: pytest.MonkeyPatch, +): monkeypatch.setattr( requests, "post", - lambda *args, **kwargs: MockMultipleResponse(status_code=200), + lambda *args, **kwargs: FakeMultipleResponse(status_code=200), # type: ignore ) filenames = [ @@ -416,11 +443,13 @@ def test_partition_multiple_via_api_from_files_raises_with_size_mismatch(monkeyp ) -def test_partition_multiple_via_api_from_files_raises_without_filenames(monkeypatch): +def test_partition_multiple_via_api_from_files_raises_without_filenames( + monkeypatch: pytest.MonkeyPatch, +): monkeypatch.setattr( requests, "post", - lambda *args, **kwargs: MockMultipleResponse(status_code=200), + lambda *args, **kwargs: FakeMultipleResponse(status_code=200), # type: ignore ) filenames = [ diff --git a/test_unstructured/unit_utils.py b/test_unstructured/unit_utils.py index c5a7c1c186..968ba6ab9a 100644 --- a/test_unstructured/unit_utils.py +++ b/test_unstructured/unit_utils.py @@ -128,7 +128,7 @@ def cls_attr_mock( attr_name: str, name: str | None = None, **kwargs: Any, -): +) -> Mock: """Return a mock for attribute `attr_name` on `cls`. Patch is reversed after pytest uses it. @@ -151,7 +151,9 @@ def function_mock( return _patch.start() -def initializer_mock(request: FixtureRequest, cls: type, autospec: bool = True, **kwargs: Any): +def initializer_mock( + request: FixtureRequest, cls: type, autospec: bool = True, **kwargs: Any +) -> Mock: """Return mock for __init__() method on `cls`. The patch is reversed after pytest uses it. @@ -167,7 +169,7 @@ def instance_mock( name: str | None = None, spec_set: bool = True, **kwargs: Any, -): +) -> Mock: """Return a mock for an instance of `cls` that draws its spec from the class. The mock does not allow new attributes to be set on the instance. If `name` is missing or @@ -178,7 +180,7 @@ def instance_mock( return create_autospec(cls, _name=name, spec_set=spec_set, instance=True, **kwargs) -def loose_mock(request: FixtureRequest, name: str | None = None, **kwargs: Any): +def loose_mock(request: FixtureRequest, name: str | None = None, **kwargs: Any) -> Mock: """Return a "loose" mock, meaning it has no spec to constrain calls on it. Additional keyword arguments are passed through to Mock(). If called without a name, it is @@ -195,7 +197,7 @@ def method_mock( method_name: str, autospec: bool = True, **kwargs: Any, -): +) -> Mock: """Return mock for method `method_name` on `cls`. The patch is reversed after pytest uses it. @@ -205,7 +207,7 @@ def method_mock( return _patch.start() -def open_mock(request: FixtureRequest, module_name: str, **kwargs: Any): +def open_mock(request: FixtureRequest, module_name: str, **kwargs: Any) -> Mock: """Return a mock for the builtin `open()` method in `module_name`.""" target = "%s.open" % module_name _patch = patch(target, mock_open(), create=True, **kwargs) @@ -223,7 +225,7 @@ def property_mock(request: FixtureRequest, cls: type, prop_name: str, **kwargs: return _patch.start() -def var_mock(request: FixtureRequest, q_var_name: str, **kwargs: Any): +def var_mock(request: FixtureRequest, q_var_name: str, **kwargs: Any) -> Mock: """Return a mock patching the variable with qualified name `q_var_name`. Patch is reversed after calling test returns. diff --git a/unstructured/partition/api.py b/unstructured/partition/api.py index bd1c5f78b2..c5094d77f1 100644 --- a/unstructured/partition/api.py +++ b/unstructured/partition/api.py @@ -1,7 +1,7 @@ from __future__ import annotations import contextlib -from typing import IO, Optional +from typing import IO, Any, Optional, Sequence import requests from unstructured_client import UnstructuredClient @@ -21,7 +21,7 @@ def partition_via_api( api_url: str = "https://api.unstructured.io/general/v0/general", api_key: str = "", metadata_filename: Optional[str] = None, - **request_kwargs, + **request_kwargs: Any, ) -> list[Element]: """Partitions a document using the Unstructured REST API. This is equivalent to running the document through partition. @@ -97,12 +97,12 @@ def partition_via_api( def partition_multiple_via_api( filenames: Optional[list[str]] = None, content_types: Optional[list[str]] = None, - files: Optional[list[str]] = None, + files: Optional[Sequence[IO[bytes]]] = None, file_filenames: Optional[list[str]] = None, api_url: str = "https://api.unstructured.io/general/v0/general", api_key: str = "", metadata_filenames: Optional[list[str]] = None, - **request_kwargs, + **request_kwargs: Any, ) -> list[list[Element]]: """Partitions multiple documents using the Unstructured REST API by batching the documents into a single HTTP request. From 27f8d3ead84b798c3d1e43b77a8e8f786a5bcfa2 Mon Sep 17 00:00:00 2001 From: John <43506685+Coniferish@users.noreply.github.com> Date: Fri, 28 Jun 2024 12:56:11 -0500 Subject: [PATCH 08/13] enable chunking args to be passed to partition_via_api --- .../ingest/pipeline/reformat/chunking.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/unstructured/ingest/pipeline/reformat/chunking.py b/unstructured/ingest/pipeline/reformat/chunking.py index d8f85591dc..ac2a5cabea 100644 --- a/unstructured/ingest/pipeline/reformat/chunking.py +++ b/unstructured/ingest/pipeline/reformat/chunking.py @@ -107,23 +107,19 @@ def chunk(self, elements_json_file: str) -> Optional[list[Element]]: if self.partition_config.partition_by_api: return partition_via_api( filename=elements_json_file, - # -- If api_key or api_url are None, partition_via_api will raise an error, which - # -- will be caught and logged by Chunker.run() + # -- (jennings) If api_key or api_url are None, partition_via_api will raise an + # -- error, which will be caught and logged by Chunker.run() api_key=self.partition_config.api_key, # type: ignore api_url=self.partition_config.partition_endpoint, # type: ignore chunking_strategy=self.chunking_config.chunking_strategy, combine_under_n_chars=self.chunking_config.combine_text_under_n_chars, - # -- These are not currently supported by the sdk and raise an error. - # -- They will need to be updated once we bump the client version. - # combine_text_under_n_chars=self.chunking_config.combine_text_under_n_chars, - # include_orig_elements=self.chunking_config.include_orig_elements, + combine_text_under_n_chars=self.chunking_config.combine_text_under_n_chars, + include_orig_elements=self.chunking_config.include_orig_elements, max_characters=self.chunking_config.max_characters, multipage_sections=self.chunking_config.multipage_sections, new_after_n_chars=self.chunking_config.new_after_n_chars, - # -- These are not supported by older versions of unstructured-client, which we have - # -- pinned to <=0.18.0 - # overlap=self.chunking_config.overlap, - # overlap_all=self.chunking_config.overlap_all, + overlap=self.chunking_config.overlap, + overlap_all=self.chunking_config.overlap_all, ) # -- Warn that the defined chunking_strategy is not locally available -- logger.warning( From 535b2e413124679a6729bfde80354537efb01b78 Mon Sep 17 00:00:00 2001 From: John <43506685+Coniferish@users.noreply.github.com> Date: Fri, 28 Jun 2024 13:19:29 -0500 Subject: [PATCH 09/13] extract partition_mock_ to fixture and move other helper classes to bottom of the file --- test_unstructured/partition/test_api.py | 302 ++++++++++++------------ 1 file changed, 155 insertions(+), 147 deletions(-) diff --git a/test_unstructured/partition/test_api.py b/test_unstructured/partition/test_api.py index 1c2b7740a9..465b2c9daa 100644 --- a/test_unstructured/partition/test_api.py +++ b/test_unstructured/partition/test_api.py @@ -3,6 +3,7 @@ import json import os import pathlib +from typing import Any import pytest import requests @@ -17,97 +18,19 @@ DIRECTORY = pathlib.Path(__file__).parent.resolve() -EML_TEST_FILE = "eml/fake-email.eml" - skip_outside_ci = os.getenv("CI", "").lower() in {"", "false", "f", "0"} skip_not_on_main = os.getenv("GITHUB_REF_NAME", "").lower() != "main" -MOCK_TEXT = """[ - { - "element_id": "f49fbd614ddf5b72e06f59e554e6ae2b", - "text": "This is a test email to use for unit tests.", - "type": "NarrativeText", - "metadata": { - "sent_from": [ - "Matthew Robinson " - ], - "sent_to": [ - "Matthew Robinson " - ], - "subject": "Test Email", - "filename": "fake-email.eml", - "filetype": "message/rfc822" - } - } -]""" - - -class FakeResponse: - def __init__(self, status_code: int): - self.status_code = status_code - # The string representation of partitioned elements is nested in an additional - # layer in the new unstructured-client: - # `elements_from_json(text=response.raw_response.text)` - self.raw_response = FakeRawResponse() - self.headers = {"Content-Type": "application/json"} - def json(self): - return json.loads(self.text) - - @property - def text(self): - return MOCK_TEXT - - -class FakeRawResponse: - def __init__(self): - self.text = MOCK_TEXT - - -def test_partition_via_api_from_filename(request: FixtureRequest): - partition_ = method_mock( +def test_partition_via_api_from_filename(request: FixtureRequest, expected_call: tuple[Any, Any]): + partition_mock_ = method_mock( request, General, "partition", return_value=FakeResponse(status_code=200) ) - elements = partition_via_api(example_doc_path(EML_TEST_FILE)) - with open(example_doc_path(EML_TEST_FILE), "rb") as f: - file_bytes = f.read() - partition_.assert_called_once_with( - ANY, - PartitionParameters( - files=shared.Files( - content=file_bytes, - file_name="/Users/johnjennings/src/unstructured/example-docs/eml/fake-email.eml", - ), - chunking_strategy=None, - combine_under_n_chars=None, - coordinates=False, - encoding=None, - extract_image_block_types=None, - gz_uncompressed_content_type=None, - hi_res_model_name=None, - include_orig_elements=None, - include_page_breaks=False, - languages=None, - max_characters=None, - multipage_sections=True, - new_after_n_chars=None, - ocr_languages=None, - output_format=shared.OutputFormat.APPLICATION_JSON, - overlap=0, - overlap_all=False, - pdf_infer_table_structure=True, - similarity_threshold=None, - skip_infer_table_types=None, - split_pdf_concurrency_level=5, - split_pdf_page=True, - starting_page_number=None, - strategy=shared.Strategy.AUTO, - unique_element_ids=False, - xml_keep_tags=False, - ), - ) - assert isinstance(partition_.call_args_list[0].args[0], General) + elements = partition_via_api(example_doc_path("eml/fake-email.eml")) + + partition_mock_.assert_called_once_with(*expected_call) + assert isinstance(partition_mock_.call_args_list[0].args[0], General) assert len(elements) == 1 assert elements[0] == NarrativeText("This is a test email to use for unit tests.") assert elements[0].metadata.filetype == "message/rfc822" @@ -118,8 +41,10 @@ def test_partition_via_api_from_file(monkeypatch: pytest.MonkeyPatch): General, "partition", lambda *args, **kwargs: FakeResponse(status_code=200) # type: ignore ) - with open(example_doc_path(EML_TEST_FILE), "rb") as f: - elements = partition_via_api(file=f, metadata_filename=example_doc_path(EML_TEST_FILE)) + with open(example_doc_path("eml/fake-email.eml"), "rb") as f: + elements = partition_via_api( + file=f, metadata_filename=example_doc_path("eml/fake-email.eml") + ) assert elements[0] == NarrativeText("This is a test email to use for unit tests.") assert elements[0].metadata.filetype == "message/rfc822" @@ -131,8 +56,8 @@ def test_partition_via_api_from_file_warns_with_file_filename( General, "partition", lambda *args, **kwargs: FakeResponse(status_code=200) # type: ignore ) - with open(example_doc_path(EML_TEST_FILE), "rb") as f: - partition_via_api(file=f, file_filename=example_doc_path(EML_TEST_FILE)) + with open(example_doc_path("eml/fake-email.eml"), "rb") as f: + partition_via_api(file=f, file_filename=example_doc_path("eml/fake-email.eml")) assert "WARNING" in caplog.text assert "The file_filename kwarg will be deprecated" in caplog.text @@ -144,7 +69,7 @@ def test_partition_via_api_from_file_raises_with_metadata_and_file_filename( monkeypatch.setattr( General, "partition", lambda *args, **kwargs: FakeResponse(status_code=200) # type: ignore ) - filename = example_doc_path(EML_TEST_FILE) + filename = example_doc_path("eml/fake-email.eml") with open(filename, "rb") as f, pytest.raises(ValueError): partition_via_api(file=f, file_filename=filename, metadata_filename=filename) @@ -155,7 +80,7 @@ def test_partition_via_api_from_file_raises_without_filename(monkeypatch: pytest General, "partition", lambda *args, **kwargs: FakeResponse(status_code=200) # type: ignore ) - with open(example_doc_path(EML_TEST_FILE), "rb") as f, pytest.raises(ValueError): + with open(example_doc_path("eml/fake-email.eml"), "rb") as f, pytest.raises(ValueError): partition_via_api(file=f) @@ -165,7 +90,7 @@ def test_partition_via_api_raises_with_bad_response(monkeypatch: pytest.MonkeyPa ) with pytest.raises(ValueError): - partition_via_api(filename=example_doc_path(EML_TEST_FILE)) + partition_via_api(filename=example_doc_path("eml/fake-email.eml")) @pytest.mark.skipif(skip_outside_ci, reason="Skipping test run outside of CI") @@ -243,60 +168,11 @@ def test_partition_via_api_image_block_extraction(): # partition_via_api(filename=filename, strategy="not_a_strategy") -class FakeMultipleResponse: - def __init__(self, status_code: int): - self.status_code = status_code - - def json(self): - return json.loads(self.text) - - @property - def text(self): - return """[ - [ - { - "element_id": "f49fbd614ddf5b72e06f59e554e6ae2b", - "text": "This is a test email to use for unit tests.", - "type": "NarrativeText", - "metadata": { - "sent_from": [ - "Matthew Robinson " - ], - "sent_to": [ - "Matthew Robinson " - ], - "subject": "Test Email", - "filename": "fake-email.eml", - "filetype": "message/rfc822" - } - } - ], - [ - { - "element_id": "f49fbd614ddf5b72e06f59e554e6ae2b", - "text": "This is a test email to use for unit tests.", - "type": "NarrativeText", - "metadata": { - "sent_from": [ - "Matthew Robinson " - ], - "sent_to": [ - "Matthew Robinson " - ], - "subject": "Test Email", - "filename": "fake-email.eml", - "filetype": "message/rfc822" - } - } - ] -]""" - - def test_partition_multiple_via_api_with_single_filename(monkeypatch: pytest.MonkeyPatch): monkeypatch.setattr( requests, "post", lambda *args, **kwargs: FakeResponse(status_code=200) # type: ignore ) - filename = os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE) + filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "eml/fake-email.eml") elements = partition_multiple_via_api(filenames=[filename]) assert elements[0][0] == NarrativeText("This is a test email to use for unit tests.") @@ -328,7 +204,7 @@ def test_partition_multiple_via_api_from_files(monkeypatch: pytest.MonkeyPatch): lambda *args, **kwargs: FakeMultipleResponse(status_code=200), # type: ignore ) filenames = [ - os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE), + os.path.join(DIRECTORY, "..", "..", "example-docs", "eml/fake-email.eml"), os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"), ] @@ -352,7 +228,7 @@ def test_partition_multiple_via_api_warns_with_file_filename( lambda *args, **kwargs: FakeMultipleResponse(status_code=200), # type: ignore ) filenames = [ - os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE), + os.path.join(DIRECTORY, "..", "..", "example-docs", "eml/fake-email.eml"), os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"), ] @@ -374,7 +250,7 @@ def test_partition_multiple_via_api_warns_with_file_and_metadata_filename( ) filenames = [ - os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE), + os.path.join(DIRECTORY, "..", "..", "example-docs", "eml/fake-email.eml"), os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"), ] @@ -393,7 +269,7 @@ def test_partition_multiple_via_api_raises_with_bad_response(monkeypatch: pytest General, "partition", lambda *args, **kwargs: FakeResponse(status_code=500) # type: ignore ) filenames = [ - os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE), + os.path.join(DIRECTORY, "..", "..", "example-docs", "eml/fake-email.eml"), os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"), ] @@ -429,7 +305,7 @@ def test_partition_multiple_via_api_from_files_raises_with_size_mismatch( ) filenames = [ - os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE), + os.path.join(DIRECTORY, "..", "..", "example-docs", "eml/fake-email.eml"), os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"), ] @@ -453,7 +329,7 @@ def test_partition_multiple_via_api_from_files_raises_without_filenames( ) filenames = [ - os.path.join(DIRECTORY, "..", "..", "example-docs", EML_TEST_FILE), + os.path.join(DIRECTORY, "..", "..", "example-docs", "eml/fake-email.eml"), os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"), ] @@ -500,3 +376,135 @@ def test_partition_multiple_via_api_invalid_request_data_kwargs(): strategy="not_a_strategy", api_key=get_api_key(), ) + + +MOCK_TEXT = """[ + { + "element_id": "f49fbd614ddf5b72e06f59e554e6ae2b", + "text": "This is a test email to use for unit tests.", + "type": "NarrativeText", + "metadata": { + "sent_from": [ + "Matthew Robinson " + ], + "sent_to": [ + "Matthew Robinson " + ], + "subject": "Test Email", + "filename": "fake-email.eml", + "filetype": "message/rfc822" + } + } +]""" + + +class FakeResponse: + def __init__(self, status_code: int): + self.status_code = status_code + # The string representation of partitioned elements is nested in an additional + # layer in the new unstructured-client: + # `elements_from_json(text=response.raw_response.text)` + self.raw_response = FakeRawResponse() + self.headers = {"Content-Type": "application/json"} + + def json(self): + return json.loads(self.text) + + @property + def text(self): + return MOCK_TEXT + + +class FakeRawResponse: + def __init__(self): + self.text = MOCK_TEXT + + +class FakeMultipleResponse: + def __init__(self, status_code: int): + self.status_code = status_code + + def json(self): + return json.loads(self.text) + + @property + def text(self): + return """[ + [ + { + "element_id": "f49fbd614ddf5b72e06f59e554e6ae2b", + "text": "This is a test email to use for unit tests.", + "type": "NarrativeText", + "metadata": { + "sent_from": [ + "Matthew Robinson " + ], + "sent_to": [ + "Matthew Robinson " + ], + "subject": "Test Email", + "filename": "fake-email.eml", + "filetype": "message/rfc822" + } + } + ], + [ + { + "element_id": "f49fbd614ddf5b72e06f59e554e6ae2b", + "text": "This is a test email to use for unit tests.", + "type": "NarrativeText", + "metadata": { + "sent_from": [ + "Matthew Robinson " + ], + "sent_to": [ + "Matthew Robinson " + ], + "subject": "Test Email", + "filename": "fake-email.eml", + "filetype": "message/rfc822" + } + } + ] +]""" + + +@pytest.fixture() +def expected_call(): + with open(example_doc_path("eml/fake-email.eml"), "rb") as f: + file_bytes = f.read() + return ( + ANY, + PartitionParameters( + files=shared.Files( + content=file_bytes, + file_name=example_doc_path("eml/fake-email.eml"), + ), + chunking_strategy=None, + combine_under_n_chars=None, + coordinates=False, + encoding=None, + extract_image_block_types=None, + gz_uncompressed_content_type=None, + hi_res_model_name=None, + include_orig_elements=None, + include_page_breaks=False, + languages=None, + max_characters=None, + multipage_sections=True, + new_after_n_chars=None, + ocr_languages=None, + output_format=shared.OutputFormat.APPLICATION_JSON, + overlap=0, + overlap_all=False, + pdf_infer_table_structure=True, + similarity_threshold=None, + skip_infer_table_types=None, + split_pdf_concurrency_level=5, + split_pdf_page=True, + starting_page_number=None, + strategy=shared.Strategy.AUTO, + unique_element_ids=False, + xml_keep_tags=False, + ), + ) From 9e09148a716d287995b5822d8446a08778b4d46a Mon Sep 17 00:00:00 2001 From: John <43506685+Coniferish@users.noreply.github.com> Date: Mon, 1 Jul 2024 10:49:48 -0500 Subject: [PATCH 10/13] refactor tests to remove monkeypatches and use real mocks that we can assert on --- test_unstructured/partition/test_api.py | 241 ++++++++++++------------ 1 file changed, 125 insertions(+), 116 deletions(-) diff --git a/test_unstructured/partition/test_api.py b/test_unstructured/partition/test_api.py index 465b2c9daa..85be972102 100644 --- a/test_unstructured/partition/test_api.py +++ b/test_unstructured/partition/test_api.py @@ -22,75 +22,84 @@ skip_not_on_main = os.getenv("GITHUB_REF_NAME", "").lower() != "main" -def test_partition_via_api_from_filename(request: FixtureRequest, expected_call: tuple[Any, Any]): +def test_partition_via_api_with_filename_correctly_calls_sdk( + request: FixtureRequest, expected_call_: list[Any] +): partition_mock_ = method_mock( request, General, "partition", return_value=FakeResponse(status_code=200) ) - elements = partition_via_api(example_doc_path("eml/fake-email.eml")) + elements = partition_via_api(filename=example_doc_path("eml/fake-email.eml")) - partition_mock_.assert_called_once_with(*expected_call) + partition_mock_.assert_called_once_with(*expected_call_) assert isinstance(partition_mock_.call_args_list[0].args[0], General) assert len(elements) == 1 assert elements[0] == NarrativeText("This is a test email to use for unit tests.") assert elements[0].metadata.filetype == "message/rfc822" -def test_partition_via_api_from_file(monkeypatch: pytest.MonkeyPatch): - monkeypatch.setattr( - General, "partition", lambda *args, **kwargs: FakeResponse(status_code=200) # type: ignore +def test_partition_via_api_with_file_correctly_calls_sdk( + request: FixtureRequest, expected_call_: list[Any] +): + partition_mock_ = method_mock( + request, General, "partition", return_value=FakeResponse(status_code=200) ) with open(example_doc_path("eml/fake-email.eml"), "rb") as f: elements = partition_via_api( file=f, metadata_filename=example_doc_path("eml/fake-email.eml") ) + + # Update the fixture content to match the format passed to partition_via_api + modified_expected_call = expected_call_[:] + modified_expected_call[1].files.content = f + + partition_mock_.assert_called_once_with(*modified_expected_call) + assert isinstance(partition_mock_.call_args_list[0].args[0], General) + assert len(elements) == 1 assert elements[0] == NarrativeText("This is a test email to use for unit tests.") assert elements[0].metadata.filetype == "message/rfc822" -def test_partition_via_api_from_file_warns_with_file_filename( - monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture +def test_partition_via_api_warns_with_file_and_filename_and_calls_sdk( + request: FixtureRequest, expected_call_: list[Any], caplog: pytest.LogCaptureFixture ): - monkeypatch.setattr( - General, "partition", lambda *args, **kwargs: FakeResponse(status_code=200) # type: ignore + partition_mock_ = method_mock( + request, General, "partition", return_value=FakeResponse(status_code=200) ) with open(example_doc_path("eml/fake-email.eml"), "rb") as f: partition_via_api(file=f, file_filename=example_doc_path("eml/fake-email.eml")) + # Update the fixture content to match the format passed to partition_via_api + modified_expected_call = expected_call_[:] + modified_expected_call[1].files.content = f + + partition_mock_.assert_called_once_with(*modified_expected_call) assert "WARNING" in caplog.text assert "The file_filename kwarg will be deprecated" in caplog.text -def test_partition_via_api_from_file_raises_with_metadata_and_file_filename( - monkeypatch: pytest.MonkeyPatch, -): - monkeypatch.setattr( - General, "partition", lambda *args, **kwargs: FakeResponse(status_code=200) # type: ignore - ) +def test_partition_via_api_from_file_raises_with_metadata_and_file_and_filename(): filename = example_doc_path("eml/fake-email.eml") with open(filename, "rb") as f, pytest.raises(ValueError): partition_via_api(file=f, file_filename=filename, metadata_filename=filename) -def test_partition_via_api_from_file_raises_without_filename(monkeypatch: pytest.MonkeyPatch): - monkeypatch.setattr( - General, "partition", lambda *args, **kwargs: FakeResponse(status_code=200) # type: ignore - ) - +def test_partition_via_api_from_file_raises_without_filename(): with open(example_doc_path("eml/fake-email.eml"), "rb") as f, pytest.raises(ValueError): partition_via_api(file=f) -def test_partition_via_api_raises_with_bad_response(monkeypatch: pytest.MonkeyPatch): - monkeypatch.setattr( - General, "partition", lambda *args, **kwargs: FakeResponse(status_code=500) # type: ignore +def test_partition_via_api_raises_with_bad_response(request: FixtureRequest): + partition_mock_ = method_mock( + request, General, "partition", return_value=FakeResponse(status_code=500) ) with pytest.raises(ValueError): partition_via_api(filename=example_doc_path("eml/fake-email.eml")) + partition_mock_.assert_called_once() @pytest.mark.skipif(skip_outside_ci, reason="Skipping test run outside of CI") @@ -100,12 +109,16 @@ def test_partition_via_api_with_no_strategy(): filename=example_doc_path("layout-parser-paper-fast.pdf"), strategy="auto", api_key=get_api_key(), + # The url has changed since the 06/24 API release while the sdk defaults to the old url + api_url="https://api.unstructuredapp.io/general/v0/general", skip_infer_table_types=["pdf"], ) elements_hi_res = partition_via_api( filename=example_doc_path("layout-parser-paper-fast.pdf"), strategy="hi_res", api_key=get_api_key(), + # The url has changed since the 06/24 API release while the sdk defaults to the old url + api_url="https://api.unstructuredapp.io/general/v0/general", skip_infer_table_types=["pdf"], ) @@ -125,6 +138,8 @@ def test_partition_via_api_with_image_hi_res_strategy_includes_coordinates(): strategy="hi_res", coordinates="true", api_key=get_api_key(), + # The url has changed since the 06/24 API release while the sdk defaults to the old url + api_url="https://api.unstructuredapp.io/general/v0/general", ) assert elements[0].metadata.coordinates is not None @@ -137,6 +152,8 @@ def test_partition_via_api_valid_request_data_kwargs(): filename=example_doc_path("layout-parser-paper-fast.pdf"), strategy="fast", api_key=get_api_key(), + # The url has changed since the 06/24 API release while the sdk defaults to the old url + api_url="https://api.unstructuredapp.io/general/v0/general", ) assert isinstance(elements, list) @@ -168,45 +185,51 @@ def test_partition_via_api_image_block_extraction(): # partition_via_api(filename=filename, strategy="not_a_strategy") -def test_partition_multiple_via_api_with_single_filename(monkeypatch: pytest.MonkeyPatch): - monkeypatch.setattr( - requests, "post", lambda *args, **kwargs: FakeResponse(status_code=200) # type: ignore +def test_partition_multiple_via_api_with_single_filename(request: FixtureRequest): + partition_mock_ = method_mock( + request, requests, "post", return_value=FakeResponse(status_code=200) ) - filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "eml/fake-email.eml") + filename = example_doc_path("eml/fake-email.eml") elements = partition_multiple_via_api(filenames=[filename]) + + partition_mock_.assert_called_once_with( + "https://api.unstructured.io/general/v0/general", + headers={"ACCEPT": "application/json", "UNSTRUCTURED-API-KEY": ANY}, + data={}, + files=[("files", (example_doc_path("eml/fake-email.eml"), ANY, None))], + ) assert elements[0][0] == NarrativeText("This is a test email to use for unit tests.") assert elements[0][0].metadata.filetype == "message/rfc822" -def test_partition_multiple_via_api_from_filenames(monkeypatch: pytest.MonkeyPatch): - monkeypatch.setattr( - requests, - "post", - lambda *args, **kwargs: FakeMultipleResponse(status_code=200), # type: ignore +def test_partition_multiple_via_api_from_filenames(request: FixtureRequest): + partition_mock_ = method_mock( + request, requests, "post", return_value=FakeMultipleResponse(status_code=200) ) - filenames = [ - os.path.join(DIRECTORY, "..", "..", "example-docs", "eml/fake-email.eml"), - os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"), - ] + filenames = [example_doc_path("eml/fake-email.eml"), example_doc_path("fake.docx")] elements = partition_multiple_via_api(filenames=filenames) + partition_mock_.assert_called_once_with( + "https://api.unstructured.io/general/v0/general", + headers={"ACCEPT": "application/json", "UNSTRUCTURED-API-KEY": ANY}, + data={}, + files=[ + ("files", (example_doc_path("eml/fake-email.eml"), ANY, None)), + ("files", (example_doc_path("fake.docx"), ANY, None)), + ], + ) assert len(elements) == 2 assert elements[0][0] == NarrativeText("This is a test email to use for unit tests.") assert elements[0][0].metadata.filetype == "message/rfc822" -def test_partition_multiple_via_api_from_files(monkeypatch: pytest.MonkeyPatch): - monkeypatch.setattr( - requests, - "post", - lambda *args, **kwargs: FakeMultipleResponse(status_code=200), # type: ignore +def test_partition_multiple_via_api_from_files(request: FixtureRequest): + partition_mock_ = method_mock( + request, requests, "post", return_value=FakeMultipleResponse(status_code=200) ) - filenames = [ - os.path.join(DIRECTORY, "..", "..", "example-docs", "eml/fake-email.eml"), - os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"), - ] + filenames = [example_doc_path("eml/fake-email.eml"), example_doc_path("fake.docx")] with contextlib.ExitStack() as stack: files = [stack.enter_context(open(filename, "rb")) for filename in filenames] @@ -214,23 +237,28 @@ def test_partition_multiple_via_api_from_files(monkeypatch: pytest.MonkeyPatch): files=files, metadata_filenames=filenames, ) + + partition_mock_.assert_called_once_with( + "https://api.unstructured.io/general/v0/general", + headers={"ACCEPT": "application/json", "UNSTRUCTURED-API-KEY": ANY}, + data={}, + files=[ + ("files", (example_doc_path("eml/fake-email.eml"), ANY, None)), + ("files", (example_doc_path("fake.docx"), ANY, None)), + ], + ) assert len(elements) == 2 assert elements[0][0] == NarrativeText("This is a test email to use for unit tests.") assert elements[0][0].metadata.filetype == "message/rfc822" def test_partition_multiple_via_api_warns_with_file_filename( - monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture + caplog: pytest.LogCaptureFixture, request: FixtureRequest ): - monkeypatch.setattr( - requests, - "post", - lambda *args, **kwargs: FakeMultipleResponse(status_code=200), # type: ignore + partition_mock_ = method_mock( + request, requests, "post", return_value=FakeMultipleResponse(status_code=200) ) - filenames = [ - os.path.join(DIRECTORY, "..", "..", "example-docs", "eml/fake-email.eml"), - os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"), - ] + filenames = [example_doc_path("eml/fake-email.eml"), example_doc_path("fake.docx")] with contextlib.ExitStack() as stack: files = [stack.enter_context(open(filename, "rb")) for filename in filenames] @@ -238,21 +266,22 @@ def test_partition_multiple_via_api_warns_with_file_filename( files=files, file_filenames=filenames, ) + + partition_mock_.assert_called_once_with( + "https://api.unstructured.io/general/v0/general", + headers={"ACCEPT": "application/json", "UNSTRUCTURED-API-KEY": ANY}, + data={}, + files=[ + ("files", (example_doc_path("eml/fake-email.eml"), ANY, None)), + ("files", (example_doc_path("fake.docx"), ANY, None)), + ], + ) assert "WARNING" in caplog.text assert "The file_filenames kwarg will be deprecated" in caplog.text -def test_partition_multiple_via_api_warns_with_file_and_metadata_filename( - monkeypatch: pytest.MonkeyPatch, -): - monkeypatch.setattr( - General, "partition", lambda *args, **kwargs: FakeResponse(status_code=200) # type: ignore - ) - - filenames = [ - os.path.join(DIRECTORY, "..", "..", "example-docs", "eml/fake-email.eml"), - os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"), - ] +def test_partition_multiple_via_api_raises_with_file_and_metadata_filename(): + filenames = [example_doc_path("eml/fake-email.eml"), example_doc_path("fake.docx")] with contextlib.ExitStack() as stack: files = [stack.enter_context(open(filename, "rb")) for filename in filenames] @@ -264,29 +293,27 @@ def test_partition_multiple_via_api_warns_with_file_and_metadata_filename( ) -def test_partition_multiple_via_api_raises_with_bad_response(monkeypatch: pytest.MonkeyPatch): - monkeypatch.setattr( - General, "partition", lambda *args, **kwargs: FakeResponse(status_code=500) # type: ignore +def test_partition_multiple_via_api_raises_with_bad_response(request: FixtureRequest): + partition_mock_ = method_mock( + request, requests, "post", return_value=FakeMultipleResponse(status_code=500) ) - filenames = [ - os.path.join(DIRECTORY, "..", "..", "example-docs", "eml/fake-email.eml"), - os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"), - ] + filenames = [example_doc_path("eml/fake-email.eml"), example_doc_path("fake.docx")] with pytest.raises(ValueError): partition_multiple_via_api(filenames=filenames) + partition_mock_.assert_called_once_with( + "https://api.unstructured.io/general/v0/general", + headers={"ACCEPT": "application/json", "UNSTRUCTURED-API-KEY": ANY}, + data={}, + files=[ + ("files", (example_doc_path("eml/fake-email.eml"), ANY, None)), + ("files", (example_doc_path("fake.docx"), ANY, None)), + ], + ) -def test_partition_multiple_via_api_raises_with_content_types_size_mismatch( - monkeypatch: pytest.MonkeyPatch, -): - monkeypatch.setattr( - General, "partition", lambda *args, **kwargs: FakeResponse(status_code=500) # type: ignore - ) - filenames = [ - os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-email.eml"), - os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"), - ] +def test_partition_multiple_via_api_raises_with_content_types_size_mismatch(): + filenames = [example_doc_path("eml/fake-email.eml"), example_doc_path("fake.docx")] with pytest.raises(ValueError): partition_multiple_via_api( @@ -295,19 +322,8 @@ def test_partition_multiple_via_api_raises_with_content_types_size_mismatch( ) -def test_partition_multiple_via_api_from_files_raises_with_size_mismatch( - monkeypatch: pytest.MonkeyPatch, -): - monkeypatch.setattr( - requests, - "post", - lambda *args, **kwargs: FakeMultipleResponse(status_code=200), # type: ignore - ) - - filenames = [ - os.path.join(DIRECTORY, "..", "..", "example-docs", "eml/fake-email.eml"), - os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"), - ] +def test_partition_multiple_via_api_from_files_raises_with_size_mismatch(): + filenames = [example_doc_path("eml/fake-email.eml"), example_doc_path("fake.docx")] with contextlib.ExitStack() as stack: files = [stack.enter_context(open(filename, "rb")) for filename in filenames] @@ -319,19 +335,8 @@ def test_partition_multiple_via_api_from_files_raises_with_size_mismatch( ) -def test_partition_multiple_via_api_from_files_raises_without_filenames( - monkeypatch: pytest.MonkeyPatch, -): - monkeypatch.setattr( - requests, - "post", - lambda *args, **kwargs: FakeMultipleResponse(status_code=200), # type: ignore - ) - - filenames = [ - os.path.join(DIRECTORY, "..", "..", "example-docs", "eml/fake-email.eml"), - os.path.join(DIRECTORY, "..", "..", "example-docs", "fake.docx"), - ] +def test_partition_multiple_via_api_from_files_raises_without_filenames(): + filenames = [example_doc_path("eml/fake-email.eml"), example_doc_path("fake.docx")] with contextlib.ExitStack() as stack: files = [stack.enter_context(open(filename, "rb")) for filename in filenames] @@ -352,14 +357,16 @@ def get_api_key(): @pytest.mark.skipif(skip_not_on_main, reason="Skipping test run outside of main branch") def test_partition_multiple_via_api_valid_request_data_kwargs(): filenames = [ - os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.pdf"), - os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.jpg"), + example_doc_path("layout-parser-paper-fast.pdf"), + example_doc_path("layout-parser-paper-fast.jpg"), ] elements = partition_multiple_via_api( filenames=filenames, strategy="auto", api_key=get_api_key(), + # The url has changed since the 06/24 API release while the sdk defaults to the old url + api_url="https://api.unstructuredapp.io/general/v0/general", ) assert isinstance(elements, list) @@ -367,14 +374,16 @@ def test_partition_multiple_via_api_valid_request_data_kwargs(): @pytest.mark.skipif(skip_outside_ci, reason="Skipping test run outside of CI") def test_partition_multiple_via_api_invalid_request_data_kwargs(): filenames = [ - os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.pdf"), - os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.jpg"), + example_doc_path("layout-parser-paper-fast.pdf"), + example_doc_path("layout-parser-paper-fast.jpg"), ] with pytest.raises(ValueError): partition_multiple_via_api( filenames=filenames, strategy="not_a_strategy", api_key=get_api_key(), + # The url has changed since the 06/24 API release while the sdk defaults to the old url + api_url="https://api.unstructuredapp.io/general/v0/general", ) @@ -470,10 +479,10 @@ def text(self): @pytest.fixture() -def expected_call(): +def expected_call_(): with open(example_doc_path("eml/fake-email.eml"), "rb") as f: file_bytes = f.read() - return ( + return [ ANY, PartitionParameters( files=shared.Files( @@ -507,4 +516,4 @@ def expected_call(): unique_element_ids=False, xml_keep_tags=False, ), - ) + ] From 6ab2a5c570787a4f6af133d774fbe64cc23663e0 Mon Sep 17 00:00:00 2001 From: John <43506685+Coniferish@users.noreply.github.com> Date: Mon, 1 Jul 2024 10:52:48 -0500 Subject: [PATCH 11/13] changelog and version --- CHANGELOG.md | 4 +++- unstructured/__version__.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6ea3bf18d9..2be2dbd04b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,8 @@ -## 0.14.10-dev0 +## 0.14.10-dev1 ### Enhancements +* **Update unstructured-client dependency** Change unstructured-client dependency pin back to + greater than min version and updated tests that were failing given the update. ### Features diff --git a/unstructured/__version__.py b/unstructured/__version__.py index abe782aca5..1e437bbfd4 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.10-dev0" # pragma: no cover +__version__ = "0.14.10-dev1" # pragma: no cover From eca35a9e86f9f079b88f4b4919ce01b9f9d7c00f Mon Sep 17 00:00:00 2001 From: John <43506685+Coniferish@users.noreply.github.com> Date: Mon, 1 Jul 2024 11:50:57 -0500 Subject: [PATCH 12/13] update failing ingest chunking test --- .../unit/pipeline/reformat/test_chunking.py | 7 +++++-- unstructured/ingest/pipeline/reformat/chunking.py | 1 - 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/test_unstructured_ingest/unit/pipeline/reformat/test_chunking.py b/test_unstructured_ingest/unit/pipeline/reformat/test_chunking.py index 0c57c5dd5c..433ee810d5 100644 --- a/test_unstructured_ingest/unit/pipeline/reformat/test_chunking.py +++ b/test_unstructured_ingest/unit/pipeline/reformat/test_chunking.py @@ -132,12 +132,15 @@ def it_chunks_remotely(self, _ingest_docs_map_: Mock, _partition_via_api_: Mock) api_key="aaaaaaaaaaaaaaaaaaaaa", api_url="https://api.unstructured.io/general/v0/general", chunking_strategy="by_similarity", + # (jennings) the sdk uses combine_under_n_chars but the ChunkingConfig param is + # combine_text_under_n_chars combine_under_n_chars=None, + include_orig_elements=None, max_characters=None, multipage_sections=None, new_after_n_chars=None, - # overlap=None, - # overlap_all=None, + overlap=None, + overlap_all=None, ) # -- fixtures -------------------------------------------------------------------------------- diff --git a/unstructured/ingest/pipeline/reformat/chunking.py b/unstructured/ingest/pipeline/reformat/chunking.py index ac2a5cabea..b061cfa1c2 100644 --- a/unstructured/ingest/pipeline/reformat/chunking.py +++ b/unstructured/ingest/pipeline/reformat/chunking.py @@ -113,7 +113,6 @@ def chunk(self, elements_json_file: str) -> Optional[list[Element]]: api_url=self.partition_config.partition_endpoint, # type: ignore chunking_strategy=self.chunking_config.chunking_strategy, combine_under_n_chars=self.chunking_config.combine_text_under_n_chars, - combine_text_under_n_chars=self.chunking_config.combine_text_under_n_chars, include_orig_elements=self.chunking_config.include_orig_elements, max_characters=self.chunking_config.max_characters, multipage_sections=self.chunking_config.multipage_sections, From 4dc1f85c63251652fa95965d794b634b729fb264 Mon Sep 17 00:00:00 2001 From: John <43506685+Coniferish@users.noreply.github.com> Date: Tue, 2 Jul 2024 10:16:45 -0500 Subject: [PATCH 13/13] make pip-compile --- requirements/base.txt | 6 +++--- requirements/dev.txt | 2 +- requirements/extra-pdf-image.txt | 4 ++-- requirements/ingest/airtable.txt | 4 ++-- requirements/ingest/chroma.txt | 6 +++--- requirements/ingest/databricks-volumes.txt | 2 +- requirements/ingest/embed-aws-bedrock.txt | 6 +++--- requirements/ingest/embed-huggingface.txt | 6 +++--- requirements/ingest/embed-octoai.txt | 6 +++--- requirements/ingest/embed-openai.txt | 8 ++++---- requirements/ingest/embed-vertexai.txt | 8 ++++---- requirements/ingest/embed-voyageai.txt | 6 +++--- requirements/ingest/gcs.txt | 2 +- requirements/ingest/google-drive.txt | 2 +- requirements/ingest/qdrant.txt | 6 +++--- requirements/ingest/weaviate.txt | 4 ++-- requirements/test.txt | 4 ++-- 17 files changed, 41 insertions(+), 41 deletions(-) diff --git a/requirements/base.txt b/requirements/base.txt index 28ae421068..6df8e864d4 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -80,10 +80,10 @@ packaging==23.2 # -c ././deps/constraints.txt # marshmallow # unstructured-client -pypdf==4.2.0 - # via unstructured-client psutil==6.0.0 # via -r ./base.in +pypdf==4.2.0 + # via unstructured-client python-dateutil==2.9.0.post0 # via unstructured-client python-iso639==2024.4.27 @@ -129,7 +129,7 @@ typing-inspect==0.9.0 # via # dataclasses-json # unstructured-client -unstructured-client==0.23.7 +unstructured-client==0.23.8 # via # -c ././deps/constraints.txt # -r ./base.in diff --git a/requirements/dev.txt b/requirements/dev.txt index 3402c52817..4dd8fa7e9c 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -119,7 +119,7 @@ importlib-metadata==7.1.0 # jupyterlab # jupyterlab-server # nbconvert -ipykernel==6.29.4 +ipykernel==6.29.5 # via # jupyter # jupyter-console diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 43b5c3f0ec..aa20dd37cd 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -48,7 +48,7 @@ fsspec==2024.5.0 # torch google-api-core[grpc]==2.19.1 # via google-cloud-vision -google-auth==2.30.0 +google-auth==2.31.0 # via # google-api-core # google-cloud-vision @@ -164,7 +164,7 @@ pillow==10.4.0 # pytesseract # torchvision # unstructured-pytesseract -pillow-heif==0.16.0 +pillow-heif==0.17.0 # via -r ./extra-pdf-image.in portalocker==2.10.0 # via iopath diff --git a/requirements/ingest/airtable.txt b/requirements/ingest/airtable.txt index 75d18f8e75..f0b71114fe 100644 --- a/requirements/ingest/airtable.txt +++ b/requirements/ingest/airtable.txt @@ -23,9 +23,9 @@ inflection==0.5.1 # via pyairtable pyairtable==2.3.3 # via -r ./ingest/airtable.in -pydantic==2.7.4 +pydantic==2.8.0 # via pyairtable -pydantic-core==2.18.4 +pydantic-core==2.20.0 # via pydantic requests==2.32.3 # via diff --git a/requirements/ingest/chroma.txt b/requirements/ingest/chroma.txt index e7bdc1d5e6..82143ed27b 100644 --- a/requirements/ingest/chroma.txt +++ b/requirements/ingest/chroma.txt @@ -66,7 +66,7 @@ fsspec==2024.5.0 # via # -c ./ingest/../deps/constraints.txt # huggingface-hub -google-auth==2.30.0 +google-auth==2.31.0 # via kubernetes googleapis-common-protos==1.63.2 # via opentelemetry-exporter-otlp-proto-grpc @@ -190,11 +190,11 @@ pyasn1==0.6.0 # rsa pyasn1-modules==0.4.0 # via google-auth -pydantic==2.7.4 +pydantic==2.8.0 # via # chromadb # fastapi -pydantic-core==2.18.4 +pydantic-core==2.20.0 # via pydantic pypika==0.48.9 # via chromadb diff --git a/requirements/ingest/databricks-volumes.txt b/requirements/ingest/databricks-volumes.txt index b21971f275..bc1cdd9353 100644 --- a/requirements/ingest/databricks-volumes.txt +++ b/requirements/ingest/databricks-volumes.txt @@ -17,7 +17,7 @@ charset-normalizer==3.3.2 # requests databricks-sdk==0.29.0 # via -r ./ingest/databricks-volumes.in -google-auth==2.30.0 +google-auth==2.31.0 # via databricks-sdk idna==3.7 # via diff --git a/requirements/ingest/embed-aws-bedrock.txt b/requirements/ingest/embed-aws-bedrock.txt index 171d1cccdc..1885651c27 100644 --- a/requirements/ingest/embed-aws-bedrock.txt +++ b/requirements/ingest/embed-aws-bedrock.txt @@ -68,7 +68,7 @@ langchain-core==0.2.10 # langchain-text-splitters langchain-text-splitters==0.2.2 # via langchain -langsmith==0.1.82 +langsmith==0.1.83 # via # langchain # langchain-community @@ -98,12 +98,12 @@ packaging==23.2 # -c ./ingest/../deps/constraints.txt # langchain-core # marshmallow -pydantic==2.7.4 +pydantic==2.8.0 # via # langchain # langchain-core # langsmith -pydantic-core==2.18.4 +pydantic-core==2.20.0 # via pydantic python-dateutil==2.9.0.post0 # via diff --git a/requirements/ingest/embed-huggingface.txt b/requirements/ingest/embed-huggingface.txt index c2031fefae..c191bff668 100644 --- a/requirements/ingest/embed-huggingface.txt +++ b/requirements/ingest/embed-huggingface.txt @@ -80,7 +80,7 @@ langchain-core==0.2.10 # langchain-text-splitters langchain-text-splitters==0.2.2 # via langchain -langsmith==0.1.82 +langsmith==0.1.83 # via # langchain # langchain-community @@ -124,12 +124,12 @@ packaging==23.2 # transformers pillow==10.4.0 # via sentence-transformers -pydantic==2.7.4 +pydantic==2.8.0 # via # langchain # langchain-core # langsmith -pydantic-core==2.18.4 +pydantic-core==2.20.0 # via pydantic pyyaml==6.0.1 # via diff --git a/requirements/ingest/embed-octoai.txt b/requirements/ingest/embed-octoai.txt index 089ada41ce..a114f01488 100644 --- a/requirements/ingest/embed-octoai.txt +++ b/requirements/ingest/embed-octoai.txt @@ -47,11 +47,11 @@ idna==3.7 # anyio # httpx # requests -openai==1.35.7 +openai==1.35.8 # via -r ./ingest/embed-octoai.in -pydantic==2.7.4 +pydantic==2.8.0 # via openai -pydantic-core==2.18.4 +pydantic-core==2.20.0 # via pydantic regex==2024.5.15 # via diff --git a/requirements/ingest/embed-openai.txt b/requirements/ingest/embed-openai.txt index 6201781309..f887b3a987 100644 --- a/requirements/ingest/embed-openai.txt +++ b/requirements/ingest/embed-openai.txt @@ -85,7 +85,7 @@ langchain-core==0.2.10 # langchain-text-splitters langchain-text-splitters==0.2.2 # via langchain -langsmith==0.1.82 +langsmith==0.1.83 # via # langchain # langchain-community @@ -107,7 +107,7 @@ numpy==1.26.4 # -c ./ingest/../base.txt # langchain # langchain-community -openai==1.35.7 +openai==1.35.8 # via -r ./ingest/embed-openai.in orjson==3.10.5 # via langsmith @@ -117,13 +117,13 @@ packaging==23.2 # -c ./ingest/../deps/constraints.txt # langchain-core # marshmallow -pydantic==2.7.4 +pydantic==2.8.0 # via # langchain # langchain-core # langsmith # openai -pydantic-core==2.18.4 +pydantic-core==2.20.0 # via pydantic pyyaml==6.0.1 # via diff --git a/requirements/ingest/embed-vertexai.txt b/requirements/ingest/embed-vertexai.txt index d229e0599b..058b5894ae 100644 --- a/requirements/ingest/embed-vertexai.txt +++ b/requirements/ingest/embed-vertexai.txt @@ -46,7 +46,7 @@ google-api-core[grpc]==2.19.1 # google-cloud-core # google-cloud-resource-manager # google-cloud-storage -google-auth==2.30.0 +google-auth==2.31.0 # via # google-api-core # google-cloud-aiplatform @@ -118,7 +118,7 @@ langchain-google-vertexai==1.0.6 # via -r ./ingest/embed-vertexai.in langchain-text-splitters==0.2.2 # via langchain -langsmith==0.1.82 +langsmith==0.1.83 # via # langchain # langchain-community @@ -172,13 +172,13 @@ pyasn1==0.6.0 # rsa pyasn1-modules==0.4.0 # via google-auth -pydantic==2.7.4 +pydantic==2.8.0 # via # google-cloud-aiplatform # langchain # langchain-core # langsmith -pydantic-core==2.18.4 +pydantic-core==2.20.0 # via pydantic python-dateutil==2.9.0.post0 # via diff --git a/requirements/ingest/embed-voyageai.txt b/requirements/ingest/embed-voyageai.txt index 0c1e66ea1b..44c33e36fa 100644 --- a/requirements/ingest/embed-voyageai.txt +++ b/requirements/ingest/embed-voyageai.txt @@ -53,7 +53,7 @@ langchain-text-splitters==0.2.2 # via langchain langchain-voyageai==0.1.1 # via -r ./ingest/embed-voyageai.in -langsmith==0.1.82 +langsmith==0.1.83 # via # langchain # langchain-core @@ -73,12 +73,12 @@ packaging==23.2 # -c ./ingest/../base.txt # -c ./ingest/../deps/constraints.txt # langchain-core -pydantic==2.7.4 +pydantic==2.8.0 # via # langchain # langchain-core # langsmith -pydantic-core==2.18.4 +pydantic-core==2.20.0 # via pydantic pyyaml==6.0.1 # via diff --git a/requirements/ingest/gcs.txt b/requirements/ingest/gcs.txt index d48457e66d..3d257f2d7c 100644 --- a/requirements/ingest/gcs.txt +++ b/requirements/ingest/gcs.txt @@ -46,7 +46,7 @@ google-api-core==2.19.1 # via # google-cloud-core # google-cloud-storage -google-auth==2.30.0 +google-auth==2.31.0 # via # gcsfs # google-api-core diff --git a/requirements/ingest/google-drive.txt b/requirements/ingest/google-drive.txt index ce87dec72c..207a2d0b6a 100644 --- a/requirements/ingest/google-drive.txt +++ b/requirements/ingest/google-drive.txt @@ -19,7 +19,7 @@ google-api-core==2.19.1 # via google-api-python-client google-api-python-client==2.135.0 # via -r ./ingest/google-drive.in -google-auth==2.30.0 +google-auth==2.31.0 # via # google-api-core # google-api-python-client diff --git a/requirements/ingest/qdrant.txt b/requirements/ingest/qdrant.txt index a038d352ba..a18ec1cb8d 100644 --- a/requirements/ingest/qdrant.txt +++ b/requirements/ingest/qdrant.txt @@ -60,11 +60,11 @@ protobuf==4.23.4 # via # -c ./ingest/../deps/constraints.txt # grpcio-tools -pydantic==2.7.4 +pydantic==2.8.0 # via qdrant-client -pydantic-core==2.18.4 +pydantic-core==2.20.0 # via pydantic -qdrant-client==1.9.2 +qdrant-client==1.10.0 # via -r ./ingest/qdrant.in sniffio==1.3.1 # via diff --git a/requirements/ingest/weaviate.txt b/requirements/ingest/weaviate.txt index 47e12cc510..afa061d26b 100644 --- a/requirements/ingest/weaviate.txt +++ b/requirements/ingest/weaviate.txt @@ -66,9 +66,9 @@ protobuf==4.23.4 # grpcio-tools pycparser==2.22 # via cffi -pydantic==2.7.4 +pydantic==2.8.0 # via weaviate-client -pydantic-core==2.18.4 +pydantic-core==2.20.0 # via pydantic requests==2.32.3 # via diff --git a/requirements/test.txt b/requirements/test.txt index 745a4003dc..4aa1676de7 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -98,11 +98,11 @@ pycodestyle==2.12.0 # via # flake8 # flake8-print -pydantic==2.7.4 +pydantic==2.8.0 # via # -r ./test.in # label-studio-sdk -pydantic-core==2.18.4 +pydantic-core==2.20.0 # via pydantic pyflakes==3.2.0 # via