From 305247b4e1a67b218c624357ad8427dd7d19bf0c Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Sat, 20 Apr 2024 20:08:20 -0700 Subject: [PATCH 1/2] chore: bump unstructured-inference pin (#2913) **Summary** Update dependencies to use the new version of `unstructured-inference` released yesterday. Remedy a few small problems with `make pip-compile` that stood in the way. --- CHANGELOG.md | 2 +- requirements/Makefile | 4 ++-- requirements/base.txt | 2 +- requirements/deps/constraints.txt | 3 --- requirements/dev.txt | 14 ++++++------- requirements/extra-paddleocr.txt | 6 +++--- requirements/extra-pdf-image.in | 2 +- requirements/extra-pdf-image.txt | 19 ++++++++---------- requirements/huggingface.txt | 12 +++++------- requirements/ingest/astra.txt | 8 +++++--- requirements/ingest/azure.txt | 8 +++++--- requirements/ingest/box.txt | 4 +++- requirements/ingest/chroma.txt | 20 +++++++++---------- requirements/ingest/clarifai.txt | 4 ++-- requirements/ingest/delta-table.txt | 2 +- requirements/ingest/discord.txt | 2 +- requirements/ingest/embed-aws-bedrock.txt | 10 +++++----- requirements/ingest/embed-huggingface.txt | 24 +++++++++++------------ requirements/ingest/embed-octoai.txt | 6 +++--- requirements/ingest/embed-openai.txt | 16 +++++++-------- requirements/ingest/embed-vertexai.txt | 23 +++++++++++----------- requirements/ingest/gcs.txt | 2 +- requirements/ingest/github.txt | 4 +++- requirements/ingest/google-drive.txt | 2 +- requirements/ingest/notion.txt | 2 +- requirements/ingest/onedrive.txt | 4 +++- requirements/ingest/outlook.txt | 4 +++- requirements/ingest/qdrant.txt | 10 ++++++---- requirements/ingest/s3.txt | 2 +- requirements/ingest/salesforce.txt | 22 +++++++-------------- requirements/ingest/sharepoint.txt | 4 +++- requirements/ingest/weaviate.txt | 8 ++++---- requirements/test.txt | 10 +++++----- unstructured/__version__.py | 2 +- 34 files changed, 133 insertions(+), 134 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 
f3080a3c43..ab0f43591b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.13.3-dev9 +## 0.13.3 ### Enhancements diff --git a/requirements/Makefile b/requirements/Makefile index 2a6eee56d0..9c4175401f 100644 --- a/requirements/Makefile +++ b/requirements/Makefile @@ -24,11 +24,11 @@ compile-base: .PHONY: compile-all-base compile-all-base: compile-base compile-test compile-dev - @$(foreach file,$(BASE_REQUIREMENTS),echo "compiling: $(file)" && pip-compile --upgrade $(file);) + @$(foreach file,$(BASE_REQUIREMENTS),echo -e "\n\ncompiling: $(file)" && pip-compile --no-strip-extras --upgrade $(file) || exit;) .PHONY: compile-ingest compile-ingest: - @$(foreach file,$(INGEST_REQUIREMENTS),echo "compiling: $(file)" && pip-compile --upgrade $(file);) + @$(foreach file,$(INGEST_REQUIREMENTS),echo -e "\n\ncompiling: $(file)" && pip-compile --no-strip-extras --upgrade $(file) || exit;) .PHONY: clean clean: clean-base clean-ingest diff --git a/requirements/base.txt b/requirements/base.txt index 727efcbf8e..87ae0d05fa 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -69,7 +69,7 @@ python-magic==0.4.27 # via -r ./base.in rapidfuzz==3.8.1 # via -r ./base.in -regex==2023.12.25 +regex==2024.4.16 # via nltk requests==2.31.0 # via diff --git a/requirements/deps/constraints.txt b/requirements/deps/constraints.txt index 18df4c7460..de28fc6357 100644 --- a/requirements/deps/constraints.txt +++ b/requirements/deps/constraints.txt @@ -22,8 +22,6 @@ Office365-REST-Python-Client<2.4.3 # unstructured-inference to be upgraded when unstructured library is upgraded # https://github.com/Unstructured-IO/unstructured/issues/1458 # unstructured-inference -# unable to build wheel for arm on 0.3.3+ -safetensors<=0.3.2 # use the known compatible version of weaviate and unstructured.pytesseract unstructured.pytesseract>=0.3.12 weaviate-client>3.25.0 @@ -38,7 +36,6 @@ torch>2 # pinned in unstructured paddleocr opencv-python==4.8.0.76 opencv-contrib-python==4.8.0.76 
-onnxruntime==1.15.1 platformdirs==3.10.0 # TODO: Constraint due to langchain, remove when that gets updated: diff --git a/requirements/dev.txt b/requirements/dev.txt index 4fe5b454bc..477172e78f 100644 --- a/requirements/dev.txt +++ b/requirements/dev.txt @@ -73,7 +73,7 @@ defusedxml==0.7.1 # via nbconvert distlib==0.3.8 # via virtualenv -exceptiongroup==1.2.0 +exceptiongroup==1.2.1 # via # -c ./test.txt # anyio @@ -91,7 +91,7 @@ httpcore==1.0.5 # via httpx httpx==0.27.0 # via jupyterlab -identify==2.5.35 +identify==2.5.36 # via pre-commit idna==3.7 # via @@ -195,7 +195,7 @@ markupsafe==2.1.5 # via # jinja2 # nbconvert -matplotlib-inline==0.1.6 +matplotlib-inline==0.1.7 # via # ipykernel # ipython @@ -216,7 +216,7 @@ nest-asyncio==1.6.0 # via ipykernel nodeenv==1.8.0 # via pre-commit -notebook==7.1.2 +notebook==7.1.3 # via jupyter notebook-shim==0.2.4 # via @@ -294,7 +294,7 @@ pyyaml==6.0.1 # -c ./test.txt # jupyter-events # pre-commit -pyzmq==25.1.2 +pyzmq==26.0.2 # via # ipykernel # jupyter-client @@ -368,7 +368,7 @@ tornado==6.4 # jupyterlab # notebook # terminado -traitlets==5.14.2 +traitlets==5.14.3 # via # comm # ipykernel @@ -401,7 +401,7 @@ urllib3==1.26.18 # -c ./base.txt # -c ./test.txt # requests -virtualenv==20.25.1 +virtualenv==20.25.3 # via pre-commit wcwidth==0.2.13 # via prompt-toolkit diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt index d93facdad5..114316e73f 100644 --- a/requirements/extra-paddleocr.txt +++ b/requirements/extra-paddleocr.txt @@ -63,7 +63,7 @@ importlib-metadata==7.1.0 # via flask importlib-resources==6.4.0 # via matplotlib -itsdangerous==2.1.2 +itsdangerous==2.2.0 # via flask jinja2==3.1.3 # via @@ -188,7 +188,7 @@ scipy==1.10.1 # -c ././deps/constraints.txt # imgaug # scikit-image -shapely==2.0.3 +shapely==2.0.4 # via # imgaug # unstructured-paddleocr @@ -200,7 +200,7 @@ six==1.16.0 # imgaug # python-dateutil # visualdl -tifffile==2024.2.12 +tifffile==2024.4.18 # via scikit-image tqdm==4.66.2 # 
via diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in index 208569c43e..f6e003d1a3 100644 --- a/requirements/extra-pdf-image.in +++ b/requirements/extra-pdf-image.in @@ -9,7 +9,7 @@ pillow_heif pypdf # Do not move to constraints.in, otherwise unstructured-inference will not be upgraded # when unstructured library is. -unstructured-inference==0.7.25 +unstructured-inference==0.7.27 # unstructured fork of pytesseract that provides an interface to allow for multiple output formats # from one tesseract call unstructured.pytesseract>=0.3.12 diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt index 53dc5e40d5..2d902f0194 100644 --- a/requirements/extra-pdf-image.txt +++ b/requirements/extra-pdf-image.txt @@ -100,10 +100,8 @@ onnx==1.16.0 # via # -r ./extra-pdf-image.in # unstructured-inference -onnxruntime==1.15.1 - # via - # -c ././deps/constraints.txt - # unstructured-inference +onnxruntime==1.17.3 + # via unstructured-inference opencv-python==4.8.0.76 # via # -c ././deps/constraints.txt @@ -132,7 +130,7 @@ pdfminer-six==20231228 # pdfplumber pdfplumber==0.11.0 # via layoutparser -pikepdf==8.15.0 +pikepdf==8.15.1 # via -r ./extra-pdf-image.in pillow==10.3.0 # via @@ -190,7 +188,7 @@ rapidfuzz==3.8.1 # via # -c ./base.txt # unstructured-inference -regex==2023.12.25 +regex==2024.4.16 # via # -c ./base.txt # transformers @@ -199,9 +197,8 @@ requests==2.31.0 # -c ./base.txt # huggingface-hub # transformers -safetensors==0.3.2 +safetensors==0.4.3 # via - # -c ././deps/constraints.txt # timm # transformers scipy==1.10.1 @@ -218,7 +215,7 @@ sympy==1.12 # torch timm==0.9.16 # via effdet -tokenizers==0.15.2 +tokenizers==0.19.1 # via transformers torch==2.2.2 # via @@ -238,7 +235,7 @@ tqdm==4.66.2 # huggingface-hub # iopath # transformers -transformers==4.37.1 +transformers==4.40.0 # via unstructured-inference typing-extensions==4.11.0 # via @@ -249,7 +246,7 @@ typing-extensions==4.11.0 # torch tzdata==2024.1 # via pandas 
-unstructured-inference==0.7.25 +unstructured-inference==0.7.27 # via -r ./extra-pdf-image.in unstructured-pytesseract==0.3.12 # via diff --git a/requirements/huggingface.txt b/requirements/huggingface.txt index 9b58d6197f..73e03b0337 100644 --- a/requirements/huggingface.txt +++ b/requirements/huggingface.txt @@ -64,7 +64,7 @@ pyyaml==6.0.1 # via # huggingface-hub # transformers -regex==2023.12.25 +regex==2024.4.16 # via # -c ./base.txt # sacremoses @@ -76,10 +76,8 @@ requests==2.31.0 # transformers sacremoses==0.1.1 # via -r ./huggingface.in -safetensors==0.3.2 - # via - # -c ././deps/constraints.txt - # transformers +safetensors==0.4.3 + # via transformers sentencepiece==0.2.0 # via -r ./huggingface.in six==1.16.0 @@ -88,7 +86,7 @@ six==1.16.0 # langdetect sympy==1.12 # via torch -tokenizers==0.15.2 +tokenizers==0.19.1 # via transformers torch==2.2.2 # via @@ -100,7 +98,7 @@ tqdm==4.66.2 # huggingface-hub # sacremoses # transformers -transformers==4.37.1 +transformers==4.40.0 # via -r ./huggingface.in typing-extensions==4.11.0 # via diff --git a/requirements/ingest/astra.txt b/requirements/ingest/astra.txt index 184d4422f8..0e8c50605b 100644 --- a/requirements/ingest/astra.txt +++ b/requirements/ingest/astra.txt @@ -14,7 +14,7 @@ bson==0.5.10 # via astrapy cassandra-driver==3.29.1 # via cassio -cassio==0.1.5 +cassio==0.1.6 # via astrapy certifi==2024.2.2 # via @@ -33,7 +33,7 @@ click==8.1.7 # geomet deprecation==2.1.0 # via astrapy -exceptiongroup==1.2.0 +exceptiongroup==1.2.1 # via anyio geomet==0.2.1.post1 # via cassandra-driver @@ -46,7 +46,9 @@ hpack==4.0.0 httpcore==1.0.5 # via httpx httpx[http2]==0.27.0 - # via astrapy + # via + # astrapy + # httpx hyperframe==6.0.1 # via h2 idna==3.7 diff --git a/requirements/ingest/azure.txt b/requirements/ingest/azure.txt index 5aebf17a27..2c48b6950e 100644 --- a/requirements/ingest/azure.txt +++ b/requirements/ingest/azure.txt @@ -4,9 +4,9 @@ # # pip-compile ./ingest/azure.in # -adlfs==2024.2.0 +adlfs==2024.4.1 # via 
-r ./ingest/azure.in -aiohttp==3.9.4 +aiohttp==3.9.5 # via adlfs aiosignal==1.3.1 # via aiohttp @@ -80,7 +80,9 @@ portalocker==2.8.2 pycparser==2.22 # via cffi pyjwt[crypto]==2.8.0 - # via msal + # via + # msal + # pyjwt requests==2.31.0 # via # -c ./ingest/../base.txt diff --git a/requirements/ingest/box.txt b/requirements/ingest/box.txt index 80244e0885..2f3c8980ab 100644 --- a/requirements/ingest/box.txt +++ b/requirements/ingest/box.txt @@ -9,7 +9,9 @@ attrs==23.2.0 boxfs==0.3.0 # via -r ./ingest/box.in boxsdk[jwt]==3.9.2 - # via boxfs + # via + # boxfs + # boxsdk certifi==2024.2.2 # via # -c ./ingest/../base.txt diff --git a/requirements/ingest/chroma.txt b/requirements/ingest/chroma.txt index 25a35a2941..d4acacdc3a 100644 --- a/requirements/ingest/chroma.txt +++ b/requirements/ingest/chroma.txt @@ -44,9 +44,9 @@ coloredlogs==15.0.1 # via onnxruntime deprecated==1.2.14 # via opentelemetry-api -exceptiongroup==1.2.0 +exceptiongroup==1.2.1 # via anyio -fastapi==0.110.1 +fastapi==0.110.2 # via chromadb filelock==3.13.4 # via huggingface-hub @@ -58,7 +58,7 @@ google-auth==2.29.0 # via kubernetes googleapis-common-protos==1.63.0 # via opentelemetry-exporter-otlp-proto-grpc -grpcio==1.62.1 +grpcio==1.62.2 # via # chromadb # opentelemetry-exporter-otlp-proto-grpc @@ -95,10 +95,8 @@ oauthlib==3.2.2 # via # kubernetes # requests-oauthlib -onnxruntime==1.15.1 - # via - # -c ./ingest/../deps/constraints.txt - # chromadb +onnxruntime==1.17.3 + # via chromadb opentelemetry-api==1.16.0 # via # chromadb @@ -131,7 +129,7 @@ protobuf==4.23.4 # googleapis-common-protos # onnxruntime # opentelemetry-proto -pulsar-client==3.4.0 +pulsar-client==3.5.0 # via chromadb pyasn1==0.6.0 # via @@ -186,7 +184,7 @@ sympy==1.12 # via onnxruntime tenacity==8.2.3 # via chromadb -tokenizers==0.15.2 +tokenizers==0.19.1 # via chromadb tqdm==4.66.2 # via @@ -216,7 +214,9 @@ urllib3==1.26.18 # kubernetes # requests uvicorn[standard]==0.29.0 - # via chromadb + # via + # chromadb + # uvicorn 
uvloop==0.19.0 # via uvicorn watchfiles==0.21.0 diff --git a/requirements/ingest/clarifai.txt b/requirements/ingest/clarifai.txt index e58f80efc8..374b977da0 100644 --- a/requirements/ingest/clarifai.txt +++ b/requirements/ingest/clarifai.txt @@ -13,7 +13,7 @@ charset-normalizer==3.3.2 # via # -c ./ingest/../base.txt # requests -clarifai==10.3.0 +clarifai==10.3.1 # via -r ./ingest/clarifai.in clarifai-grpc==10.2.3 # via clarifai @@ -21,7 +21,7 @@ contextlib2==21.6.0 # via schema googleapis-common-protos==1.63.0 # via clarifai-grpc -grpcio==1.62.1 +grpcio==1.62.2 # via clarifai-grpc idna==3.7 # via diff --git a/requirements/ingest/delta-table.txt b/requirements/ingest/delta-table.txt index c6bea36584..1053728f11 100644 --- a/requirements/ingest/delta-table.txt +++ b/requirements/ingest/delta-table.txt @@ -12,7 +12,7 @@ numpy==1.26.4 # via # -c ./ingest/../base.txt # pyarrow -pyarrow==15.0.2 +pyarrow==16.0.0 # via deltalake pyarrow-hotfix==0.6 # via deltalake diff --git a/requirements/ingest/discord.txt b/requirements/ingest/discord.txt index 5ac3a1937e..5187978136 100644 --- a/requirements/ingest/discord.txt +++ b/requirements/ingest/discord.txt @@ -4,7 +4,7 @@ # # pip-compile ./ingest/discord.in # -aiohttp==3.9.4 +aiohttp==3.9.5 # via discord-py aiosignal==1.3.1 # via aiohttp diff --git a/requirements/ingest/embed-aws-bedrock.txt b/requirements/ingest/embed-aws-bedrock.txt index a407d81f93..31052577e9 100644 --- a/requirements/ingest/embed-aws-bedrock.txt +++ b/requirements/ingest/embed-aws-bedrock.txt @@ -4,7 +4,7 @@ # # pip-compile ./ingest/embed-aws-bedrock.in # -aiohttp==3.9.4 +aiohttp==3.9.5 # via langchain-community aiosignal==1.3.1 # via aiohttp @@ -51,11 +51,11 @@ jsonpatch==1.33 # via langchain-core jsonpointer==2.4 # via jsonpatch -langchain-community==0.0.32 +langchain-community==0.0.34 # via -r ./ingest/embed-aws-bedrock.in -langchain-core==0.1.42 +langchain-core==0.1.45 # via langchain-community -langsmith==0.1.46 +langsmith==0.1.49 # via # 
langchain-community # langchain-core @@ -75,7 +75,7 @@ numpy==1.26.4 # via # -c ./ingest/../base.txt # langchain-community -orjson==3.10.0 +orjson==3.10.1 # via langsmith packaging==23.2 # via diff --git a/requirements/ingest/embed-huggingface.txt b/requirements/ingest/embed-huggingface.txt index bbdfee7bd9..ce967ba576 100644 --- a/requirements/ingest/embed-huggingface.txt +++ b/requirements/ingest/embed-huggingface.txt @@ -4,7 +4,7 @@ # # pip-compile ./ingest/embed-huggingface.in # -aiohttp==3.9.4 +aiohttp==3.9.5 # via langchain-community aiosignal==1.3.1 # via aiohttp @@ -62,11 +62,11 @@ jsonpatch==1.33 # via langchain-core jsonpointer==2.4 # via jsonpatch -langchain-community==0.0.32 +langchain-community==0.0.34 # via -r ./ingest/embed-huggingface.in -langchain-core==0.1.42 +langchain-core==0.1.45 # via langchain-community -langsmith==0.1.46 +langsmith==0.1.49 # via # langchain-community # langchain-core @@ -96,7 +96,7 @@ numpy==1.26.4 # scipy # sentence-transformers # transformers -orjson==3.10.0 +orjson==3.10.1 # via langsmith packaging==23.2 # via @@ -120,7 +120,7 @@ pyyaml==6.0.1 # langchain-community # langchain-core # transformers -regex==2023.12.25 +regex==2024.4.16 # via # -c ./ingest/../base.txt # transformers @@ -131,10 +131,8 @@ requests==2.31.0 # langchain-community # langsmith # transformers -safetensors==0.3.2 - # via - # -c ./ingest/../deps/constraints.txt - # transformers +safetensors==0.4.3 + # via transformers scikit-learn==1.4.2 # via sentence-transformers scipy==1.10.1 @@ -142,7 +140,7 @@ scipy==1.10.1 # -c ./ingest/../deps/constraints.txt # scikit-learn # sentence-transformers -sentence-transformers==2.6.1 +sentence-transformers==2.7.0 # via -r ./ingest/embed-huggingface.in sqlalchemy==2.0.29 # via langchain-community @@ -154,7 +152,7 @@ tenacity==8.2.3 # langchain-core threadpoolctl==3.4.0 # via scikit-learn -tokenizers==0.15.2 +tokenizers==0.19.1 # via transformers torch==2.2.2 # via @@ -166,7 +164,7 @@ tqdm==4.66.2 # huggingface-hub # 
sentence-transformers # transformers -transformers==4.37.1 +transformers==4.40.0 # via sentence-transformers typing-extensions==4.11.0 # via diff --git a/requirements/ingest/embed-octoai.txt b/requirements/ingest/embed-octoai.txt index f8a265c61c..d43b301b14 100644 --- a/requirements/ingest/embed-octoai.txt +++ b/requirements/ingest/embed-octoai.txt @@ -24,7 +24,7 @@ charset-normalizer==3.3.2 # requests distro==1.9.0 # via openai -exceptiongroup==1.2.0 +exceptiongroup==1.2.1 # via anyio h11==0.14.0 # via httpcore @@ -38,13 +38,13 @@ idna==3.7 # anyio # httpx # requests -openai==1.17.0 +openai==1.23.2 # via -r ./ingest/embed-octoai.in pydantic==2.7.0 # via openai pydantic-core==2.18.1 # via pydantic -regex==2023.12.25 +regex==2024.4.16 # via # -c ./ingest/../base.txt # tiktoken diff --git a/requirements/ingest/embed-openai.txt b/requirements/ingest/embed-openai.txt index 13cd1a6802..dae330e673 100644 --- a/requirements/ingest/embed-openai.txt +++ b/requirements/ingest/embed-openai.txt @@ -4,7 +4,7 @@ # # pip-compile ./ingest/embed-openai.in # -aiohttp==3.9.4 +aiohttp==3.9.5 # via langchain-community aiosignal==1.3.1 # via aiohttp @@ -36,7 +36,7 @@ dataclasses-json==0.6.4 # langchain-community distro==1.9.0 # via openai -exceptiongroup==1.2.0 +exceptiongroup==1.2.1 # via anyio frozenlist==1.4.1 # via @@ -59,11 +59,11 @@ jsonpatch==1.33 # via langchain-core jsonpointer==2.4 # via jsonpatch -langchain-community==0.0.32 +langchain-community==0.0.34 # via -r ./ingest/embed-openai.in -langchain-core==0.1.42 +langchain-core==0.1.45 # via langchain-community -langsmith==0.1.46 +langsmith==0.1.49 # via # langchain-community # langchain-core @@ -83,9 +83,9 @@ numpy==1.26.4 # via # -c ./ingest/../base.txt # langchain-community -openai==1.17.0 +openai==1.23.2 # via -r ./ingest/embed-openai.in -orjson==3.10.0 +orjson==3.10.1 # via langsmith packaging==23.2 # via @@ -104,7 +104,7 @@ pyyaml==6.0.1 # via # langchain-community # langchain-core -regex==2023.12.25 +regex==2024.4.16 # 
via # -c ./ingest/../base.txt # tiktoken diff --git a/requirements/ingest/embed-vertexai.txt b/requirements/ingest/embed-vertexai.txt index c5442f9ed4..39aad94d13 100644 --- a/requirements/ingest/embed-vertexai.txt +++ b/requirements/ingest/embed-vertexai.txt @@ -4,7 +4,7 @@ # # pip-compile ./ingest/embed-vertexai.in # -aiohttp==3.9.4 +aiohttp==3.9.5 # via # langchain # langchain-community @@ -42,6 +42,7 @@ frozenlist==1.4.1 # aiosignal google-api-core[grpc]==2.18.0 # via + # google-api-core # google-cloud-aiplatform # google-cloud-bigquery # google-cloud-core @@ -55,9 +56,9 @@ google-auth==2.29.0 # google-cloud-core # google-cloud-resource-manager # google-cloud-storage -google-cloud-aiplatform==1.47.0 +google-cloud-aiplatform==1.48.0 # via langchain-google-vertexai -google-cloud-bigquery==3.20.1 +google-cloud-bigquery==3.21.0 # via google-cloud-aiplatform google-cloud-core==2.4.1 # via @@ -84,13 +85,13 @@ googleapis-common-protos[grpc]==1.63.0 # grpcio-status grpc-google-iam-v1==0.13.0 # via google-cloud-resource-manager -grpcio==1.62.1 +grpcio==1.62.2 # via # google-api-core # googleapis-common-protos # grpc-google-iam-v1 # grpcio-status -grpcio-status==1.62.1 +grpcio-status==1.62.2 # via google-api-core idna==3.7 # via @@ -105,11 +106,11 @@ jsonpointer==2.4 # via jsonpatch langchain==0.1.16 # via -r ./ingest/embed-vertexai.in -langchain-community==0.0.32 +langchain-community==0.0.34 # via # -r ./ingest/embed-vertexai.in # langchain -langchain-core==0.1.42 +langchain-core==0.1.45 # via # langchain # langchain-community @@ -119,7 +120,7 @@ langchain-google-vertexai==1.0.1 # via -r ./ingest/embed-vertexai.in langchain-text-splitters==0.0.1 # via langchain -langsmith==0.1.46 +langsmith==0.1.49 # via # langchain # langchain-community @@ -142,7 +143,7 @@ numpy==1.26.4 # langchain # langchain-community # shapely -orjson==3.10.0 +orjson==3.10.1 # via langsmith packaging==23.2 # via @@ -201,7 +202,7 @@ requests==2.31.0 # langsmith rsa==4.9 # via google-auth 
-shapely==2.0.3 +shapely==2.0.4 # via google-cloud-aiplatform six==1.16.0 # via @@ -216,7 +217,7 @@ tenacity==8.2.3 # langchain # langchain-community # langchain-core -types-protobuf==4.25.0.20240410 +types-protobuf==4.25.0.20240417 # via langchain-google-vertexai types-requests==2.31.0.6 # via langchain-google-vertexai diff --git a/requirements/ingest/gcs.txt b/requirements/ingest/gcs.txt index 93cd3360e2..48c5c7f6d2 100644 --- a/requirements/ingest/gcs.txt +++ b/requirements/ingest/gcs.txt @@ -4,7 +4,7 @@ # # pip-compile ./ingest/gcs.in # -aiohttp==3.9.4 +aiohttp==3.9.5 # via gcsfs aiosignal==1.3.1 # via aiohttp diff --git a/requirements/ingest/github.txt b/requirements/ingest/github.txt index ed8ec5fdb1..18e29fc3c4 100644 --- a/requirements/ingest/github.txt +++ b/requirements/ingest/github.txt @@ -30,7 +30,9 @@ pycparser==2.22 pygithub==2.3.0 # via -r ./ingest/github.in pyjwt[crypto]==2.8.0 - # via pygithub + # via + # pygithub + # pyjwt pynacl==1.5.0 # via pygithub requests==2.31.0 diff --git a/requirements/ingest/google-drive.txt b/requirements/ingest/google-drive.txt index 297374a0f8..86aa1c2df7 100644 --- a/requirements/ingest/google-drive.txt +++ b/requirements/ingest/google-drive.txt @@ -17,7 +17,7 @@ charset-normalizer==3.3.2 # requests google-api-core==2.18.0 # via google-api-python-client -google-api-python-client==2.125.0 +google-api-python-client==2.126.0 # via -r ./ingest/google-drive.in google-auth==2.29.0 # via diff --git a/requirements/ingest/notion.txt b/requirements/ingest/notion.txt index cb19ab6d29..fcc3ac65c5 100644 --- a/requirements/ingest/notion.txt +++ b/requirements/ingest/notion.txt @@ -14,7 +14,7 @@ certifi==2024.2.2 # -c ./ingest/../deps/constraints.txt # httpcore # httpx -exceptiongroup==1.2.0 +exceptiongroup==1.2.1 # via anyio h11==0.14.0 # via httpcore diff --git a/requirements/ingest/onedrive.txt b/requirements/ingest/onedrive.txt index ced2374b7d..8922ec418c 100644 --- a/requirements/ingest/onedrive.txt +++ 
b/requirements/ingest/onedrive.txt @@ -40,7 +40,9 @@ office365-rest-python-client==2.4.2 pycparser==2.22 # via cffi pyjwt[crypto]==2.8.0 - # via msal + # via + # msal + # pyjwt pytz==2024.1 # via office365-rest-python-client requests==2.31.0 diff --git a/requirements/ingest/outlook.txt b/requirements/ingest/outlook.txt index 9a6ecbe3e4..2129b31be5 100644 --- a/requirements/ingest/outlook.txt +++ b/requirements/ingest/outlook.txt @@ -34,7 +34,9 @@ office365-rest-python-client==2.4.2 pycparser==2.22 # via cffi pyjwt[crypto]==2.8.0 - # via msal + # via + # msal + # pyjwt pytz==2024.1 # via office365-rest-python-client requests==2.31.0 diff --git a/requirements/ingest/qdrant.txt b/requirements/ingest/qdrant.txt index 277a5ad599..41b0c25d26 100644 --- a/requirements/ingest/qdrant.txt +++ b/requirements/ingest/qdrant.txt @@ -16,13 +16,13 @@ certifi==2024.2.2 # -c ./ingest/../deps/constraints.txt # httpcore # httpx -exceptiongroup==1.2.0 +exceptiongroup==1.2.1 # via anyio -grpcio==1.62.1 +grpcio==1.62.2 # via # grpcio-tools # qdrant-client -grpcio-tools==1.62.1 +grpcio-tools==1.62.2 # via qdrant-client h11==0.14.0 # via httpcore @@ -33,7 +33,9 @@ hpack==4.0.0 httpcore==1.0.5 # via httpx httpx[http2]==0.27.0 - # via qdrant-client + # via + # httpx + # qdrant-client hyperframe==6.0.1 # via h2 idna==3.7 diff --git a/requirements/ingest/s3.txt b/requirements/ingest/s3.txt index d5ad8f8589..2be658633e 100644 --- a/requirements/ingest/s3.txt +++ b/requirements/ingest/s3.txt @@ -6,7 +6,7 @@ # aiobotocore==2.12.3 # via s3fs -aiohttp==3.9.4 +aiohttp==3.9.5 # via # aiobotocore # s3fs diff --git a/requirements/ingest/salesforce.txt b/requirements/ingest/salesforce.txt index 2d8f69bd86..07e2598555 100644 --- a/requirements/ingest/salesforce.txt +++ b/requirements/ingest/salesforce.txt @@ -18,7 +18,7 @@ charset-normalizer==3.3.2 # -c ./ingest/../base.txt # requests cryptography==42.0.5 - # via simple-salesforce + # via pyjwt idna==3.7 # via # -c ./ingest/../base.txt @@ -32,21 +32,14 
@@ lxml==4.9.4 # zeep more-itertools==10.2.0 # via simple-salesforce -pendulum==3.0.0 - # via simple-salesforce platformdirs==3.10.0 # via # -c ./ingest/../deps/constraints.txt # zeep pycparser==2.22 # via cffi -pyjwt==2.8.0 +pyjwt[crypto]==2.8.0 # via simple-salesforce -python-dateutil==2.9.0.post0 - # via - # -c ./ingest/../base.txt - # pendulum - # time-machine pytz==2024.1 # via zeep requests==2.31.0 @@ -60,17 +53,16 @@ requests-file==2.0.0 # via zeep requests-toolbelt==1.0.0 # via zeep -simple-salesforce==1.12.5 +simple-salesforce==1.12.6 # via -r ./ingest/salesforce.in six==1.16.0 # via # -c ./ingest/../base.txt # isodate - # python-dateutil -time-machine==2.14.1 - # via pendulum -tzdata==2024.1 - # via pendulum +typing-extensions==4.11.0 + # via + # -c ./ingest/../base.txt + # simple-salesforce urllib3==1.26.18 # via # -c ./ingest/../base.txt diff --git a/requirements/ingest/sharepoint.txt b/requirements/ingest/sharepoint.txt index 4eb8e6b15e..9167a159ed 100644 --- a/requirements/ingest/sharepoint.txt +++ b/requirements/ingest/sharepoint.txt @@ -34,7 +34,9 @@ office365-rest-python-client==2.4.2 pycparser==2.22 # via cffi pyjwt[crypto]==2.8.0 - # via msal + # via + # msal + # pyjwt pytz==2024.1 # via office365-rest-python-client requests==2.31.0 diff --git a/requirements/ingest/weaviate.txt b/requirements/ingest/weaviate.txt index bebb5b6912..41a3bf3115 100644 --- a/requirements/ingest/weaviate.txt +++ b/requirements/ingest/weaviate.txt @@ -27,16 +27,16 @@ charset-normalizer==3.3.2 # requests cryptography==42.0.5 # via authlib -exceptiongroup==1.2.0 +exceptiongroup==1.2.1 # via anyio -grpcio==1.62.1 +grpcio==1.62.2 # via # grpcio-health-checking # grpcio-tools # weaviate-client -grpcio-health-checking==1.62.1 +grpcio-health-checking==1.62.2 # via weaviate-client -grpcio-tools==1.62.1 +grpcio-tools==1.62.2 # via weaviate-client h11==0.14.0 # via httpcore diff --git a/requirements/test.txt b/requirements/test.txt index 5833f0e2bc..9034cd9818 100644 --- 
a/requirements/test.txt +++ b/requirements/test.txt @@ -10,7 +10,7 @@ appdirs==1.4.4 # via label-studio-tools autoflake==2.3.1 # via -r ./test.in -black==24.3.0 +black==24.4.0 # via -r ./test.in certifi==2024.2.2 # via @@ -29,7 +29,7 @@ coverage[toml]==7.4.4 # via # -r ./test.in # pytest-cov -exceptiongroup==1.2.0 +exceptiongroup==1.2.1 # via pytest flake8==7.0.0 # via @@ -39,7 +39,7 @@ flake8-print==5.0.0 # via -r ./test.in freezegun==1.4.0 # via -r ./test.in -grpcio==1.62.1 +grpcio==1.62.2 # via -r ./test.in idna==3.7 # via @@ -81,7 +81,7 @@ platformdirs==3.10.0 # via # -c ././deps/constraints.txt # black -pluggy==1.4.0 +pluggy==1.5.0 # via pytest pycodestyle==2.11.1 # via @@ -115,7 +115,7 @@ requests==2.31.0 # via # -c ./base.txt # label-studio-sdk -ruff==0.3.7 +ruff==0.4.1 # via -r ./test.in six==1.16.0 # via diff --git a/unstructured/__version__.py b/unstructured/__version__.py index d0b6ef94a7..10b9b96d79 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.13.3-dev9" # pragma: no cover +__version__ = "0.13.3" # pragma: no cover From 05ff9750813b1e18121a66143ef274ac271358b0 Mon Sep 17 00:00:00 2001 From: Steve Canny Date: Mon, 22 Apr 2024 16:58:17 -0700 Subject: [PATCH 2/2] fix: remove unused `ElementMetadata.section` (#2921) **Summary** The `.section` field in `ElementMetadata` is dead code, possibly a remainder from a prior iteration of `partition_epub()`. In any case, it is not populated by any partitioner. Remove it and any code that uses it. 
--- CHANGELOG.md | 10 +++ docs/source/core/chunking.rst | 7 --- test_unstructured/chunking/test_base.py | 63 ------------------- test_unstructured/chunking/test_title.py | 37 ----------- test_unstructured/partition/epub/test_epub.py | 2 - test_unstructured/staging/test_base.py | 3 - unstructured/__version__.py | 2 +- unstructured/chunking/base.py | 45 ------------- unstructured/chunking/title.py | 2 - unstructured/documents/elements.py | 18 +++--- unstructured/partition/common.py | 2 - 11 files changed, 21 insertions(+), 170 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ab0f43591b..c484e9eca1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,13 @@ +## 0.13.4-dev0 + +### Enhancements + +### Features + +### Fixes + +* **Remove ElementMetadata.section field.**. This field was unused, not populated by any partitioners. + ## 0.13.3 ### Enhancements diff --git a/docs/source/core/chunking.rst b/docs/source/core/chunking.rst index ee23620d59..7a9bfef11e 100644 --- a/docs/source/core/chunking.rst +++ b/docs/source/core/chunking.rst @@ -152,13 +152,6 @@ following behaviors: ``Title`` element would fit in the prior chunk. This implements the first aspect of the "preserve section boundaries" contract. -- **Detect metadata.section change.** An element with a new value in ``element.metadata.section`` is - considered to start a new section. When a change in this value is encountered a new chunk is - started. This implements the second aspect of preserving section boundaries. This metadata is not - present in all document formats so is not used alone. An element having ``None`` for this metadata - field is considered to be part of the prior section; a section break is only detected on an - explicit change in value. - - **Respect page boundaries.** Page boundaries can optionally also be respected using the ``multipage_sections`` argument. This defaults to ``True`` meaning that a page break does *not* start a new chunk. 
Setting this to ``False`` will separate elements that occur on different pages diff --git a/test_unstructured/chunking/test_base.py b/test_unstructured/chunking/test_base.py index dd9a68fa78..ab90d38d18 100644 --- a/test_unstructured/chunking/test_base.py +++ b/test_unstructured/chunking/test_base.py @@ -17,7 +17,6 @@ TextPreChunk, TextPreChunkAccumulator, _TextSplitter, - is_in_next_section, is_on_next_page, is_title, ) @@ -1514,68 +1513,6 @@ def but_it_does_not_generate_a_TextPreChunk_on_flush_when_empty(self): # ================================================================================================ -class Describe_is_in_next_section: - """Unit-test suite for `unstructured.chunking.base.is_in_next_section()` function. - - `is_in_next_section()` is not itself a predicate, rather it returns a predicate on Element - (`Callable[[Element], bool]`) that can be called repeatedly to detect section changes in an - element stream. - """ - - def it_is_false_for_the_first_element_when_it_has_a_non_None_section(self): - """This is an explicit first-section; first-section does not represent a section break.""" - pred = is_in_next_section() - assert not pred(Text("abcd", metadata=ElementMetadata(section="Introduction"))) - - def and_it_is_false_for_the_first_element_when_it_has_a_None_section(self): - """This is an anonymous first-section; still doesn't represent a section break.""" - pred = is_in_next_section() - assert not pred(Text("abcd")) - - def it_is_false_for_None_section_elements_that_follow_an_explicit_first_section(self): - """A `None` section element is considered to continue the prior section.""" - pred = is_in_next_section() - assert not pred(Text("abcd", metadata=ElementMetadata(section="Introduction"))) - assert not pred(Text("efgh")) - assert not pred(Text("ijkl")) - - def and_it_is_false_for_None_section_elements_that_follow_an_anonymous_first_section(self): - """A `None` section element is considered to continue the prior section.""" - pred = 
is_in_next_section() - assert not pred(Text("abcd")) - assert not pred(Text("efgh")) - assert not pred(Text("ijkl")) - - def it_is_false_for_matching_section_elements_that_follow_an_explicit_first_section(self): - pred = is_in_next_section() - assert not pred(Text("abcd", metadata=ElementMetadata(section="Introduction"))) - assert not pred(Text("efgh", metadata=ElementMetadata(section="Introduction"))) - assert not pred(Text("ijkl", metadata=ElementMetadata(section="Introduction"))) - - def it_is_true_for_an_explicit_section_element_that_follows_an_anonymous_first_section(self): - pred = is_in_next_section() - assert not pred(Text("abcd")) - assert not pred(Text("efgh")) - assert pred(Text("ijkl", metadata=ElementMetadata(section="Introduction"))) - - def and_it_is_true_for_a_different_explicit_section_that_follows_an_explicit_section(self): - pred = is_in_next_section() - assert not pred(Text("abcd", metadata=ElementMetadata(section="Introduction"))) - assert pred(Text("efgh", metadata=ElementMetadata(section="Summary"))) - - def it_is_true_whenever_the_section_explicitly_changes_except_at_the_start(self): - pred = is_in_next_section() - assert not pred(Text("abcd")) - assert pred(Text("efgh", metadata=ElementMetadata(section="Introduction"))) - assert not pred(Text("ijkl")) - assert not pred(Text("mnop", metadata=ElementMetadata(section="Introduction"))) - assert not pred(Text("qrst")) - assert pred(Text("uvwx", metadata=ElementMetadata(section="Summary"))) - assert not pred(Text("yzab", metadata=ElementMetadata(section="Summary"))) - assert not pred(Text("cdef")) - assert pred(Text("ghij", metadata=ElementMetadata(section="Appendix"))) - - class Describe_is_on_next_page: """Unit-test suite for `unstructured.chunking.base.is_on_next_page()` function. 
diff --git a/test_unstructured/chunking/test_title.py b/test_unstructured/chunking/test_title.py index be5b82e25a..7ffa652b06 100644 --- a/test_unstructured/chunking/test_title.py +++ b/test_unstructured/chunking/test_title.py @@ -139,43 +139,6 @@ def test_chunk_by_title(): ) -def test_chunk_by_title_respects_section_change(): - elements: list[Element] = [ - Title("A Great Day", metadata=ElementMetadata(section="first")), - Text("Today is a great day.", metadata=ElementMetadata(section="second")), - Text("It is sunny outside.", metadata=ElementMetadata(section="second")), - Table("Heading\nCell text"), - Title("An Okay Day"), - Text("Today is an okay day."), - Text("It is rainy outside."), - Title("A Bad Day"), - Text( - "Today is a bad day.", - metadata=ElementMetadata( - regex_metadata={"a": [RegexMetadata(text="A", start=0, end=1)]}, - ), - ), - Text("It is storming outside."), - CheckBox(), - ] - - chunks = chunk_by_title(elements, combine_text_under_n_chars=0) - - assert chunks == [ - CompositeElement( - "A Great Day", - ), - CompositeElement( - "Today is a great day.\n\nIt is sunny outside.", - ), - Table("Heading\nCell text"), - CompositeElement("An Okay Day\n\nToday is an okay day.\n\nIt is rainy outside."), - CompositeElement( - "A Bad Day\n\nToday is a bad day.\n\nIt is storming outside.", - ), - ] - - def test_chunk_by_title_separates_by_page_number(): elements: list[Element] = [ Title("A Great Day", metadata=ElementMetadata(page_number=1)), diff --git a/test_unstructured/partition/epub/test_epub.py b/test_unstructured/partition/epub/test_epub.py index 5af52ac1c6..95eb68ae02 100644 --- a/test_unstructured/partition/epub/test_epub.py +++ b/test_unstructured/partition/epub/test_epub.py @@ -77,7 +77,6 @@ def test_partition_epub_from_filename_exclude_metadata(): assert elements[0].metadata.filetype is None assert elements[0].metadata.page_name is None assert elements[0].metadata.filename is None - assert elements[0].metadata.section is None def 
test_partition_epub_from_file_exlcude_metadata(): @@ -87,7 +86,6 @@ def test_partition_epub_from_file_exlcude_metadata(): assert elements[0].metadata.filetype is None assert elements[0].metadata.page_name is None assert elements[0].metadata.filename is None - assert elements[0].metadata.section is None def test_partition_epub_metadata_date( diff --git a/test_unstructured/staging/test_base.py b/test_unstructured/staging/test_base.py index 0aa0fa5ed9..105e1b4615 100644 --- a/test_unstructured/staging/test_base.py +++ b/test_unstructured/staging/test_base.py @@ -166,7 +166,6 @@ def test_default_pandas_dtypes(): sent_from=["sent", "from"], sent_to=["sent", "to"], subject="subject", - section="section", header_footer_type="header_footer_type", emphasized_text_contents=["emphasized", "text", "contents"], emphasized_text_tags=["emphasized", "text", "tags"], @@ -321,7 +320,6 @@ def test_convert_to_coco(): sent_from=["sent", "from"], sent_to=["sent", "to"], subject="subject", - section="section", header_footer_type="header_footer_type", emphasized_text_contents=["emphasized", "text", "contents"], emphasized_text_tags=["emphasized", "text", "tags"], @@ -366,7 +364,6 @@ def test_convert_to_coco(): sent_from=["sent", "from"], sent_to=["sent", "to"], subject="subject", - section="section", header_footer_type="header_footer_type", emphasized_text_contents=["emphasized", "text", "contents"], emphasized_text_tags=["emphasized", "text", "tags"], diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 10b9b96d79..927d266d2c 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.13.3" # pragma: no cover +__version__ = "0.13.4-dev0" # pragma: no cover diff --git a/unstructured/chunking/base.py b/unstructured/chunking/base.py index 106fe9b38e..edf37b3d2d 100644 --- a/unstructured/chunking/base.py +++ b/unstructured/chunking/base.py @@ -1022,51 +1022,6 @@ def will_fit(self, pre_chunk: TextPreChunk) -> bool: # 
================================================================================================ -def is_in_next_section() -> BoundaryPredicate: - """Not a predicate itself, calling this returns a predicate that triggers on each new section. - - The lifetime of the returned callable cannot extend beyond a single element-stream because it - stores current state (current section) that is particular to that element stream. - - A "section" of this type is particular to the EPUB format (so far) and not to be confused with - a "section" composed of a section-heading (`Title` element) followed by content elements. - - The returned predicate tracks the current section, starting at `None`. Calling with an element - with a different value for `metadata.section` returns True, indicating the element starts a new - section boundary, and updates the enclosed section name ready for the next transition. - """ - current_section: Optional[str] = None - is_first: bool = True - - def section_changed(element: Element) -> bool: - nonlocal current_section, is_first - - section = element.metadata.section - - # -- The first element never reports a section break, it starts the first section of the - # -- document. That section could be named (section is non-None) or anonymous (section is - # -- None). We don't really have to care. - if is_first: - current_section = section - is_first = False - return False - - # -- An element with a `None` section is assumed to continue the current section. It never - # -- updates the current-section because once set, the current-section is "sticky" until - # -- replaced by another explicit section. 
- if section is None: - return False - - # -- another element with the same section continues that section -- - if section == current_section: - return False - - current_section = section - return True - - return section_changed - - def is_on_next_page() -> BoundaryPredicate: """Not a predicate itself, calling this returns a predicate that triggers on each new page. diff --git a/unstructured/chunking/title.py b/unstructured/chunking/title.py index e9f5e54e4d..878302301e 100644 --- a/unstructured/chunking/title.py +++ b/unstructured/chunking/title.py @@ -13,7 +13,6 @@ ChunkingOptions, PreChunkCombiner, PreChunker, - is_in_next_section, is_on_next_page, is_title, ) @@ -121,7 +120,6 @@ def boundary_predicates(self) -> tuple[BoundaryPredicate, ...]: def iter_boundary_predicates() -> Iterator[BoundaryPredicate]: yield is_title - yield is_in_next_section() if not self.multipage_sections: yield is_on_next_page() diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index d3783e9c4c..95778636ce 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -191,8 +191,6 @@ class ElementMetadata: parent_id: Optional[str] # -- "fields" e.g. status, dept.no, etc. 
extracted from text via regex -- regex_metadata: Optional[dict[str, list[RegexMetadata]]] - # -- EPUB document section -- - section: Optional[str] # -- e-mail specific metadata fields -- sent_from: Optional[list[str]] @@ -235,7 +233,6 @@ def __init__( page_number: Optional[int] = None, parent_id: Optional[str] = None, regex_metadata: Optional[dict[str, list[RegexMetadata]]] = None, - section: Optional[str] = None, sent_from: Optional[list[str]] = None, sent_to: Optional[list[str]] = None, signature: Optional[str] = None, @@ -275,7 +272,6 @@ def __init__( self.page_number = page_number self.parent_id = parent_id self.regex_metadata = regex_metadata - self.section = section self.sent_from = sent_from self.sent_to = sent_to self.signature = signature @@ -488,7 +484,6 @@ def field_consolidation_strategies(cls) -> dict[str, ConsolidationStrategy]: "page_number": cls.FIRST, "parent_id": cls.DROP, "regex_metadata": cls.REGEX, - "section": cls.FIRST, "sent_from": cls.FIRST, "sent_to": cls.FIRST, "signature": cls.FIRST, @@ -671,7 +666,7 @@ def to_dict(cls): class Element(abc.ABC): - """An element is a section of a page in the document. + """An element is a semantically-coherent component of a document, often a paragraph. There are a few design principles that are followed when creating an element: 1. It will always have an ID, which by default is a random UUID. @@ -694,7 +689,9 @@ def __init__( metadata: Optional[ElementMetadata] = None, detection_origin: Optional[str] = None, ): - if element_id is not None and not isinstance(element_id, str): + if element_id is not None and not isinstance( + element_id, str + ): # pyright: ignore[reportUnnecessaryIsInstance] raise ValueError("element_id must be of type str or None.") self._element_id = element_id @@ -885,7 +882,12 @@ class Formula(Text): class CompositeElement(Text): - """A section of text consisting of a combination of elements.""" + """A chunk formed from text (non-Table) elements. + + Only produced by chunking. 
An instance may be formed by combining one or more sequential + elements produced by partitioning. It is also used when text-splitting an "oversized" element, + a single element that by itself is larger than the requested chunk size. + """ category = "CompositeElement" diff --git a/unstructured/partition/common.py b/unstructured/partition/common.py index 5caff1897c..3e79573437 100644 --- a/unstructured/partition/common.py +++ b/unstructured/partition/common.py @@ -272,7 +272,6 @@ def add_element_metadata( text_as_html: Optional[str] = None, coordinates: Optional[tuple[tuple[float, float], ...]] = None, coordinate_system: Optional[CoordinateSystem] = None, - section: Optional[str] = None, image_path: Optional[str] = None, detection_origin: Optional[str] = None, languages: Optional[List[str]] = None, @@ -324,7 +323,6 @@ def add_element_metadata( link_start_indexes=link_start_indexes, emphasized_text_contents=emphasized_text_contents, emphasized_text_tags=emphasized_text_tags, - section=section, category_depth=depth, image_path=image_path, languages=languages,