Skip to content

Commit

Permalink
Merge branch 'main' into jj/feature-gapped-chunking-cli
Browse files Browse the repository at this point in the history
  • Loading branch information
Coniferish authored Apr 23, 2024
2 parents 13f1ad6 + 05ff975 commit 44d2d47
Show file tree
Hide file tree
Showing 43 changed files with 153 additions and 303 deletions.
12 changes: 11 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,14 @@
## 0.13.3-dev9
## 0.13.4-dev0

### Enhancements

### Features

### Fixes

* **Remove ElementMetadata.section field.** This field was unused and was not populated by any partitioner.

## 0.13.3

### Enhancements

Expand Down
7 changes: 0 additions & 7 deletions docs/source/core/chunking.rst
Original file line number Diff line number Diff line change
Expand Up @@ -152,13 +152,6 @@ following behaviors:
``Title`` element would fit in the prior chunk. This implements the first aspect of the "preserve
section boundaries" contract.

- **Detect metadata.section change.** An element with a new value in ``element.metadata.section`` is
considered to start a new section. When a change in this value is encountered a new chunk is
started. This implements the second aspect of preserving section boundaries. This metadata is not
present in all document formats, so it is not used alone. An element having ``None`` for this metadata
field is considered to be part of the prior section; a section break is only detected on an
explicit change in value.

- **Respect page boundaries.** Page boundaries can optionally also be respected using the
``multipage_sections`` argument. This defaults to ``True`` meaning that a page break does *not*
start a new chunk. Setting this to ``False`` will separate elements that occur on different pages
Expand Down
4 changes: 2 additions & 2 deletions requirements/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@ compile-base:

.PHONY: compile-all-base
compile-all-base: compile-base compile-test compile-dev
@$(foreach file,$(BASE_REQUIREMENTS),echo "compiling: $(file)" && pip-compile --upgrade $(file);)
@$(foreach file,$(BASE_REQUIREMENTS),echo -e "\n\ncompiling: $(file)" && pip-compile --no-strip-extras --upgrade $(file) || exit;)

.PHONY: compile-ingest
compile-ingest:
@$(foreach file,$(INGEST_REQUIREMENTS),echo "compiling: $(file)" && pip-compile --upgrade $(file);)
@$(foreach file,$(INGEST_REQUIREMENTS),echo -e "\n\ncompiling: $(file)" && pip-compile --no-strip-extras --upgrade $(file) || exit;)

.PHONY: clean
clean: clean-base clean-ingest
Expand Down
2 changes: 1 addition & 1 deletion requirements/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ python-magic==0.4.27
# via -r ./base.in
rapidfuzz==3.8.1
# via -r ./base.in
regex==2023.12.25
regex==2024.4.16
# via nltk
requests==2.31.0
# via
Expand Down
3 changes: 0 additions & 3 deletions requirements/deps/constraints.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@ Office365-REST-Python-Client<2.4.3
# unstructured-inference to be upgraded when unstructured library is upgraded
# https://github.com/Unstructured-IO/unstructured/issues/1458
# unstructured-inference
# unable to build wheel for arm on 0.3.3+
safetensors<=0.3.2
# use the known compatible version of weaviate and unstructured.pytesseract
unstructured.pytesseract>=0.3.12
weaviate-client>3.25.0
Expand All @@ -38,7 +36,6 @@ torch>2
# pinned in unstructured paddleocr
opencv-python==4.8.0.76
opencv-contrib-python==4.8.0.76
onnxruntime==1.15.1
platformdirs==3.10.0

# TODO: Constraint due to langchain, remove when that gets updated:
Expand Down
14 changes: 7 additions & 7 deletions requirements/dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ defusedxml==0.7.1
# via nbconvert
distlib==0.3.8
# via virtualenv
exceptiongroup==1.2.0
exceptiongroup==1.2.1
# via
# -c ./test.txt
# anyio
Expand All @@ -91,7 +91,7 @@ httpcore==1.0.5
# via httpx
httpx==0.27.0
# via jupyterlab
identify==2.5.35
identify==2.5.36
# via pre-commit
idna==3.7
# via
Expand Down Expand Up @@ -195,7 +195,7 @@ markupsafe==2.1.5
# via
# jinja2
# nbconvert
matplotlib-inline==0.1.6
matplotlib-inline==0.1.7
# via
# ipykernel
# ipython
Expand All @@ -216,7 +216,7 @@ nest-asyncio==1.6.0
# via ipykernel
nodeenv==1.8.0
# via pre-commit
notebook==7.1.2
notebook==7.1.3
# via jupyter
notebook-shim==0.2.4
# via
Expand Down Expand Up @@ -294,7 +294,7 @@ pyyaml==6.0.1
# -c ./test.txt
# jupyter-events
# pre-commit
pyzmq==25.1.2
pyzmq==26.0.2
# via
# ipykernel
# jupyter-client
Expand Down Expand Up @@ -368,7 +368,7 @@ tornado==6.4
# jupyterlab
# notebook
# terminado
traitlets==5.14.2
traitlets==5.14.3
# via
# comm
# ipykernel
Expand Down Expand Up @@ -401,7 +401,7 @@ urllib3==1.26.18
# -c ./base.txt
# -c ./test.txt
# requests
virtualenv==20.25.1
virtualenv==20.25.3
# via pre-commit
wcwidth==0.2.13
# via prompt-toolkit
Expand Down
6 changes: 3 additions & 3 deletions requirements/extra-paddleocr.txt
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ importlib-metadata==7.1.0
# via flask
importlib-resources==6.4.0
# via matplotlib
itsdangerous==2.1.2
itsdangerous==2.2.0
# via flask
jinja2==3.1.3
# via
Expand Down Expand Up @@ -188,7 +188,7 @@ scipy==1.10.1
# -c ././deps/constraints.txt
# imgaug
# scikit-image
shapely==2.0.3
shapely==2.0.4
# via
# imgaug
# unstructured-paddleocr
Expand All @@ -200,7 +200,7 @@ six==1.16.0
# imgaug
# python-dateutil
# visualdl
tifffile==2024.2.12
tifffile==2024.4.18
# via scikit-image
tqdm==4.66.2
# via
Expand Down
2 changes: 1 addition & 1 deletion requirements/extra-pdf-image.in
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ pillow_heif
pypdf
# Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
# when unstructured library is.
unstructured-inference==0.7.25
unstructured-inference==0.7.27
# unstructured fork of pytesseract that provides an interface to allow for multiple output formats
# from one tesseract call
unstructured.pytesseract>=0.3.12
19 changes: 8 additions & 11 deletions requirements/extra-pdf-image.txt
Original file line number Diff line number Diff line change
Expand Up @@ -100,10 +100,8 @@ onnx==1.16.0
# via
# -r ./extra-pdf-image.in
# unstructured-inference
onnxruntime==1.15.1
# via
# -c ././deps/constraints.txt
# unstructured-inference
onnxruntime==1.17.3
# via unstructured-inference
opencv-python==4.8.0.76
# via
# -c ././deps/constraints.txt
Expand Down Expand Up @@ -132,7 +130,7 @@ pdfminer-six==20231228
# pdfplumber
pdfplumber==0.11.0
# via layoutparser
pikepdf==8.15.0
pikepdf==8.15.1
# via -r ./extra-pdf-image.in
pillow==10.3.0
# via
Expand Down Expand Up @@ -190,7 +188,7 @@ rapidfuzz==3.8.1
# via
# -c ./base.txt
# unstructured-inference
regex==2023.12.25
regex==2024.4.16
# via
# -c ./base.txt
# transformers
Expand All @@ -199,9 +197,8 @@ requests==2.31.0
# -c ./base.txt
# huggingface-hub
# transformers
safetensors==0.3.2
safetensors==0.4.3
# via
# -c ././deps/constraints.txt
# timm
# transformers
scipy==1.10.1
Expand All @@ -218,7 +215,7 @@ sympy==1.12
# torch
timm==0.9.16
# via effdet
tokenizers==0.15.2
tokenizers==0.19.1
# via transformers
torch==2.2.2
# via
Expand All @@ -238,7 +235,7 @@ tqdm==4.66.2
# huggingface-hub
# iopath
# transformers
transformers==4.37.1
transformers==4.40.0
# via unstructured-inference
typing-extensions==4.11.0
# via
Expand All @@ -249,7 +246,7 @@ typing-extensions==4.11.0
# torch
tzdata==2024.1
# via pandas
unstructured-inference==0.7.25
unstructured-inference==0.7.27
# via -r ./extra-pdf-image.in
unstructured-pytesseract==0.3.12
# via
Expand Down
12 changes: 5 additions & 7 deletions requirements/huggingface.txt
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ pyyaml==6.0.1
# via
# huggingface-hub
# transformers
regex==2023.12.25
regex==2024.4.16
# via
# -c ./base.txt
# sacremoses
Expand All @@ -76,10 +76,8 @@ requests==2.31.0
# transformers
sacremoses==0.1.1
# via -r ./huggingface.in
safetensors==0.3.2
# via
# -c ././deps/constraints.txt
# transformers
safetensors==0.4.3
# via transformers
sentencepiece==0.2.0
# via -r ./huggingface.in
six==1.16.0
Expand All @@ -88,7 +86,7 @@ six==1.16.0
# langdetect
sympy==1.12
# via torch
tokenizers==0.15.2
tokenizers==0.19.1
# via transformers
torch==2.2.2
# via
Expand All @@ -100,7 +98,7 @@ tqdm==4.66.2
# huggingface-hub
# sacremoses
# transformers
transformers==4.37.1
transformers==4.40.0
# via -r ./huggingface.in
typing-extensions==4.11.0
# via
Expand Down
8 changes: 5 additions & 3 deletions requirements/ingest/astra.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ bson==0.5.10
# via astrapy
cassandra-driver==3.29.1
# via cassio
cassio==0.1.5
cassio==0.1.6
# via astrapy
certifi==2024.2.2
# via
Expand All @@ -33,7 +33,7 @@ click==8.1.7
# geomet
deprecation==2.1.0
# via astrapy
exceptiongroup==1.2.0
exceptiongroup==1.2.1
# via anyio
geomet==0.2.1.post1
# via cassandra-driver
Expand All @@ -46,7 +46,9 @@ hpack==4.0.0
httpcore==1.0.5
# via httpx
httpx[http2]==0.27.0
# via astrapy
# via
# astrapy
# httpx
hyperframe==6.0.1
# via h2
idna==3.7
Expand Down
8 changes: 5 additions & 3 deletions requirements/ingest/azure.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
#
# pip-compile ./ingest/azure.in
#
adlfs==2024.2.0
adlfs==2024.4.1
# via -r ./ingest/azure.in
aiohttp==3.9.4
aiohttp==3.9.5
# via adlfs
aiosignal==1.3.1
# via aiohttp
Expand Down Expand Up @@ -80,7 +80,9 @@ portalocker==2.8.2
pycparser==2.22
# via cffi
pyjwt[crypto]==2.8.0
# via msal
# via
# msal
# pyjwt
requests==2.31.0
# via
# -c ./ingest/../base.txt
Expand Down
4 changes: 3 additions & 1 deletion requirements/ingest/box.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ attrs==23.2.0
boxfs==0.3.0
# via -r ./ingest/box.in
boxsdk[jwt]==3.9.2
# via boxfs
# via
# boxfs
# boxsdk
certifi==2024.2.2
# via
# -c ./ingest/../base.txt
Expand Down
Loading

0 comments on commit 44d2d47

Please sign in to comment.