Skip to content

Commit

Permalink
Merge branch 'main' into CORE-3587/better-element-ids
Browse files Browse the repository at this point in the history
  • Loading branch information
micmarty-deepsense committed Apr 22, 2024
2 parents 22991c8 + 305247b commit cc8be15
Show file tree
Hide file tree
Showing 49 changed files with 242 additions and 228 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
## 0.13.3-dev8
## 0.13.3

### Enhancements
* **Unique and deterministic hash IDs for elements** Element IDs produced by any partitioning function are now deterministic and unique at the document level by default. Previously, hashes were based only on the element's text; they now also take into account the element's sequence number on a page, the page's number in the document, and the document's file name.
Expand All @@ -16,6 +16,7 @@

* **Add support for extracting text from tag tails in HTML** This fix adds the ability to generate separate elements using tag tails.
* **Add support for extracting text from `<b>` tags in HTML** Now `partition_html()` can extract text from `<b>` tags inside container tags (like `<div>`, `<pre>`).
* **Fix pip-compile make target** The missing `base.in` dependency has been added to the requirements Makefile.

## 0.13.2

Expand Down
2 changes: 1 addition & 1 deletion docs/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ docutils==0.18.1
# sphinx-tabs
furo==2023.7.26
# via -r ./build.in
idna==3.6
idna==3.7
# via
# -c ./base.txt
# requests
Expand Down
12 changes: 8 additions & 4 deletions requirements/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ INGEST_REQUIREMENTSTXT := $(patsubst %.in,%.txt,$(INGEST_REQUIREMENTS))


.PHONY: all
all: compile-base compile-ingest
all: compile-all-base compile-ingest

.PHONY: compile-test
compile-test:
Expand All @@ -19,12 +19,16 @@ compile-dev:
pip-compile --upgrade dev.in

.PHONY: compile-base
compile-base: compile-test compile-dev
@$(foreach file,$(BASE_REQUIREMENTS),echo "compiling: $(file)" && pip-compile --upgrade $(file);)
compile-base:
pip-compile --upgrade base.in

.PHONY: compile-all-base
compile-all-base: compile-base compile-test compile-dev
@$(foreach file,$(BASE_REQUIREMENTS),echo -e "\n\ncompiling: $(file)" && pip-compile --no-strip-extras --upgrade $(file) || exit;)

.PHONY: compile-ingest
compile-ingest:
@$(foreach file,$(INGEST_REQUIREMENTS),echo "compiling: $(file)" && pip-compile --upgrade $(file);)
@$(foreach file,$(INGEST_REQUIREMENTS),echo -e "\n\ncompiling: $(file)" && pip-compile --no-strip-extras --upgrade $(file) || exit;)

.PHONY: clean
clean: clean-base clean-ingest
Expand Down
8 changes: 4 additions & 4 deletions requirements/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,11 @@ emoji==2.11.0
# via -r ./base.in
filetype==1.2.0
# via -r ./base.in
idna==3.6
idna==3.7
# via
# requests
# unstructured-client
joblib==1.3.2
joblib==1.4.0
# via nltk
jsonpath-python==1.0.6
# via unstructured-client
Expand Down Expand Up @@ -67,9 +67,9 @@ python-iso639==2024.2.7
# via -r ./base.in
python-magic==0.4.27
# via -r ./base.in
rapidfuzz==3.7.0
rapidfuzz==3.8.1
# via -r ./base.in
regex==2023.12.25
regex==2024.4.16
# via nltk
requests==2.31.0
# via
Expand Down
2 changes: 1 addition & 1 deletion requirements/build.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ docutils==0.18.1
# sphinx-tabs
furo==2023.7.26
# via -r ./build.in
idna==3.6
idna==3.7
# via
# -c ./base.txt
# requests
Expand Down
3 changes: 0 additions & 3 deletions requirements/deps/constraints.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@ Office365-REST-Python-Client<2.4.3
# unstructured-inference to be upgraded when unstructured library is upgraded
# https://github.com/Unstructured-IO/unstructured/issues/1458
# unstructured-inference
# unable to build wheel for arm on 0.3.3+
safetensors<=0.3.2
# use the known compatible version of weaviate and unstructured.pytesseract
unstructured.pytesseract>=0.3.12
weaviate-client>3.25.0
Expand All @@ -38,7 +36,6 @@ torch>2
# pinned in unstructured paddleocr
opencv-python==4.8.0.76
opencv-contrib-python==4.8.0.76
onnxruntime==1.15.1
platformdirs==3.10.0

# TODO: Constraint due to langchain, remove when that gets updated:
Expand Down
30 changes: 15 additions & 15 deletions requirements/dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,15 +73,15 @@ defusedxml==0.7.1
# via nbconvert
distlib==0.3.8
# via virtualenv
exceptiongroup==1.2.0
exceptiongroup==1.2.1
# via
# -c ./test.txt
# anyio
executing==2.0.1
# via stack-data
fastjsonschema==2.19.1
# via nbformat
filelock==3.13.3
filelock==3.13.4
# via virtualenv
fqdn==1.5.1
# via jsonschema
Expand All @@ -91,9 +91,9 @@ httpcore==1.0.5
# via httpx
httpx==0.27.0
# via jupyterlab
identify==2.5.35
identify==2.5.36
# via pre-commit
idna==3.6
idna==3.7
# via
# -c ./base.txt
# -c ./test.txt
Expand Down Expand Up @@ -134,7 +134,7 @@ jinja2==3.1.3
# jupyterlab
# jupyterlab-server
# nbconvert
json5==0.9.24
json5==0.9.25
# via jupyterlab-server
jsonpointer==2.4
# via jsonschema
Expand Down Expand Up @@ -170,9 +170,9 @@ jupyter-core==5.7.2
# qtconsole
jupyter-events==0.10.0
# via jupyter-server
jupyter-lsp==2.2.4
jupyter-lsp==2.2.5
# via jupyterlab
jupyter-server==2.13.0
jupyter-server==2.14.0
# via
# jupyter-lsp
# jupyterlab
Expand All @@ -181,11 +181,11 @@ jupyter-server==2.13.0
# notebook-shim
jupyter-server-terminals==0.5.3
# via jupyter-server
jupyterlab==4.1.5
jupyterlab==4.1.6
# via notebook
jupyterlab-pygments==0.3.0
# via nbconvert
jupyterlab-server==2.25.4
jupyterlab-server==2.26.0
# via
# jupyterlab
# notebook
Expand All @@ -195,7 +195,7 @@ markupsafe==2.1.5
# via
# jinja2
# nbconvert
matplotlib-inline==0.1.6
matplotlib-inline==0.1.7
# via
# ipykernel
# ipython
Expand All @@ -216,7 +216,7 @@ nest-asyncio==1.6.0
# via ipykernel
nodeenv==1.8.0
# via pre-commit
notebook==7.1.2
notebook==7.1.3
# via jupyter
notebook-shim==0.2.4
# via
Expand Down Expand Up @@ -294,7 +294,7 @@ pyyaml==6.0.1
# -c ./test.txt
# jupyter-events
# pre-commit
pyzmq==25.1.2
pyzmq==26.0.2
# via
# ipykernel
# jupyter-client
Expand Down Expand Up @@ -327,7 +327,7 @@ rpds-py==0.18.0
# via
# jsonschema
# referencing
send2trash==1.8.2
send2trash==1.8.3
# via jupyter-server
six==1.16.0
# via
Expand Down Expand Up @@ -368,7 +368,7 @@ tornado==6.4
# jupyterlab
# notebook
# terminado
traitlets==5.14.2
traitlets==5.14.3
# via
# comm
# ipykernel
Expand Down Expand Up @@ -401,7 +401,7 @@ urllib3==1.26.18
# -c ./base.txt
# -c ./test.txt
# requests
virtualenv==20.25.1
virtualenv==20.25.3
# via pre-commit
wcwidth==0.2.13
# via prompt-toolkit
Expand Down
2 changes: 1 addition & 1 deletion requirements/extra-csv.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ numpy==1.26.4
# via
# -c ./base.txt
# pandas
pandas==2.2.1
pandas==2.2.2
# via -r ./extra-csv.in
python-dateutil==2.9.0.post0
# via
Expand Down
14 changes: 7 additions & 7 deletions requirements/extra-paddleocr.txt
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ cython==3.0.10
# via unstructured-paddleocr
et-xmlfile==1.1.0
# via openpyxl
flask==3.0.2
flask==3.0.3
# via
# flask-babel
# visualdl
Expand All @@ -49,7 +49,7 @@ fonttools==4.51.0
# via matplotlib
future==1.0.0
# via bce-python-sdk
idna==3.6
idna==3.7
# via
# -c ./base.txt
# requests
Expand All @@ -63,7 +63,7 @@ importlib-metadata==7.1.0
# via flask
importlib-resources==6.4.0
# via matplotlib
itsdangerous==2.1.2
itsdangerous==2.2.0
# via flask
jinja2==3.1.3
# via
Expand Down Expand Up @@ -129,7 +129,7 @@ packaging==23.2
# matplotlib
# scikit-image
# visualdl
pandas==2.2.1
pandas==2.2.2
# via visualdl
pdf2image==1.17.0
# via unstructured-paddleocr
Expand Down Expand Up @@ -168,7 +168,7 @@ pytz==2024.1
# via
# flask-babel
# pandas
rapidfuzz==3.7.0
rapidfuzz==3.8.1
# via
# -c ./base.txt
# unstructured-paddleocr
Expand All @@ -188,7 +188,7 @@ scipy==1.10.1
# -c ././deps/constraints.txt
# imgaug
# scikit-image
shapely==2.0.3
shapely==2.0.4
# via
# imgaug
# unstructured-paddleocr
Expand All @@ -200,7 +200,7 @@ six==1.16.0
# imgaug
# python-dateutil
# visualdl
tifffile==2024.2.12
tifffile==2024.4.18
# via scikit-image
tqdm==4.66.2
# via
Expand Down
2 changes: 1 addition & 1 deletion requirements/extra-pdf-image.in
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ pillow_heif
pypdf
# Do not move to constraints.in, otherwise unstructured-inference will not be upgraded
# when unstructured library is.
unstructured-inference==0.7.25
unstructured-inference==0.7.27
# unstructured fork of pytesseract that provides an interface to allow for multiple output formats
# from one tesseract call
unstructured.pytesseract>=0.3.12
Loading

0 comments on commit cc8be15

Please sign in to comment.