Skip to content

Commit

Permalink
Merge branch 'main' into feat/using-nltk-assets-from-docker-image
Browse files Browse the repository at this point in the history
  • Loading branch information
christinestraub authored Dec 23, 2024
2 parents 9e76d49 + 0245661 commit ea158a6
Show file tree
Hide file tree
Showing 22 changed files with 676 additions and 113 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
### Enhancements

- **Prepare auto-partitioning for pluggable partitioners**. Move toward a uniform partitioner call signature so a custom or override partitioner can be registered without code changes.
- **Add NDJSON file type support.**

### Features

### Fixes

- Base image has been updated; trigger new workflows.
- **Upgrade ruff to latest.** Previously the ruff version was pinned to <0.5. Remove that pin and fix the handful of lint items that resulted.
- **CSV with asserted XLS content-type is correctly identified as CSV.** Resolves a bug where a CSV file with an asserted content-type of `application/vnd.ms-excel` was incorrectly identified as an XLS file.
- **Improve element-type mapping for Chinese text.** Fixes bug where Chinese text would produce large numbers of false-positive `Title` elements.
Expand Down
8 changes: 5 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,11 @@ COPY test_unstructured test_unstructured
COPY example-docs example-docs

# System-level setup, performed before the USER switch below:
#   1. give notebook-user ownership of everything under /app;
#   2. install the Ubuntu font family and git, then rebuild the font cache;
#   3. make /usr/bin/python3 resolve to /usr/bin/python3.11. The readlink
#      comparison (rather than a plain existence test) also catches a
#      python3 that exists but points at a different interpreter, and
#      `ln -sf` replaces such a link in place.
# --no-cache keeps the apk package index out of the image layer.
RUN chown -R notebook-user:notebook-user /app && \
    apk add --no-cache font-ubuntu git && \
    fc-cache -fv && \
    if [ "$(readlink -f /usr/bin/python3)" != "/usr/bin/python3.11" ]; then \
      ln -sf /usr/bin/python3.11 /usr/bin/python3; \
    fi

# All subsequent instructions run as notebook-user.
USER notebook-user

Expand Down
8 changes: 8 additions & 0 deletions example-docs/simple.ndjson
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{"element_id": "a06d2d9e65212d4aa955c3ab32950ffa", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51"}, "text": "These are a few of my favorite things:", "type": "Title"}
{"element_id": "b334c93e9b1cbca3b6f6d78ce8bc2484", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51", "parent_id": "a06d2d9e65212d4aa955c3ab32950ffa"}, "text": "Parrots", "type": "ListItem"}
{"element_id": "76469ecb9f1459943c8d8cca1a550b5a", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51", "parent_id": "a06d2d9e65212d4aa955c3ab32950ffa"}, "text": "Hockey", "type": "ListItem"}
{"element_id": "261fac731945a138415adc2dd4434b17", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51"}, "text": "Analysis", "type": "Title"}
{"element_id": "95f392d32c5271bfdb30eaef45921e59", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51", "parent_id": "261fac731945a138415adc2dd4434b17"}, "text": "This is my first thought. This is my second thought.", "type": "NarrativeText"}
{"element_id": "0de25bd6f0d74bc4f909f2678f385736", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51", "parent_id": "261fac731945a138415adc2dd4434b17"}, "text": "This is my third thought.", "type": "NarrativeText"}
{"element_id": "f296a3bc8a901f19199fda1da92829b6", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51", "parent_id": "261fac731945a138415adc2dd4434b17"}, "text": "2023", "type": "UncategorizedText"}
{"element_id": "78c62edbc674fdca0f6a0e3ffb459f86", "metadata": {"category_depth": 0, "file_directory": "unstructured/example-docs", "filename": "simple.docx", "filetype": "application/vnd.openxmlformats-officedocument.wordprocessingml.document", "languages": ["eng"], "last_modified": "2024-07-06T16:44:51"}, "text": "DOYLESTOWN, PA 18901", "type": "Address"}
35 changes: 35 additions & 0 deletions example-docs/spring-weather.html.ndjson

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions requirements/base.in
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,4 @@ tqdm
psutil
python-oxmsg
html5lib
ndjson
36 changes: 18 additions & 18 deletions requirements/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
#
# pip-compile ./base.in
#
anyio==4.6.2.post1
anyio==4.7.0
# via httpx
backoff==2.2.1
# via -r ./base.in
beautifulsoup4==4.12.3
# via -r ./base.in
certifi==2024.8.30
certifi==2024.12.14
# via
# httpcore
# httpx
Expand All @@ -28,13 +28,13 @@ click==8.1.7
# via
# nltk
# python-oxmsg
cryptography==43.0.3
cryptography==44.0.0
# via unstructured-client
dataclasses-json==0.6.7
# via
# -r ./base.in
# unstructured-client
deepdiff==8.0.1
deepdiff==8.1.1
# via unstructured-client
emoji==2.14.0
# via -r ./base.in
Expand All @@ -46,9 +46,9 @@ h11==0.14.0
# via httpcore
html5lib==1.1
# via -r ./base.in
httpcore==1.0.6
httpcore==1.0.7
# via httpx
httpx==0.27.2
httpx==0.28.1
# via unstructured-client
idna==3.10
# via
Expand All @@ -64,14 +64,16 @@ langdetect==1.0.9
# via -r ./base.in
lxml==5.3.0
# via -r ./base.in
marshmallow==3.23.0
marshmallow==3.23.1
# via
# dataclasses-json
# unstructured-client
mypy-extensions==1.0.0
# via
# typing-inspect
# unstructured-client
ndjson==0.3.1
# via -r ./base.in
nest-asyncio==1.6.0
# via unstructured-client
nltk==3.9.1
Expand All @@ -80,17 +82,17 @@ numpy==1.26.4
# via -r ./base.in
olefile==0.47
# via python-oxmsg
orderly-set==5.2.2
orderly-set==5.2.3
# via deepdiff
packaging==24.1
packaging==24.2
# via
# marshmallow
# unstructured-client
psutil==6.1.0
# via -r ./base.in
pycparser==2.22
# via cffi
pypdf==5.0.1
pypdf==5.1.0
# via unstructured-client
python-dateutil==2.9.0.post0
# via unstructured-client
Expand All @@ -100,9 +102,9 @@ python-magic==0.4.27
# via -r ./base.in
python-oxmsg==0.0.1
# via -r ./base.in
rapidfuzz==3.10.1
rapidfuzz==3.11.0
# via -r ./base.in
regex==2024.9.11
regex==2024.11.6
# via nltk
requests==2.32.3
# via
Expand All @@ -111,19 +113,17 @@ requests==2.32.3
# unstructured-client
requests-toolbelt==1.0.0
# via unstructured-client
six==1.16.0
six==1.17.0
# via
# html5lib
# langdetect
# python-dateutil
# unstructured-client
sniffio==1.3.1
# via
# anyio
# httpx
# via anyio
soupsieve==2.6
# via beautifulsoup4
tqdm==4.66.5
tqdm==4.67.1
# via
# -r ./base.in
# nltk
Expand All @@ -150,5 +150,5 @@ urllib3==1.26.20
# unstructured-client
webencodings==0.5.1
# via html5lib
wrapt==1.16.0
wrapt==1.17.0
# via -r ./base.in
12 changes: 6 additions & 6 deletions requirements/dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@ distlib==0.3.9
# via virtualenv
filelock==3.16.1
# via virtualenv
identify==2.6.1
identify==2.6.3
# via pre-commit
importlib-metadata==8.5.0
# via
# -c ././deps/constraints.txt
# build
nodeenv==1.9.1
# via pre-commit
packaging==24.1
packaging==24.2
# via
# -c ./base.txt
# -c ./test.txt
Expand All @@ -46,16 +46,16 @@ pyyaml==6.0.2
# via
# -c ./test.txt
# pre-commit
tomli==2.0.2
tomli==2.2.1
# via
# -c ./test.txt
# build
# pip-tools
virtualenv==20.27.0
virtualenv==20.28.0
# via pre-commit
wheel==0.44.0
wheel==0.45.1
# via pip-tools
zipp==3.20.2
zipp==3.21.0
# via importlib-metadata

# The following packages are considered to be unsafe in a requirements file:
Expand Down
2 changes: 1 addition & 1 deletion requirements/extra-csv.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ python-dateutil==2.9.0.post0
# pandas
pytz==2024.2
# via pandas
six==1.16.0
six==1.17.0
# via
# -c ./base.txt
# python-dateutil
Expand Down
2 changes: 1 addition & 1 deletion requirements/extra-markdown.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,5 @@ importlib-metadata==8.5.0
# markdown
markdown==3.7
# via -r ./extra-markdown.in
zipp==3.20.2
zipp==3.21.0
# via importlib-metadata
25 changes: 12 additions & 13 deletions requirements/extra-paddleocr.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
#
# pip-compile ./extra-paddleocr.in
#
anyio==4.6.2.post1
anyio==4.7.0
# via
# -c ./base.txt
# httpx
astor==0.8.1
# via paddlepaddle
certifi==2024.8.30
certifi==2024.12.14
# via
# -c ./base.txt
# httpcore
Expand All @@ -32,17 +32,17 @@ exceptiongroup==1.2.2
# via
# -c ./base.txt
# anyio
fonttools==4.54.1
fonttools==4.55.3
# via matplotlib
h11==0.14.0
# via
# -c ./base.txt
# httpcore
httpcore==1.0.6
httpcore==1.0.7
# via
# -c ./base.txt
# httpx
httpx==0.27.2
httpx==0.28.1
# via
# -c ./base.txt
# paddlepaddle
Expand All @@ -52,7 +52,7 @@ idna==3.10
# anyio
# httpx
# requests
imageio==2.36.0
imageio==2.36.1
# via
# imgaug
# scikit-image
Expand All @@ -64,7 +64,7 @@ kiwisolver==1.4.7
# via matplotlib
lazy-loader==0.4
# via scikit-image
matplotlib==3.9.2
matplotlib==3.9.4
# via imgaug
networkx==3.2.1
# via
Expand Down Expand Up @@ -94,7 +94,7 @@ opencv-python==4.10.0.84
# unstructured-paddleocr
opt-einsum==3.3.0
# via paddlepaddle
packaging==24.1
packaging==24.2
# via
# -c ./base.txt
# lazy-loader
Expand Down Expand Up @@ -127,7 +127,7 @@ python-dateutil==2.9.0.post0
# matplotlib
pyyaml==6.0.2
# via unstructured-paddleocr
rapidfuzz==3.10.1
rapidfuzz==3.11.0
# via
# -c ./base.txt
# unstructured-paddleocr
Expand All @@ -147,7 +147,7 @@ shapely==2.0.6
# via
# imgaug
# unstructured-paddleocr
six==1.16.0
six==1.17.0
# via
# -c ./base.txt
# imgaug
Expand All @@ -156,10 +156,9 @@ sniffio==1.3.1
# via
# -c ./base.txt
# anyio
# httpx
tifffile==2024.8.30
# via scikit-image
tqdm==4.66.5
tqdm==4.67.1
# via
# -c ./base.txt
# unstructured-paddleocr
Expand All @@ -175,5 +174,5 @@ urllib3==1.26.20
# -c ././deps/constraints.txt
# -c ./base.txt
# requests
zipp==3.20.2
zipp==3.21.0
# via importlib-resources
Loading

0 comments on commit ea158a6

Please sign in to comment.