Skip to content

Commit

Permalink
Merge branch 'main' into patch-1
Browse files Browse the repository at this point in the history
  • Loading branch information
MthwRobinson authored May 16, 2024
2 parents b91fdf3 + 8644a3b commit 818e57d
Show file tree
Hide file tree
Showing 35 changed files with 913 additions and 662 deletions.
1 change: 1 addition & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -497,5 +497,6 @@ jobs:
# Writes UNS_API_KEY into uns_test_env_file, downloads the packages for the
# Docker build (`make docker-dl-packages`), then runs `make docker-build` and
# `make docker-test`.
- name: Test Dockerfile
run: |
echo "UNS_API_KEY=${{ secrets.UNS_API_KEY }}" > uns_test_env_file
make docker-dl-packages
make docker-build
make docker-test CI=true UNSTRUCTURED_INCLUDE_DEBUG_METADATA=true
6 changes: 6 additions & 0 deletions .github/workflows/docker-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,19 @@ jobs:
password: ${{ secrets.QUAY_IO_ROBOT_TOKEN }}
- name: Build images
  run: |
    make docker-dl-packages
    # Second path component of the matrix platform string (text after the "/").
    ARCH=$(cut -d "/" -f2 <<< ${{ matrix.docker-platform }})
    # Persist ARCH for later steps: shell variables do not outlive this step.
    echo "ARCH=$ARCH" >> "$GITHUB_ENV"
    DOCKER_BUILDKIT=1 docker buildx build --platform=$ARCH --load \
      --build-arg PIP_VERSION=$PIP_VERSION \
      --build-arg BUILDKIT_INLINE_CACHE=1 \
      --progress plain \
      --cache-from $DOCKER_BUILD_REPOSITORY:$ARCH \
      -t $DOCKER_BUILD_REPOSITORY:$ARCH-$SHORT_SHA .
- name: Scan image
  uses: anchore/scan-action@v3
  with:
    # Action inputs are passed verbatim (no shell expansion), so the image
    # reference must be built with expressions rather than `$VAR`.
    # NOTE(review): assumes DOCKER_BUILD_REPOSITORY and SHORT_SHA are provided
    # through workflow/job `env:` or $GITHUB_ENV — confirm.
    image: "${{ env.DOCKER_BUILD_REPOSITORY }}:${{ env.ARCH }}-${{ env.SHORT_SHA }}"
    severity-cutoff: high
- name: Set up QEMU
uses: docker/setup-qemu-action@v2
- name: Test images
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -204,3 +204,6 @@ examples/**/output/

outputdiff.txt
metricsdiff.txt

# APK packages for the docker build
docker-packages/*
8 changes: 7 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
## 0.13.8-dev8
## 0.13.8-dev15

### Enhancements

* **Skip unnecessary element sorting in `partition_pdf()`**. Skip element sorting when determining whether embedded text can be extracted.
* **Faster evaluation** Support for concurrent processing of documents during evaluation
* **Add strategy parameter to `partition_docx()`.** Behavior of future enhancements may be sensitive to the partitioning strategy. Add this parameter so `partition_docx()` is aware of the requested strategy.

### Features
* **Add form extraction basics (document elements and placeholder code in partition)**. This is to lay the groundwork for the future. Form extraction models are not currently available in the library. An attempt to use this functionality will end in a `NotImplementedError`.

### Fixes

Expand All @@ -15,6 +18,9 @@
* **Fix type hint for paragraph_grouper param** `paragraph_grouper` can be set to `False`, but the type hint did not reflect this previously.
* **Remove links param from partition_pdf** `links` is extracted during partitioning and is not needed as a parameter in partition_pdf.
* **Improve CSV delimiter detection.** `partition_csv()` would raise on CSV files with very long lines.
* **Fix disk-space leak in `partition_doc()`.** Remove temporary file created but not removed when `file` argument is passed to `partition_doc()`.
* **Fix possible `SyntaxError` or `SyntaxWarning` on regex patterns.** Change regex patterns to raw strings to avoid these warnings/errors in Python 3.11+.
* **Fix disk-space leak in `partition_odt()`.** Remove temporary file created but not removed when `file` argument is passed to `partition_odt()`.

## 0.13.7

Expand Down
82 changes: 48 additions & 34 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,41 +1,55 @@
# syntax=docker/dockerfile:experimental
FROM quay.io/unstructured-io/base-images:rocky9.2-9@sha256:73d8492452f086144d4b92b7931aa04719f085c74d16cae81e8826ef873729c9 as base
# NOTE(review): `latest` is a floating tag, so a rebuild can pull a different
# base image; consider pinning a version tag or digest.
FROM cgr.dev/chainguard/wolfi-base:latest

# NOTE(crag): NB_USER ARG for mybinder.org compat:
# https://mybinder.readthedocs.io/en/latest/tutorials/dockerfile.html
ARG NB_USER=notebook-user
ARG NB_UID=1000
ARG PIP_VERSION
WORKDIR /app

# Set up environment
ENV HOME /home/${NB_USER}
ENV PYTHONPATH="${PYTHONPATH}:${HOME}"
ENV PATH="/home/usr/.local/bin:${PATH}"
USER root

RUN groupadd --gid ${NB_UID} ${NB_USER}
RUN useradd --uid ${NB_UID} --gid ${NB_UID} ${NB_USER}
WORKDIR ${HOME}

FROM base as deps
# Copy and install Unstructured
COPY requirements requirements

RUN python3.10 -m pip install pip==${PIP_VERSION} && \
dnf -y groupinstall "Development Tools" && \
find requirements/ -type f -name "*.txt" -exec python3 -m pip install --no-cache -r '{}' ';' && \
dnf -y groupremove "Development Tools" && \
dnf clean all

RUN python3.10 -c "import nltk; nltk.download('punkt')" && \
python3.10 -c "import nltk; nltk.download('averaged_perceptron_tagger')"

FROM deps as code

USER ${NB_USER}

COPY example-docs example-docs
# Copy the build inputs into the image: the .apk files from docker-packages/,
# the requirements/*.txt files, the library source, its tests, and the example
# documents.
COPY ./docker-packages/*.apk packages/
COPY ./requirements/*.txt requirements/
COPY unstructured unstructured
COPY test_unstructured test_unstructured
COPY example-docs example-docs

RUN python3.10 -c "from unstructured.partition.model_init import initialize; initialize()"
# Install system dependencies.
# - Repository packages are installed with `--no-cache`, which fetches the
#   package index on the fly and keeps it out of the image layer (this replaces
#   the separate `apk update`).
# - The .apk files under packages/ are installed from local paths with
#   `--allow-untrusted`.
# - The tessdata config directories are moved under /usr/local/share/tessdata,
#   and `libreoffice` / `soffice` launchers are symlinked to soffice.bin in
#   /usr/local/bin.
RUN apk add --no-cache \
      bash \
      cmake \
      glib \
      libmagic \
      mesa-gl \
      py3.11-pip && \
    apk add --no-cache --allow-untrusted \
      packages/pandoc-3.1.8-r0.apk \
      packages/poppler-23.09.0-r0.apk \
      packages/leptonica-1.83.0-r0.apk \
      packages/tesseract-5.3.2-r0.apk \
      packages/libreoffice-7.6.5-r0.apk && \
    mv /share/tessdata/configs /usr/local/share/tessdata/ && \
    mv /share/tessdata/tessconfigs /usr/local/share/tessdata/ && \
    ln -s /usr/local/lib/libreoffice/program/soffice.bin /usr/local/bin/libreoffice && \
    ln -s /usr/local/lib/libreoffice/program/soffice.bin /usr/local/bin/soffice && \
    chmod +x /usr/local/lib/libreoffice/program/soffice.bin && \
    chmod +x /usr/local/bin/libreoffice && \
    chmod +x /usr/local/bin/soffice

RUN chown -R nonroot:nonroot /app

USER nonroot

# Install the Python dependencies as user-level (`--user`) packages, one
# requirements file at a time. Every invocation passes `--no-cache-dir` so
# pip's download cache is not stored in the image layer; this now includes the
# final `unstructured.paddlepaddle` install, which previously omitted both flags.
RUN pip3.11 install --no-cache-dir --user -r requirements/base.txt && \
    pip3.11 install --no-cache-dir --user -r requirements/test.txt && \
    pip3.11 install --no-cache-dir --user -r requirements/extra-csv.txt && \
    pip3.11 install --no-cache-dir --user -r requirements/extra-docx.txt && \
    pip3.11 install --no-cache-dir --user -r requirements/extra-epub.txt && \
    pip3.11 install --no-cache-dir --user -r requirements/extra-markdown.txt && \
    pip3.11 install --no-cache-dir --user -r requirements/extra-msg.txt && \
    pip3.11 install --no-cache-dir --user -r requirements/extra-odt.txt && \
    pip3.11 install --no-cache-dir --user -r requirements/extra-pdf-image.txt && \
    pip3.11 install --no-cache-dir --user -r requirements/extra-pptx.txt && \
    pip3.11 install --no-cache-dir --user -r requirements/extra-xlsx.txt && \
    pip3.11 install --no-cache-dir --user -r requirements/huggingface.txt && \
    pip3.11 install --no-cache-dir --user unstructured.paddlepaddle

# Runs at build time: downloads the NLTK `punkt` and
# `averaged_perceptron_tagger` data, calls `initialize()` from
# unstructured.partition.model_init, and initializes the table-transformer
# model `microsoft/table-transformer-structure-recognition`.
# NOTE(review): presumably done so these assets are already in the image
# instead of being fetched on first use — confirm.
RUN python3.11 -c "import nltk; nltk.download('punkt')" && \
python3.11 -c "import nltk; nltk.download('averaged_perceptron_tagger')" && \
python3.11 -c "from unstructured.partition.model_init import initialize; initialize()" && \
python3.11 -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; model = UnstructuredTableTransformerModel(); model.initialize('microsoft/table-transformer-structure-recognition')"

# Runtime environment: append /home/nonroot/.local/bin to PATH and point
# TESSDATA_PREFIX at /usr/local/share/tessdata.
ENV PATH="${PATH}:/home/nonroot/.local/bin" \
    TESSDATA_PREFIX=/usr/local/share/tessdata

# Default command: start a bash shell.
CMD ["/bin/bash"]
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -462,6 +462,10 @@ DOCKER_IMAGE ?= unstructured:dev
docker-build:
PIP_VERSION=${PIP_VERSION} DOCKER_IMAGE_NAME=${DOCKER_IMAGE} ./scripts/docker-build.sh

# Runs scripts/docker-dl-packages.sh to download the packages for the Docker
# build; the leading `@` keeps make from echoing the command.
.PHONY: docker-dl-packages
docker-dl-packages:
@scripts/docker-dl-packages.sh

.PHONY: docker-start-bash
docker-start-bash:
docker run -ti --rm ${DOCKER_IMAGE}
Expand Down
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ As the Chipper model is in beta version, we welcome feedback and suggestions. Fo
## :eight_pointed_black_star: Quick Start

There are several ways to use the `unstructured` library:
* [Run the library in a container](https://github.com/Unstructured-IO/unstructured#using-the-library-in-a-container) or
* [Run the library in a container](https://github.com/Unstructured-IO/unstructured#run-the-library-in-a-container) or
* Install the library
1. [Install from PyPI](https://github.com/Unstructured-IO/unstructured#installing-the-library)
2. [Install for local development](https://github.com/Unstructured-IO/unstructured#installation-instructions-for-local-development)
Expand Down Expand Up @@ -85,7 +85,9 @@ docker run -dt --name unstructured downloads.unstructured.io/unstructured-io/uns
docker exec -it unstructured bash
```

You can also build your own Docker image.
You can also build your own Docker image. Note that the base image is `wolfi-base`, which is
updated regularly. If you are building the image locally, it is possible `docker-build` could
fail due to upstream changes in `wolfi-base`.

If you only plan on parsing one type of data you can speed up building the image by commenting out some
of the packages/requirements necessary for other data types. See Dockerfile to know which lines are necessary
Expand Down
Binary file added example-docs/simple.doc
Binary file not shown.
Binary file added example-docs/simple.docx
Binary file not shown.
Binary file added example-docs/simple.odt
Binary file not shown.
149 changes: 149 additions & 0 deletions example-docs/test_evaluate_files/unstructured_output/form.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
[
{
"type": "FormKeysValues",
"element_id": "MOCK_FORM_ID",
"text": "",
"metadata": {
"coordinates": {
"points": [
[
35.15625,
95.556640625
],
[
710.357666015625,
95.556640625
],
[
710.357666015625,
887.890625
],
[
35.15625,
887.890625
]
],
"system": "PixelSpace",
"layout_width": 754,
"layout_height": 1000
},
"page_number": 1,
"key_value_pairs": [
{
"key": {
"text": "MOCK KEY",
"custom_element": {
"type": "UncategorizedText",
"element_id": "MOCK_KEY_ID_1",
"text": "MOCK KEY",
"metadata": {
"coordinates": {
"points": [
[
503.271484375,
96.3897705078125
],
[
503.271484375,
107.5164794921875
],
[
606.103515625,
107.5164794921875
],
[
606.103515625,
96.3897705078125
]
],
"system": "PixelSpace",
"layout_width": 754,
"layout_height": 1000
},
"page_number": 1
}
},
"layout_element_id": null
},
"value": {
"text": "MOCK VALUE",
"custom_element": {
"type": "UncategorizedText",
"element_id": "MOCK_VALUE_ID",
"text": "MOCK VALUE",
"metadata": {
"coordinates": {
"points": [
[
557.568359375,
124.8626708984375
],
[
557.568359375,
136.6607666015625
],
[
595.556640625,
136.6607666015625
],
[
595.556640625,
124.8626708984375
]
],
"system": "PixelSpace",
"layout_width": 754,
"layout_height": 1000
},
"page_number": 1
}
},
"layout_element_id": null
},
"confidence": 0.0
},
{
"key": {
"text": "MOCK KEY 2",
"custom_element": {
"type": "UncategorizedText",
"element_id": "MOCK_KEY_ID_2",
"text": "MOCK KEY 2",
"metadata": {
"coordinates": {
"points": [
[
428.52783203125,
124.0478515625
],
[
428.52783203125,
136.6943359375
],
[
473.81591796875,
136.6943359375
],
[
473.81591796875,
124.0478515625
]
],
"system": "PixelSpace",
"layout_width": 754,
"layout_height": 1000
},
"page_number": 1
}
},
"layout_element_id": null
},
"value": null,
"confidence": 0.0
}
],
"file_directory": "dataset/testing_data/images",
"filename": "MOCK.png"
}
}
]
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ lint.select = [
"UP018", # -- Unnecessary {literal_type} call like `str("abc")`. (rewrite as a literal) --
"UP032", # -- Use f-string instead of `.format()` call --
"UP034", # -- Avoid extraneous parentheses --
"W", # -- Warnings, including invalid escape-sequence --
]
lint.ignore = [
"COM812", # -- over aggressively insists on trailing commas where not desirable --
Expand Down
1 change: 1 addition & 0 deletions scripts/docker-build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ DOCKER_BUILD_CMD=(docker buildx build --load -f Dockerfile
--build-arg PIP_VERSION="$PIP_VERSION"
--build-arg BUILDKIT_INLINE_CACHE=1
--progress plain
--platform linux/amd64
--cache-from "$DOCKER_REPOSITORY":latest
-t "$DOCKER_IMAGE" .)

Expand Down
22 changes: 22 additions & 0 deletions scripts/docker-dl-packages.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash
# Download the package files listed below into the docker-packages/ directory
# (relative to the current working directory).
#
# The script stops with a non-zero exit status as soon as any command fails,
# so a failed download can no longer be followed by "Downloads complete.".

set -euo pipefail

files=(
  "libreoffice-7.6.5-r0.apk"
  "openjpeg-2.5.0-r0.apk"
  "poppler-23.09.0-r0.apk"
  "leptonica-1.83.0-r0.apk"
  "pandoc-3.1.8-r0.apk"
  "tesseract-5.3.2-r0.apk"
  "nltk_data.tgz"
)

directory="docker-packages"
mkdir -p "${directory}"

for file in "${files[@]}"; do
  echo "Downloading ${file}"
  # -P saves the file under ${directory}, keeping its remote name.
  wget "https://utic-public-cf.s3.amazonaws.com/$file" -P "$directory"
done

echo "Downloads complete."
Loading

0 comments on commit 818e57d

Please sign in to comment.