-
Notifications
You must be signed in to change notification settings - Fork 818
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
35 changed files
with
913 additions
and
662 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -204,3 +204,6 @@ examples/**/output/ | |
|
||
outputdiff.txt | ||
metricsdiff.txt | ||
|
||
# APK packages for the docker build | ||
docker-packages/* |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,41 +1,55 @@ | ||
# syntax=docker/dockerfile:experimental | ||
FROM quay.io/unstructured-io/base-images:rocky9.2-9@sha256:73d8492452f086144d4b92b7931aa04719f085c74d16cae81e8826ef873729c9 as base | ||
FROM cgr.dev/chainguard/wolfi-base:latest | ||
|
||
# NOTE(crag): NB_USER ARG for mybinder.org compat: | ||
# https://mybinder.readthedocs.io/en/latest/tutorials/dockerfile.html | ||
ARG NB_USER=notebook-user | ||
ARG NB_UID=1000 | ||
ARG PIP_VERSION | ||
WORKDIR /app | ||
|
||
# Set up environment | ||
ENV HOME /home/${NB_USER} | ||
ENV PYTHONPATH="${PYTHONPATH}:${HOME}" | ||
ENV PATH="/home/usr/.local/bin:${PATH}" | ||
USER root | ||
|
||
RUN groupadd --gid ${NB_UID} ${NB_USER} | ||
RUN useradd --uid ${NB_UID} --gid ${NB_UID} ${NB_USER} | ||
WORKDIR ${HOME} | ||
|
||
FROM base as deps | ||
# Copy and install Unstructured | ||
COPY requirements requirements | ||
|
||
RUN python3.10 -m pip install pip==${PIP_VERSION} && \ | ||
dnf -y groupinstall "Development Tools" && \ | ||
find requirements/ -type f -name "*.txt" -exec python3 -m pip install --no-cache -r '{}' ';' && \ | ||
dnf -y groupremove "Development Tools" && \ | ||
dnf clean all | ||
|
||
RUN python3.10 -c "import nltk; nltk.download('punkt')" && \ | ||
python3.10 -c "import nltk; nltk.download('averaged_perceptron_tagger')" | ||
|
||
FROM deps as code | ||
|
||
USER ${NB_USER} | ||
|
||
COPY example-docs example-docs | ||
COPY ./docker-packages/*.apk packages/ | ||
COPY ./requirements/*.txt requirements/ | ||
COPY unstructured unstructured | ||
COPY test_unstructured test_unstructured | ||
COPY example-docs example-docs | ||
|
||
RUN python3.10 -c "from unstructured.partition.model_init import initialize; initialize()" | ||
# Install OS-level dependencies, then hand /app over to the unprivileged user.
#
# * Repository packages are installed with --no-cache so that no APK index is
#   left behind in the image layer (replaces the former `apk update`).
# * The .apk files copied into packages/ are installed with --allow-untrusted,
#   i.e. their signatures are not checked against the image's trusted keys.
#   They are listed in the same order the original one-per-command chain used.
# * The tessdata `configs` and `tessconfigs` directories are moved from
#   /share/tessdata to /usr/local/share/tessdata.
# * `libreoffice` and `soffice` are symlinks to soffice.bin, so a single
#   chmod on the target is enough; chmod on a symlink only affects its target.
RUN apk add --no-cache \
      bash \
      cmake \
      glib \
      libmagic \
      mesa-gl \
      py3.11-pip && \
    apk add --no-cache --allow-untrusted \
      packages/pandoc-3.1.8-r0.apk \
      packages/poppler-23.09.0-r0.apk \
      packages/leptonica-1.83.0-r0.apk \
      packages/tesseract-5.3.2-r0.apk \
      packages/libreoffice-7.6.5-r0.apk && \
    mv /share/tessdata/configs /usr/local/share/tessdata/ && \
    mv /share/tessdata/tessconfigs /usr/local/share/tessdata/ && \
    chmod +x /usr/local/lib/libreoffice/program/soffice.bin && \
    ln -s /usr/local/lib/libreoffice/program/soffice.bin /usr/local/bin/libreoffice && \
    ln -s /usr/local/lib/libreoffice/program/soffice.bin /usr/local/bin/soffice && \
    chown -R nonroot:nonroot /app

# Everything below runs without root privileges.
USER nonroot
|
||
# Install the Python requirement sets into the nonroot user's site-packages.
# The files are installed one at a time, in the original order, so each
# requirements file is resolved by its own pip invocation exactly as before.
# `|| exit 1` makes the build fail on the first requirements file that cannot
# be installed (a bare `for` loop would otherwise only report the last status).
# The final paddlepaddle install now carries the same --no-cache-dir --user
# flags as the others so no pip cache ends up in the layer.
RUN for req in \
      base \
      test \
      extra-csv \
      extra-docx \
      extra-epub \
      extra-markdown \
      extra-msg \
      extra-odt \
      extra-pdf-image \
      extra-pptx \
      extra-xlsx \
      huggingface \
    ; do \
      pip3.11 install --no-cache-dir --user -r "requirements/${req}.txt" || exit 1; \
    done && \
    pip3.11 install --no-cache-dir --user unstructured.paddlepaddle
|
||
# Run the NLTK downloads and the unstructured / table-transformer initializers
# at build time.
# NOTE(review): presumably this caches the data and model weights inside the
# image so containers do not fetch them on first use - confirm.
RUN python3.11 -c "import nltk; nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')" && \
    python3.11 -c "from unstructured.partition.model_init import initialize; initialize()" && \
    python3.11 -c "from unstructured_inference.models.tables import UnstructuredTableTransformerModel; UnstructuredTableTransformerModel().initialize('microsoft/table-transformer-structure-recognition')"
|
||
# Runtime environment:
#   PATH            - appends the nonroot user's ~/.local/bin directory.
#   TESSDATA_PREFIX - set to /usr/local/share/tessdata.
ENV PATH="${PATH}:/home/nonroot/.local/bin" \
    TESSDATA_PREFIX=/usr/local/share/tessdata

# Default to an interactive shell.
CMD ["/bin/bash"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Binary file not shown.
Binary file not shown.
149 changes: 149 additions & 0 deletions
149
example-docs/test_evaluate_files/unstructured_output/form.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
[ | ||
{ | ||
"type": "FormKeysValues", | ||
"element_id": "MOCK_FORM_ID", | ||
"text": "", | ||
"metadata": { | ||
"coordinates": { | ||
"points": [ | ||
[ | ||
35.15625, | ||
95.556640625 | ||
], | ||
[ | ||
710.357666015625, | ||
95.556640625 | ||
], | ||
[ | ||
710.357666015625, | ||
887.890625 | ||
], | ||
[ | ||
35.15625, | ||
887.890625 | ||
] | ||
], | ||
"system": "PixelSpace", | ||
"layout_width": 754, | ||
"layout_height": 1000 | ||
}, | ||
"page_number": 1, | ||
"key_value_pairs": [ | ||
{ | ||
"key": { | ||
"text": "MOCK KEY", | ||
"custom_element": { | ||
"type": "UncategorizedText", | ||
"element_id": "MOCK_KEY_ID_1", | ||
"text": "MOCK KEY", | ||
"metadata": { | ||
"coordinates": { | ||
"points": [ | ||
[ | ||
503.271484375, | ||
96.3897705078125 | ||
], | ||
[ | ||
503.271484375, | ||
107.5164794921875 | ||
], | ||
[ | ||
606.103515625, | ||
107.5164794921875 | ||
], | ||
[ | ||
606.103515625, | ||
96.3897705078125 | ||
] | ||
], | ||
"system": "PixelSpace", | ||
"layout_width": 754, | ||
"layout_height": 1000 | ||
}, | ||
"page_number": 1 | ||
} | ||
}, | ||
"layout_element_id": null | ||
}, | ||
"value": { | ||
"text": "MOCK VALUE", | ||
"custom_element": { | ||
"type": "UncategorizedText", | ||
"element_id": "MOCK_VALUE_ID", | ||
"text": "MOCK VALUE", | ||
"metadata": { | ||
"coordinates": { | ||
"points": [ | ||
[ | ||
557.568359375, | ||
124.8626708984375 | ||
], | ||
[ | ||
557.568359375, | ||
136.6607666015625 | ||
], | ||
[ | ||
595.556640625, | ||
136.6607666015625 | ||
], | ||
[ | ||
595.556640625, | ||
124.8626708984375 | ||
] | ||
], | ||
"system": "PixelSpace", | ||
"layout_width": 754, | ||
"layout_height": 1000 | ||
}, | ||
"page_number": 1 | ||
} | ||
}, | ||
"layout_element_id": null | ||
}, | ||
"confidence": 0.0 | ||
}, | ||
{ | ||
"key": { | ||
"text": "MOCK KEY 2", | ||
"custom_element": { | ||
"type": "UncategorizedText", | ||
"element_id": "MOCK_KEY_ID_2", | ||
"text": "MOCK KEY 2", | ||
"metadata": { | ||
"coordinates": { | ||
"points": [ | ||
[ | ||
428.52783203125, | ||
124.0478515625 | ||
], | ||
[ | ||
428.52783203125, | ||
136.6943359375 | ||
], | ||
[ | ||
473.81591796875, | ||
136.6943359375 | ||
], | ||
[ | ||
473.81591796875, | ||
124.0478515625 | ||
] | ||
], | ||
"system": "PixelSpace", | ||
"layout_width": 754, | ||
"layout_height": 1000 | ||
}, | ||
"page_number": 1 | ||
} | ||
}, | ||
"layout_element_id": null | ||
}, | ||
"value": null, | ||
"confidence": 0.0 | ||
} | ||
], | ||
"file_directory": "dataset/testing_data/images", | ||
"filename": "MOCK.png" | ||
} | ||
} | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
#!/bin/bash
#
# Download the pre-built package archives listed in `files` into the
# docker-packages/ directory.
#
# The script aborts on the first failed download instead of reporting success,
# and writes each file to a fixed path so re-running it overwrites earlier
# copies rather than creating "<name>.1" duplicates.

set -euo pipefail

base_url="https://utic-public-cf.s3.amazonaws.com"
directory="docker-packages"

files=(
  "libreoffice-7.6.5-r0.apk"
  "openjpeg-2.5.0-r0.apk"
  "poppler-23.09.0-r0.apk"
  "leptonica-1.83.0-r0.apk"
  "pandoc-3.1.8-r0.apk"
  "tesseract-5.3.2-r0.apk"
  "nltk_data.tgz"
)

mkdir -p "${directory}"

for file in "${files[@]}"; do
  echo "Downloading ${file}"
  target="${directory}/${file}"
  if ! wget "${base_url}/${file}" -O "${target}"; then
    # wget -O creates the output file even when the transfer fails; remove the
    # partial file so a later run or build never picks up a truncated package.
    rm -f "${target}"
    echo "Failed to download ${file}" >&2
    exit 1
  fi
done

echo "Downloads complete."
Oops, something went wrong.