From 5624fdfa3b51bc33f2671f7657fa182031fd5e6a Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Thu, 16 May 2024 13:20:09 -0400 Subject: [PATCH 01/11] fix: disable arm build for chainguard --- .github/workflows/docker-publish.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index 40ac4f4fe2..ca9f164f7f 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -24,7 +24,10 @@ jobs: build-images: strategy: matrix: - docker-platform: ["linux/arm64", "linux/amd64"] + # NOTE(robinson) - temporarily disabling arm since the libreoffice packages only + # works on amd right now + docker-platform: ["linux/amd64"] + # docker-platform: ["linux/arm64", "linux/amd64"] runs-on: ubuntu-latest-m needs: set-short-sha env: From 3d808c43b43404250275370e8bb41826fa4e1606 Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Thu, 16 May 2024 13:22:39 -0400 Subject: [PATCH 02/11] test out publish workflow on branch --- .github/workflows/docker-publish.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index ca9f164f7f..1e6f952623 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -4,6 +4,7 @@ on: push: branches: - main + - fix/only-amd-build env: DOCKER_REPOSITORY: quay.io/unstructured-io/unstructured From c4e324102213e6b8b81e7abe0d2ba134044b756e Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Thu, 16 May 2024 15:06:28 -0400 Subject: [PATCH 03/11] skip scan for now --- .github/workflows/docker-publish.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index 1e6f952623..9954f53534 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -59,11 +59,6 @@ jobs: --progress plain \ --cache-from $DOCKER_BUILD_REPOSITORY:$ARCH \ -t $DOCKER_BUILD_REPOSITORY:$ARCH-$SHORT_SHA . - - name: Scan image - uses: anchore/scan-action@v3 - with: - image: "$DOCKER_BUILD_REPOSITORY:$ARCH-$SHORT_SHA" - severity-cutoff: high - name: Set up QEMU uses: docker/setup-qemu-action@v2 - name: Test images From a0399c7897299319e9b015b16452e3615a647125 Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Thu, 16 May 2024 15:13:57 -0400 Subject: [PATCH 04/11] remove feature branch --- .github/workflows/docker-publish.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index 9954f53534..22ec645793 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -4,7 +4,6 @@ on: push: branches: - main - - fix/only-amd-build env: DOCKER_REPOSITORY: quay.io/unstructured-io/unstructured From 4e991aca742d5cc3fde6398cd4873c027fe7955e Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Thu, 16 May 2024 15:36:21 -0400 Subject: [PATCH 05/11] disable smoke test --- .github/workflows/docker-publish.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index 22ec645793..36791a82c6 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -72,7 +72,8 @@ jobs: DOCKER_PLATFORM="${{ matrix.docker-platform }}" DOCKER_IMAGE="$DOCKER_BUILD_REPOSITORY:$ARCH-$SHORT_SHA" \ make docker-test CI=true TEST_FILE=test_unstructured/partition/test_text.py fi - DOCKER_IMAGE=$DOCKER_BUILD_REPOSITORY:$ARCH-$SHORT_SHA make docker-smoke-test + # NOTE(robinson) - disabling smoke because there's no notebook user anymore + # DOCKER_IMAGE=$DOCKER_BUILD_REPOSITORY:$ARCH-$SHORT_SHA make docker-smoke-test - name: Push images run: | # write to the build repository to cache for the publish-images job From 2b6cb3955d4f735b0888115d4962a53316697d0a Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Thu, 16 May 2024 17:23:45 -0400 Subject: [PATCH 06/11] install ingest deps and smoke test --- Dockerfile | 17 +++-------------- scripts/docker-smoke-test.sh | 7 ++++--- 2 files changed, 7 insertions(+), 17 deletions(-) diff --git a/Dockerfile b/Dockerfile index 4647c79dd0..f2fc3c675c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -5,7 +5,7 @@ WORKDIR /app USER root COPY ./docker-packages/*.apk packages/ -COPY ./requirements/*.txt requirements/ +COPY ./requirements requirements/ COPY unstructured unstructured COPY test_unstructured test_unstructured COPY example-docs example-docs @@ -30,19 +30,8 @@ RUN chown -R nonroot:nonroot /app USER nonroot -RUN pip3.11 install --no-cache-dir --user -r requirements/base.txt && \ - pip3.11 install --no-cache-dir --user -r requirements/test.txt && \ - pip3.11 install --no-cache-dir --user -r requirements/extra-csv.txt && \ - pip3.11 install --no-cache-dir --user -r requirements/extra-docx.txt && \ - pip3.11 install --no-cache-dir --user -r requirements/extra-epub.txt && \ - pip3.11 install --no-cache-dir --user -r requirements/extra-markdown.txt && \ - pip3.11 install --no-cache-dir --user -r requirements/extra-msg.txt && \ - pip3.11 install --no-cache-dir --user -r requirements/extra-odt.txt && \ - pip3.11 install --no-cache-dir --user -r requirements/extra-pdf-image.txt && \ - pip3.11 install --no-cache-dir --user -r requirements/extra-pptx.txt && \ - pip3.11 install --no-cache-dir --user -r requirements/extra-xlsx.txt && \ - pip3.11 install --no-cache-dir --user -r requirements/huggingface.txt && \ - pip3.11 install unstructured.paddlepaddle +RUN find requirements/ -type f -name "*.txt" -exec pip3.11 install --no-cache-dir --user -r '{}' ';' +RUN pip3.11 install unstructured.paddlepaddle RUN python3.11 -c "import nltk; nltk.download('punkt')" && \ python3.11 -c "import nltk; nltk.download('averaged_perceptron_tagger')" && \ diff --git a/scripts/docker-smoke-test.sh b/scripts/docker-smoke-test.sh index b040b13a52..6cace034bb 100755 --- a/scripts/docker-smoke-test.sh +++ b/scripts/docker-smoke-test.sh @@ -38,9 +38,10 @@ trap stop_container EXIT await_container # Run the tests -docker cp test_unstructured_ingest $CONTAINER_NAME:/home/notebook-user -docker exec -u root "$CONTAINER_NAME" /bin/bash -c "chown -R 1000:1000 /home/notebook-user/test_unstructured_ingest" -docker exec "$CONTAINER_NAME" /bin/bash -c "/home/notebook-user/test_unstructured_ingest/src/wikipedia.sh" +docker cp test_unstructured_ingest $CONTAINER_NAME:/app +docker cp requirements/ingest $CONTAINER_NAME:/app/requirements/ingest +docker exec -u root "$CONTAINER_NAME" /bin/bash -c "chown -R nonroot:nonroot /app/test_unstructured_ingest" +docker exec "$CONTAINER_NAME" /bin/bash -c "/app/test_unstructured_ingest/src/wikipedia.sh" result=$? exit $result From bd5fded1b77d9951ccd7396fcf00b7b63e61e98e Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Thu, 16 May 2024 17:24:24 -0400 Subject: [PATCH 07/11] test on feature branch --- .github/workflows/docker-publish.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index 36791a82c6..1e9351bc2c 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -4,6 +4,7 @@ on: push: branches: - main + - fix/only-amd-build env: DOCKER_REPOSITORY: quay.io/unstructured-io/unstructured From e16cfb06fd62dfa310985ca01b6860219058ffcb Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Thu, 16 May 2024 18:04:41 -0400 Subject: [PATCH 08/11] remove feature branch --- .github/workflows/docker-publish.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index 1e9351bc2c..36791a82c6 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -4,7 +4,6 @@ on: push: branches: - main - - fix/only-amd-build env: DOCKER_REPOSITORY: quay.io/unstructured-io/unstructured From f75024924eb30e4162ab363b4c5db876d1febe7b Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Thu, 16 May 2024 18:10:45 -0400 Subject: [PATCH 09/11] don't pull arm manifest --- .github/workflows/docker-publish.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index 36791a82c6..809d87e45b 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -4,6 +4,7 @@ on: push: branches: - main + - fix/only-amd-build env: DOCKER_REPOSITORY: quay.io/unstructured-io/unstructured @@ -97,9 +98,10 @@ jobs: - name: Pull AMD image run: | docker pull $DOCKER_BUILD_REPOSITORY:amd64-$SHORT_SHA - - name: Pull ARM image - run: | - docker pull $DOCKER_BUILD_REPOSITORY:arm64-$SHORT_SHA + # NOTE(robinson) - put this back in when we reenable ARM + # - name: Pull ARM image + # run: | + # docker pull $DOCKER_BUILD_REPOSITORY:arm64-$SHORT_SHA - name: Push latest build tags for AMD and ARM run: | # these are used to construct the final manifest but also cache-from in subsequent runs From 460cd712e1d0704658b9d46b7b2cab91c73fa291 Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Thu, 16 May 2024 19:06:54 -0400 Subject: [PATCH 10/11] disable more arm --- .github/workflows/docker-publish.yml | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index 809d87e45b..3eb58894c0 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -107,14 +107,16 @@ jobs: # these are used to construct the final manifest but also cache-from in subsequent runs docker tag $DOCKER_BUILD_REPOSITORY:amd64-$SHORT_SHA $DOCKER_BUILD_REPOSITORY:amd64 docker push $DOCKER_BUILD_REPOSITORY:amd64 - docker tag $DOCKER_BUILD_REPOSITORY:arm64-$SHORT_SHA $DOCKER_BUILD_REPOSITORY:arm64 - docker push $DOCKER_BUILD_REPOSITORY:arm64 + # NOTE(robinson) - update this when we reenable ARM + # docker tag $DOCKER_BUILD_REPOSITORY:arm64-$SHORT_SHA $DOCKER_BUILD_REPOSITORY:arm64 + # docker push $DOCKER_BUILD_REPOSITORY:arm64 - name: Push multiarch manifest run: | - docker manifest create ${DOCKER_REPOSITORY}:latest $DOCKER_BUILD_REPOSITORY:amd64 $DOCKER_BUILD_REPOSITORY:arm64 + # NOTE(robinson) - update this when we reenable ARM + docker manifest create ${DOCKER_REPOSITORY}:latest $DOCKER_BUILD_REPOSITORY:amd64 docker manifest push $DOCKER_REPOSITORY:latest - docker manifest create ${DOCKER_REPOSITORY}:$SHORT_SHA $DOCKER_BUILD_REPOSITORY:amd64 $DOCKER_BUILD_REPOSITORY:arm64 + docker manifest create ${DOCKER_REPOSITORY}:$SHORT_SHA $DOCKER_BUILD_REPOSITORY:amd64 docker manifest push $DOCKER_REPOSITORY:$SHORT_SHA VERSION=$(grep -Po '(?<=__version__ = ")[^"]*' unstructured/__version__.py) - docker manifest create ${DOCKER_REPOSITORY}:$VERSION $DOCKER_BUILD_REPOSITORY:amd64 $DOCKER_BUILD_REPOSITORY:arm64 + docker manifest create ${DOCKER_REPOSITORY}:$VERSION $DOCKER_BUILD_REPOSITORY:amd64 docker manifest push $DOCKER_REPOSITORY:$VERSION From 0f68784ca64889b2d8edcd809a6ba65274dae16c Mon Sep 17 00:00:00 2001 From: Matt Robinson Date: Thu, 16 May 2024 19:49:20 -0400 Subject: [PATCH 11/11] remove feature branch --- .github/workflows/docker-publish.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml index 3eb58894c0..5cdda5724c 100644 --- a/.github/workflows/docker-publish.yml +++ b/.github/workflows/docker-publish.yml @@ -4,7 +4,6 @@ on: push: branches: - main - - fix/only-amd-build env: DOCKER_REPOSITORY: quay.io/unstructured-io/unstructured