From a4faa305e1c3dc670bd4794a587ffb467792409c Mon Sep 17 00:00:00 2001 From: teor Date: Tue, 28 Jun 2022 09:46:04 +1000 Subject: [PATCH 01/13] Checkout zebra in each job to avoid warnings But put TODOs where we might be able to skip checkouts --- .github/workflows/deploy-gcp-tests.yml | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml index e6ae8ed59c7..c3889db24f1 100644 --- a/.github/workflows/deploy-gcp-tests.yml +++ b/.github/workflows/deploy-gcp-tests.yml @@ -91,9 +91,12 @@ jobs: contents: 'read' id-token: 'write' steps: + # TODO: can we delete this step and set create_credentials_file to false in Google Cloud? + # Or will that break the slug-action variables we use to find the instance? - uses: actions/checkout@v3.0.2 with: persist-credentials: false + fetch-depth: '2' - name: Inject slug/short variables uses: rlespinasse/github-slug-action@v4 @@ -158,9 +161,12 @@ jobs: contents: 'read' id-token: 'write' steps: + # TODO: can we delete this step and set create_credentials_file to false in Google Cloud? + # Or will that break the slug-action variables we use to find the instance? - uses: actions/checkout@v3.0.2 with: persist-credentials: false + fetch-depth: '2' - name: Inject slug/short variables uses: rlespinasse/github-slug-action@v4 @@ -332,6 +338,8 @@ jobs: contents: 'read' id-token: 'write' steps: + # TODO: can we delete this step and set create_credentials_file to false in Google Cloud? + # Or will that break the slug-action variables we use to find the instance? - uses: actions/checkout@v3.0.2 with: persist-credentials: false @@ -458,9 +466,12 @@ jobs: contents: 'read' id-token: 'write' steps: + # TODO: can we delete this step and set create_credentials_file to false in Google Cloud? + # Or will that break the slug-action variables we use to find the instance? - uses: actions/checkout@v3.0.2 with: persist-credentials: false + fetch-depth: '2' - name: Inject slug/short variables uses: rlespinasse/github-slug-action@v4 @@ -510,9 +521,12 @@ jobs: contents: 'read' id-token: 'write' steps: + # TODO: can we delete this step and set create_credentials_file to false in Google Cloud? + # Or will that break the slug-action variables we use to find the instance? - uses: actions/checkout@v3.0.2 with: persist-credentials: false + fetch-depth: '2' - name: Inject slug/short variables uses: rlespinasse/github-slug-action@v4 @@ -563,6 +577,11 @@ jobs: contents: 'read' id-token: 'write' steps: + - uses: actions/checkout@v3.0.2 + with: + persist-credentials: false + fetch-depth: '2' + - name: Inject slug/short variables uses: rlespinasse/github-slug-action@v4 with: @@ -650,6 +669,13 @@ jobs: contents: 'read' id-token: 'write' steps: + # TODO: can we delete this step and set create_credentials_file to false in Google Cloud? + # Or will that break the slug-action variables we use to find the instance? + - uses: actions/checkout@v3.0.2 + with: + persist-credentials: false + fetch-depth: '2' + - name: Inject slug/short variables uses: rlespinasse/github-slug-action@v4 with: From 70822873aafec03652d045d136f57373ea5d99cb Mon Sep 17 00:00:00 2001 From: teor Date: Tue, 28 Jun 2022 10:14:03 +1000 Subject: [PATCH 02/13] Split log following into sprout checkpoints, sapling/orchard checkpoints, and full validation --- .github/workflows/deploy-gcp-tests.yml | 146 +++++++++++++++++++++++-- 1 file changed, 139 insertions(+), 7 deletions(-) diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml index c3889db24f1..6e491621d8a 100644 --- a/.github/workflows/deploy-gcp-tests.yml +++ b/.github/workflows/deploy-gcp-tests.yml @@ -75,10 +75,19 @@ on: description: 'Application name for Google Cloud instance metadata' env: + # where we get the Docker image from IMAGE_NAME: zebrad-test GAR_BASE: us-docker.pkg.dev/zealous-zebra/zebra + # what kind of Google Cloud instance we want to launch ZONE: us-central1-a MACHINE_TYPE: c2d-standard-16 + # How many previous log lines we show at the start of each new log job. + # Increase this number if some log lines are skipped between jobs + # + # We want to show all the logs since the last job finished, + # but we don't know how long it will be between jobs. + # 200 lines is about 6-15 minutes of sync logs, or one panic log. + EXTRA_LOG_LINES: 200 jobs: # set up the test, if it doesn't use any cached state @@ -453,9 +462,9 @@ jobs: " - # follow the logs of the test we just launched - follow-logs: - name: Show logs for ${{ inputs.test_id }} test + # follow the logs of the test we just launched, up to Sapling activation (or the test finishing) + follow-logs-sprout: + name: Log ${{ inputs.test_id }} test (sprout) needs: [ launch-with-cached-state, launch-without-cached-state ] # We run exactly one of without-cached-state or with-cached-state, and we always skip the other one. # If the previous job fails, we also want to run and fail this job, @@ -492,8 +501,9 @@ jobs: service_account: 'github-service-account@zealous-zebra.iam.gserviceaccount.com' token_format: 'access_token' - # Show all the logs since the container launched - - name: Show logs for ${{ inputs.test_id }} test + # Show all the logs since the container launched, + # following until Sapling activation (or the test finishes) + - name: Show logs for ${{ inputs.test_id }} test (sprout) run: | gcloud compute ssh \ ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ @@ -502,17 +512,139 @@ jobs: --ssh-flag="-o ServerAliveInterval=5" \ --command \ "\ + set -o pipefail; \ docker logs \ --tail all \ --follow \ - ${{ inputs.test_id }} \ + ${{ inputs.test_id }} | \ + tee /dev/tty | \ + grep --max-count=1 --extended-regexp --color=always \ + '(estimated progress.*network_upgrade.*=.*Sapling)|(test result:.*finished in)' \ " + # follow the logs of the test we just launched, up to the last checkpoint (or the test finishing) + # TODO: split out sapling logs when the mandatory checkpoint is above NU5 activation + follow-logs-checkpoint: + name: Log ${{ inputs.test_id }} test (checkpoint) + needs: [ follow-logs-sprout ] + # If the previous job fails, we also want to run and fail this job, + # so that the branch protection rule fails in Mergify and GitHub. + if: ${{ !cancelled() }} + runs-on: ubuntu-latest + permissions: + contents: 'read' + id-token: 'write' + steps: + # TODO: can we delete this step and set create_credentials_file to false in Google Cloud? + # Or will that break the slug-action variables we use to find the instance? + - uses: actions/checkout@v3.0.2 + with: + persist-credentials: false + fetch-depth: '2' + + - name: Inject slug/short variables + uses: rlespinasse/github-slug-action@v4 + with: + short-length: 7 + + - name: Downcase network name for disks + run: | + NETWORK_CAPS=${{ inputs.network }} + echo "NETWORK=${NETWORK_CAPS,,}" >> $GITHUB_ENV + + # Setup gcloud CLI + - name: Authenticate to Google Cloud + id: auth + uses: google-github-actions/auth@v0.8.0 + with: + workload_identity_provider: 'projects/143793276228/locations/global/workloadIdentityPools/github-actions/providers/github-oidc' + service_account: 'github-service-account@zealous-zebra.iam.gserviceaccount.com' + token_format: 'access_token' + + # Show recent logs, following until the last checkpoint (or the test finishes) + - name: Show logs for ${{ inputs.test_id }} test (checkpoint) + run: | + gcloud compute ssh \ + ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --zone ${{ env.ZONE }} \ + --quiet \ + --ssh-flag="-o ServerAliveInterval=5" \ + --command \ + "\ + set -o pipefail; \ + docker logs \ + --tail ${{ env.EXTRA_LOG_LINES }} \ + --follow \ + ${{ inputs.test_id }} | \ + tee /dev/tty | \ + grep --max-count=1 --extended-regexp --color=always \ + '(verified final checkpoint)|(test result:.*finished in)' \ + " + + # follow the logs of the test we just launched, up to the last checkpoint (or the test finishing) + follow-logs-end: + name: Log ${{ inputs.test_id }} test (end) + needs: [ follow-logs-checkpoint ] + # If the previous job fails, we also want to run and fail this job, + # so that the branch protection rule fails in Mergify and GitHub. + if: ${{ !cancelled() }} + runs-on: ubuntu-latest + permissions: + contents: 'read' + id-token: 'write' + steps: + # TODO: can we delete this step and set create_credentials_file to false in Google Cloud? + # Or will that break the slug-action variables we use to find the instance? + - uses: actions/checkout@v3.0.2 + with: + persist-credentials: false + fetch-depth: '2' + + - name: Inject slug/short variables + uses: rlespinasse/github-slug-action@v4 + with: + short-length: 7 + + - name: Downcase network name for disks + run: | + NETWORK_CAPS=${{ inputs.network }} + echo "NETWORK=${NETWORK_CAPS,,}" >> $GITHUB_ENV + + # Setup gcloud CLI + - name: Authenticate to Google Cloud + id: auth + uses: google-github-actions/auth@v0.8.0 + with: + workload_identity_provider: 'projects/143793276228/locations/global/workloadIdentityPools/github-actions/providers/github-oidc' + service_account: 'github-service-account@zealous-zebra.iam.gserviceaccount.com' + token_format: 'access_token' + + # Show recent logs, following until the test finishes + - name: Show logs for ${{ inputs.test_id }} test (end) + run: | + gcloud compute ssh \ + ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --zone ${{ env.ZONE }} \ + --quiet \ + --ssh-flag="-o ServerAliveInterval=5" \ + --command \ + "\ + set -o pipefail; \ + docker logs \ + --tail ${{ env.EXTRA_LOG_LINES }} \ + --follow \ + ${{ inputs.test_id }} | \ + tee /dev/tty | \ + grep --max-count=1 --extended-regexp --color=always \ + 'test result:.*finished in' \ + " + + # wait for the result of the test test-result: # TODO: update the job name here, and in the branch protection rules name: Run ${{ inputs.test_id }} test - needs: [ follow-logs ] + needs: [ follow-logs-end ] # If the previous job fails, we also want to run and fail this job, # so that the branch protection rule fails in Mergify and GitHub. if: ${{ !cancelled() }} From 837cda1db6e8be8d65291d20fca810e8c6631849 Mon Sep 17 00:00:00 2001 From: teor Date: Tue, 28 Jun 2022 10:27:40 +1000 Subject: [PATCH 03/13] Make job IDs shorter --- .github/workflows/deploy-gcp-tests.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml index 6e491621d8a..54a35da7a6e 100644 --- a/.github/workflows/deploy-gcp-tests.yml +++ b/.github/workflows/deploy-gcp-tests.yml @@ -463,7 +463,7 @@ jobs: # follow the logs of the test we just launched, up to Sapling activation (or the test finishing) - follow-logs-sprout: + logs-sprout: name: Log ${{ inputs.test_id }} test (sprout) needs: [ launch-with-cached-state, launch-without-cached-state ] # We run exactly one of without-cached-state or with-cached-state, and we always skip the other one. @@ -524,9 +524,9 @@ jobs: # follow the logs of the test we just launched, up to the last checkpoint (or the test finishing) # TODO: split out sapling logs when the mandatory checkpoint is above NU5 activation - follow-logs-checkpoint: + logs-checkpoint: name: Log ${{ inputs.test_id }} test (checkpoint) - needs: [ follow-logs-sprout ] + needs: [ logs-sprout ] # If the previous job fails, we also want to run and fail this job, # so that the branch protection rule fails in Mergify and GitHub. if: ${{ !cancelled() }} @@ -582,9 +582,9 @@ jobs: " # follow the logs of the test we just launched, up to the last checkpoint (or the test finishing) - follow-logs-end: + logs-end: name: Log ${{ inputs.test_id }} test (end) - needs: [ follow-logs-checkpoint ] + needs: [ logs-checkpoint ] # If the previous job fails, we also want to run and fail this job, # so that the branch protection rule fails in Mergify and GitHub. if: ${{ !cancelled() }} @@ -644,7 +644,7 @@ jobs: test-result: # TODO: update the job name here, and in the branch protection rules name: Run ${{ inputs.test_id }} test - needs: [ follow-logs-end ] + needs: [ logs-end ] # If the previous job fails, we also want to run and fail this job, # so that the branch protection rule fails in Mergify and GitHub. if: ${{ !cancelled() }} From 9fe6ca5ee6d049b079aa98655aea8ff2add66032 Mon Sep 17 00:00:00 2001 From: teor Date: Tue, 28 Jun 2022 14:05:56 +1000 Subject: [PATCH 04/13] Use /dev/stderr because docker doesn't have a tty --- .github/workflows/deploy-gcp-tests.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml index 54a35da7a6e..19455d0d78a 100644 --- a/.github/workflows/deploy-gcp-tests.yml +++ b/.github/workflows/deploy-gcp-tests.yml @@ -517,7 +517,7 @@ jobs: --tail all \ --follow \ ${{ inputs.test_id }} | \ - tee /dev/tty | \ + tee /dev/stderr | \ grep --max-count=1 --extended-regexp --color=always \ '(estimated progress.*network_upgrade.*=.*Sapling)|(test result:.*finished in)' \ " @@ -576,7 +576,7 @@ jobs: --tail ${{ env.EXTRA_LOG_LINES }} \ --follow \ ${{ inputs.test_id }} | \ - tee /dev/tty | \ + tee /dev/stderr | \ grep --max-count=1 --extended-regexp --color=always \ '(verified final checkpoint)|(test result:.*finished in)' \ " @@ -634,7 +634,7 @@ jobs: --tail ${{ env.EXTRA_LOG_LINES }} \ --follow \ ${{ inputs.test_id }} | \ - tee /dev/tty | \ + tee /dev/stderr | \ grep --max-count=1 --extended-regexp --color=always \ 'test result:.*finished in' \ " From a852cb674efa09aa1edf19d23d1a6111929f0348 Mon Sep 17 00:00:00 2001 From: Conrado Gouvea Date: Tue, 28 Jun 2022 11:24:13 -0300 Subject: [PATCH 05/13] remove pipefail --- .github/workflows/deploy-gcp-tests.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml index 19455d0d78a..e58322ca834 100644 --- a/.github/workflows/deploy-gcp-tests.yml +++ b/.github/workflows/deploy-gcp-tests.yml @@ -512,7 +512,6 @@ jobs: --ssh-flag="-o ServerAliveInterval=5" \ --command \ "\ - set -o pipefail; \ docker logs \ --tail all \ --follow \ @@ -571,7 +570,6 @@ jobs: --ssh-flag="-o ServerAliveInterval=5" \ --command \ "\ - set -o pipefail; \ docker logs \ --tail ${{ env.EXTRA_LOG_LINES }} \ --follow \ @@ -629,7 +627,6 @@ jobs: --ssh-flag="-o ServerAliveInterval=5" \ --command \ "\ - set -o pipefail; \ docker logs \ --tail ${{ env.EXTRA_LOG_LINES }} \ --follow \ From 4618a780ac99ba5b2aac6fd697b72971361f4cbc Mon Sep 17 00:00:00 2001 From: teor Date: Wed, 29 Jun 2022 10:35:48 +1000 Subject: [PATCH 06/13] Revert "remove pipefail" This reverts commit a7ee37bebdc107a4215e7dd307b189d925969234. --- .github/workflows/deploy-gcp-tests.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml index e58322ca834..19455d0d78a 100644 --- a/.github/workflows/deploy-gcp-tests.yml +++ b/.github/workflows/deploy-gcp-tests.yml @@ -512,6 +512,7 @@ jobs: --ssh-flag="-o ServerAliveInterval=5" \ --command \ "\ + set -o pipefail; \ docker logs \ --tail all \ --follow \ @@ -570,6 +571,7 @@ jobs: --ssh-flag="-o ServerAliveInterval=5" \ --command \ "\ + set -o pipefail; \ docker logs \ --tail ${{ env.EXTRA_LOG_LINES }} \ --follow \ @@ -627,6 +629,7 @@ jobs: --ssh-flag="-o ServerAliveInterval=5" \ --command \ "\ + set -o pipefail; \ docker logs \ --tail ${{ env.EXTRA_LOG_LINES }} \ --follow \ From d35e0c40415c12fe520e2372e33df8edd7a409f2 Mon Sep 17 00:00:00 2001 From: teor Date: Wed, 29 Jun 2022 10:37:20 +1000 Subject: [PATCH 07/13] Make tee ignore errors writing to a grep pipe --- .github/workflows/deploy-gcp-tests.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml index 19455d0d78a..46276fbeca2 100644 --- a/.github/workflows/deploy-gcp-tests.yml +++ b/.github/workflows/deploy-gcp-tests.yml @@ -517,7 +517,7 @@ jobs: --tail all \ --follow \ ${{ inputs.test_id }} | \ - tee /dev/stderr | \ + tee --output-error=exit-nopipe /dev/stderr | \ grep --max-count=1 --extended-regexp --color=always \ '(estimated progress.*network_upgrade.*=.*Sapling)|(test result:.*finished in)' \ " @@ -576,7 +576,7 @@ jobs: --tail ${{ env.EXTRA_LOG_LINES }} \ --follow \ ${{ inputs.test_id }} | \ - tee /dev/stderr | \ + tee --output-error=exit-nopipe /dev/stderr | \ grep --max-count=1 --extended-regexp --color=always \ '(verified final checkpoint)|(test result:.*finished in)' \ " @@ -634,7 +634,7 @@ jobs: --tail ${{ env.EXTRA_LOG_LINES }} \ --follow \ ${{ inputs.test_id }} | \ - tee /dev/stderr | \ + tee --output-error=exit-nopipe /dev/stderr | \ grep --max-count=1 --extended-regexp --color=always \ 'test result:.*finished in' \ " From a4857ff13123cd194caed445a63721e8a5935aa5 Mon Sep 17 00:00:00 2001 From: teor Date: Wed, 29 Jun 2022 10:48:48 +1000 Subject: [PATCH 08/13] Avoid launching multiple docker instances for duplicate jobs --- .github/workflows/deploy-gcp-tests.yml | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml index 46276fbeca2..31b7609639e 100644 --- a/.github/workflows/deploy-gcp-tests.yml +++ b/.github/workflows/deploy-gcp-tests.yml @@ -162,9 +162,8 @@ jobs: launch-without-cached-state: name: Launch ${{ inputs.test_id }} test needs: [ setup-without-cached-state ] - # If the previous job fails, we also want to run and fail this job, - # so that the branch protection rule fails in Mergify and GitHub. - if: ${{ !cancelled() && !inputs.needs_zebra_state }} + # If creating the Google Cloud instance fails, we don't want to launch another docker instance. + if: ${{ !cancelled() && !failure() && !inputs.needs_zebra_state }} runs-on: ubuntu-latest permissions: contents: 'read' @@ -339,9 +338,8 @@ jobs: launch-with-cached-state: name: Launch ${{ inputs.test_id }} test needs: [ setup-with-cached-state ] - # If the previous job fails, we also want to run and fail this job, - # so that the branch protection rule fails in Mergify and GitHub. - if: ${{ !cancelled() && inputs.needs_zebra_state }} + # If creating the Google Cloud instance fails, we don't want to launch another docker instance. + if: ${{ !cancelled() && !failure() && inputs.needs_zebra_state }} runs-on: ubuntu-latest permissions: contents: 'read' @@ -465,10 +463,9 @@ jobs: # follow the logs of the test we just launched, up to Sapling activation (or the test finishing) logs-sprout: name: Log ${{ inputs.test_id }} test (sprout) - needs: [ launch-with-cached-state, launch-without-cached-state ] # We run exactly one of without-cached-state or with-cached-state, and we always skip the other one. - # If the previous job fails, we also want to run and fail this job, - # so that the branch protection rule fails in Mergify and GitHub. + needs: [ launch-with-cached-state, launch-without-cached-state ] + # If the previous job fails, we still want to show the logs. if: ${{ !cancelled() }} runs-on: ubuntu-latest permissions: @@ -527,8 +524,7 @@ jobs: logs-checkpoint: name: Log ${{ inputs.test_id }} test (checkpoint) needs: [ logs-sprout ] - # If the previous job fails, we also want to run and fail this job, - # so that the branch protection rule fails in Mergify and GitHub. + # If the previous job fails, we still want to show the logs. if: ${{ !cancelled() }} runs-on: ubuntu-latest permissions: @@ -585,8 +581,7 @@ jobs: logs-end: name: Log ${{ inputs.test_id }} test (end) needs: [ logs-checkpoint ] - # If the previous job fails, we also want to run and fail this job, - # so that the branch protection rule fails in Mergify and GitHub. + # If the previous job fails, we still want to show the logs. if: ${{ !cancelled() }} runs-on: ubuntu-latest permissions: From d04dd661217b98fc4a3a06ecc6f94b84536271b9 Mon Sep 17 00:00:00 2001 From: teor Date: Wed, 29 Jun 2022 12:42:09 +1000 Subject: [PATCH 09/13] Ignore broken pipe error messages and statuses --- .github/workflows/deploy-gcp-tests.yml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml index 31b7609639e..6710ba3e5c9 100644 --- a/.github/workflows/deploy-gcp-tests.yml +++ b/.github/workflows/deploy-gcp-tests.yml @@ -499,7 +499,10 @@ jobs: token_format: 'access_token' # Show all the logs since the container launched, - # following until Sapling activation (or the test finishes) + # following until Sapling activation (or the test finishes). + # + # The log pipeline ignores the exit status of `docker logs`. + # Errors in the tests are caught by the final test status job. - name: Show logs for ${{ inputs.test_id }} test (sprout) run: | gcloud compute ssh \ @@ -509,12 +512,11 @@ jobs: --ssh-flag="-o ServerAliveInterval=5" \ --command \ "\ - set -o pipefail; \ docker logs \ --tail all \ --follow \ ${{ inputs.test_id }} | \ - tee --output-error=exit-nopipe /dev/stderr | \ + tee --output-error=exit /dev/stderr | \ grep --max-count=1 --extended-regexp --color=always \ '(estimated progress.*network_upgrade.*=.*Sapling)|(test result:.*finished in)' \ " @@ -567,12 +569,11 @@ jobs: --ssh-flag="-o ServerAliveInterval=5" \ --command \ "\ - set -o pipefail; \ docker logs \ --tail ${{ env.EXTRA_LOG_LINES }} \ --follow \ ${{ inputs.test_id }} | \ - tee --output-error=exit-nopipe /dev/stderr | \ + tee --output-error=exit /dev/stderr | \ grep --max-count=1 --extended-regexp --color=always \ '(verified final checkpoint)|(test result:.*finished in)' \ " @@ -624,12 +625,11 @@ jobs: --ssh-flag="-o ServerAliveInterval=5" \ --command \ "\ - set -o pipefail; \ docker logs \ --tail ${{ env.EXTRA_LOG_LINES }} \ --follow \ ${{ inputs.test_id }} | \ - tee --output-error=exit-nopipe /dev/stderr | \ + tee --output-error=exit /dev/stderr | \ grep --max-count=1 --extended-regexp --color=always \ 'test result:.*finished in' \ " From f8d417639bcabb62ef6f7f297f0840a7cf70a660 Mon Sep 17 00:00:00 2001 From: Gustavo Valverde Date: Wed, 29 Jun 2022 16:46:02 -0400 Subject: [PATCH 10/13] fix(ci): docker wait not finding container We had this issue before, I can't recall if this was a parsing error between GitHub Actions and gcloud `--command` parsing, but we had to change this into two pieces. This implementation keeps it how we did it before https://github.com/ZcashFoundation/zebra/blob/9b9578c99975952a291006dde8d2828fd3e97799/.github/workflows/test.yml#L235-L243 --- .github/workflows/deploy-gcp-tests.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml index 6710ba3e5c9..dd75b76870d 100644 --- a/.github/workflows/deploy-gcp-tests.yml +++ b/.github/workflows/deploy-gcp-tests.yml @@ -680,15 +680,15 @@ jobs: # `docker wait` can also wait for multiple containers, but we only ever wait for a single container. - name: Result of ${{ inputs.test_id }} test run: | + EXIT_CODE=$(\ gcloud compute ssh \ ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ --zone ${{ env.ZONE }} \ --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ - --command \ - "\ - exit $(docker wait ${{ inputs.test_id }}) \ - " + --command="docker wait ${{ inputs.test_id }}") + + exit ${EXIT_CODE} # create a state image from the instance's state disk, if requested by the caller From 3ae36af4633c479c31c0f7b47a6fc4727c77ba91 Mon Sep 17 00:00:00 2001 From: Gustavo Valverde Date: Wed, 29 Jun 2022 16:57:06 -0400 Subject: [PATCH 11/13] docs: remove pending TODO We can't remove `actions/checkout` nor set `create_credentials_file` to `false` as next steps won't be able to authenticate to GCP. We can surely remove `actions/checkout` and leave `create_credentials_file` as `true`, but this will raise a warning on each step, and there's no benefit of doing so. --- .github/workflows/deploy-gcp-tests.yml | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml index dd75b76870d..bbfddc1cf5a 100644 --- a/.github/workflows/deploy-gcp-tests.yml +++ b/.github/workflows/deploy-gcp-tests.yml @@ -100,8 +100,6 @@ jobs: contents: 'read' id-token: 'write' steps: - # TODO: can we delete this step and set create_credentials_file to false in Google Cloud? - # Or will that break the slug-action variables we use to find the instance? - uses: actions/checkout@v3.0.2 with: persist-credentials: false @@ -169,8 +167,6 @@ jobs: contents: 'read' id-token: 'write' steps: - # TODO: can we delete this step and set create_credentials_file to false in Google Cloud? - # Or will that break the slug-action variables we use to find the instance? - uses: actions/checkout@v3.0.2 with: persist-credentials: false @@ -345,8 +341,6 @@ jobs: contents: 'read' id-token: 'write' steps: - # TODO: can we delete this step and set create_credentials_file to false in Google Cloud? - # Or will that break the slug-action variables we use to find the instance? - uses: actions/checkout@v3.0.2 with: persist-credentials: false @@ -472,8 +466,6 @@ jobs: contents: 'read' id-token: 'write' steps: - # TODO: can we delete this step and set create_credentials_file to false in Google Cloud? - # Or will that break the slug-action variables we use to find the instance? - uses: actions/checkout@v3.0.2 with: persist-credentials: false @@ -533,8 +525,6 @@ jobs: contents: 'read' id-token: 'write' steps: - # TODO: can we delete this step and set create_credentials_file to false in Google Cloud? - # Or will that break the slug-action variables we use to find the instance? - uses: actions/checkout@v3.0.2 with: persist-credentials: false @@ -589,8 +579,6 @@ jobs: contents: 'read' id-token: 'write' steps: - # TODO: can we delete this step and set create_credentials_file to false in Google Cloud? - # Or will that break the slug-action variables we use to find the instance? - uses: actions/checkout@v3.0.2 with: persist-credentials: false @@ -648,8 +636,6 @@ jobs: contents: 'read' id-token: 'write' steps: - # TODO: can we delete this step and set create_credentials_file to false in Google Cloud? - # Or will that break the slug-action variables we use to find the instance? - uses: actions/checkout@v3.0.2 with: persist-credentials: false @@ -796,8 +782,6 @@ jobs: contents: 'read' id-token: 'write' steps: - # TODO: can we delete this step and set create_credentials_file to false in Google Cloud? - # Or will that break the slug-action variables we use to find the instance? - uses: actions/checkout@v3.0.2 with: persist-credentials: false From ef331624b419efe43bec7da41e03d04560aaa87a Mon Sep 17 00:00:00 2001 From: teor Date: Thu, 30 Jun 2022 08:09:05 +1000 Subject: [PATCH 12/13] Show `docker wait` and `gcloud ssh` output --- .github/workflows/deploy-gcp-tests.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml index bbfddc1cf5a..602630e0a6a 100644 --- a/.github/workflows/deploy-gcp-tests.yml +++ b/.github/workflows/deploy-gcp-tests.yml @@ -672,9 +672,10 @@ jobs: --zone ${{ env.ZONE }} \ --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ - --command="docker wait ${{ inputs.test_id }}") - - exit ${EXIT_CODE} + --command="docker wait ${{ inputs.test_id }}" \ + ) + echo "$EXIT_CODE" + exit "$EXIT_CODE" # create a state image from the instance's state disk, if requested by the caller From 74a9ddaa64b2f7adfffe1af31bff7b270e7a615b Mon Sep 17 00:00:00 2001 From: teor Date: Thu, 30 Jun 2022 11:58:29 +1000 Subject: [PATCH 13/13] If `docker wait` fails, get the exit code using `docker inspect` --- .github/workflows/deploy-gcp-tests.yml | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml index 602630e0a6a..fedba1361af 100644 --- a/.github/workflows/deploy-gcp-tests.yml +++ b/.github/workflows/deploy-gcp-tests.yml @@ -662,20 +662,28 @@ jobs: # Wait for the container to finish, then exit with the test's exit status. # - # `docker wait` prints the container exit status as a string, but we need to exit `ssh` with that status. - # `docker wait` can also wait for multiple containers, but we only ever wait for a single container. + # If the container has already finished, `docker wait` should return its status. + # But sometimes this doesn't work, so we use `docker inspect` as a fallback. + # + # `docker wait` prints the container exit status as a string, but we need to exit the `ssh` command + # with that status. + # (`docker wait` can also wait for multiple containers, but we only ever wait for a single container.) - name: Result of ${{ inputs.test_id }} test run: | - EXIT_CODE=$(\ gcloud compute ssh \ ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ --zone ${{ env.ZONE }} \ --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ - --command="docker wait ${{ inputs.test_id }}" \ - ) - echo "$EXIT_CODE" - exit "$EXIT_CODE" + --command=' \ + EXIT_STATUS=$( \ + docker wait ${{ inputs.test_id }} || \ + docker inspect --format "{{.State.ExitCode}}" ${{ inputs.test_id }} || \ + echo "missing container, or missing exit status for container" \ + ); \ + echo "docker exit status: $EXIT_STATUS"; \ + exit "$EXIT_STATUS" \ + ' # create a state image from the instance's state disk, if requested by the caller