From a4faa305e1c3dc670bd4794a587ffb467792409c Mon Sep 17 00:00:00 2001
From: teor <teor@riseup.net>
Date: Tue, 28 Jun 2022 09:46:04 +1000
Subject: [PATCH 01/13] Checkout zebra in each job to avoid warnings

But put TODOs where we might be able to skip checkouts
---
 .github/workflows/deploy-gcp-tests.yml | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml
index e6ae8ed59c7..c3889db24f1 100644
--- a/.github/workflows/deploy-gcp-tests.yml
+++ b/.github/workflows/deploy-gcp-tests.yml
@@ -91,9 +91,12 @@ jobs:
       contents: 'read'
       id-token: 'write'
     steps:
+      # TODO: can we delete this step and set create_credentials_file to false in Google Cloud?
+      # Or will that break the slug-action variables we use to find the instance?
       - uses: actions/checkout@v3.0.2
         with:
           persist-credentials: false
+          fetch-depth: '2'
 
       - name: Inject slug/short variables
         uses: rlespinasse/github-slug-action@v4
@@ -158,9 +161,12 @@ jobs:
       contents: 'read'
       id-token: 'write'
     steps:
+      # TODO: can we delete this step and set create_credentials_file to false in Google Cloud?
+      # Or will that break the slug-action variables we use to find the instance?
       - uses: actions/checkout@v3.0.2
         with:
           persist-credentials: false
+          fetch-depth: '2'
 
       - name: Inject slug/short variables
         uses: rlespinasse/github-slug-action@v4
@@ -332,6 +338,8 @@ jobs:
       contents: 'read'
       id-token: 'write'
     steps:
+      # TODO: can we delete this step and set create_credentials_file to false in Google Cloud?
+      # Or will that break the slug-action variables we use to find the instance?
       - uses: actions/checkout@v3.0.2
         with:
           persist-credentials: false
@@ -458,9 +466,12 @@ jobs:
       contents: 'read'
       id-token: 'write'
     steps:
+      # TODO: can we delete this step and set create_credentials_file to false in Google Cloud?
+      # Or will that break the slug-action variables we use to find the instance?
       - uses: actions/checkout@v3.0.2
         with:
           persist-credentials: false
+          fetch-depth: '2'
 
       - name: Inject slug/short variables
         uses: rlespinasse/github-slug-action@v4
@@ -510,9 +521,12 @@ jobs:
       contents: 'read'
       id-token: 'write'
     steps:
+      # TODO: can we delete this step and set create_credentials_file to false in Google Cloud?
+      # Or will that break the slug-action variables we use to find the instance?
       - uses: actions/checkout@v3.0.2
         with:
           persist-credentials: false
+          fetch-depth: '2'
 
       - name: Inject slug/short variables
         uses: rlespinasse/github-slug-action@v4
@@ -563,6 +577,11 @@ jobs:
       contents: 'read'
       id-token: 'write'
     steps:
+      - uses: actions/checkout@v3.0.2
+        with:
+          persist-credentials: false
+          fetch-depth: '2'
+
       - name: Inject slug/short variables
         uses: rlespinasse/github-slug-action@v4
         with:
@@ -650,6 +669,13 @@ jobs:
       contents: 'read'
       id-token: 'write'
     steps:
+      # TODO: can we delete this step and set create_credentials_file to false in Google Cloud?
+      # Or will that break the slug-action variables we use to find the instance?
+      - uses: actions/checkout@v3.0.2
+        with:
+          persist-credentials: false
+          fetch-depth: '2'
+
       - name: Inject slug/short variables
         uses: rlespinasse/github-slug-action@v4
         with:

From 70822873aafec03652d045d136f57373ea5d99cb Mon Sep 17 00:00:00 2001
From: teor <teor@riseup.net>
Date: Tue, 28 Jun 2022 10:14:03 +1000
Subject: [PATCH 02/13] Split log following into sprout checkpoints,
 sapling/orchard checkpoints, and full validation

---
 .github/workflows/deploy-gcp-tests.yml | 146 +++++++++++++++++++++++--
 1 file changed, 139 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml
index c3889db24f1..6e491621d8a 100644
--- a/.github/workflows/deploy-gcp-tests.yml
+++ b/.github/workflows/deploy-gcp-tests.yml
@@ -75,10 +75,19 @@ on:
         description: 'Application name for Google Cloud instance metadata'
 
 env:
+  # where we get the Docker image from
   IMAGE_NAME: zebrad-test
   GAR_BASE: us-docker.pkg.dev/zealous-zebra/zebra
+  # what kind of Google Cloud instance we want to launch
   ZONE: us-central1-a
   MACHINE_TYPE: c2d-standard-16
+  # How many previous log lines we show at the start of each new log job.
+  # Increase this number if some log lines are skipped between jobs
+  #
+  # We want to show all the logs since the last job finished,
+  # but we don't know how long it will be between jobs.
+  # 200 lines is about 6-15 minutes of sync logs, or one panic log.
+  EXTRA_LOG_LINES: 200
 
 jobs:
   # set up the test, if it doesn't use any cached state
@@ -453,9 +462,9 @@ jobs:
           "
 
 
-  # follow the logs of the test we just launched
-  follow-logs:
-    name: Show logs for ${{ inputs.test_id }} test
+  # follow the logs of the test we just launched, up to Sapling activation (or the test finishing)
+  follow-logs-sprout:
+    name: Log ${{ inputs.test_id }} test (sprout)
     needs: [ launch-with-cached-state, launch-without-cached-state ]
     # We run exactly one of without-cached-state or with-cached-state, and we always skip the other one.
     # If the previous job fails, we also want to run and fail this job,
@@ -492,8 +501,9 @@ jobs:
           service_account: 'github-service-account@zealous-zebra.iam.gserviceaccount.com'
           token_format: 'access_token'
 
-      # Show all the logs since the container launched
-      - name: Show logs for ${{ inputs.test_id }} test
+      # Show all the logs since the container launched,
+      # following until Sapling activation (or the test finishes)
+      - name: Show logs for ${{ inputs.test_id }} test (sprout)
         run: |
           gcloud compute ssh \
           ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
@@ -502,17 +512,139 @@ jobs:
           --ssh-flag="-o ServerAliveInterval=5" \
           --command \
           "\
+          set -o pipefail; \
           docker logs \
           --tail all \
           --follow \
-          ${{ inputs.test_id }} \
+          ${{ inputs.test_id }} | \
+          tee /dev/tty | \
+          grep --max-count=1 --extended-regexp --color=always \
+          '(estimated progress.*network_upgrade.*=.*Sapling)|(test result:.*finished in)' \
           "
 
+  # follow the logs of the test we just launched, up to the last checkpoint (or the test finishing)
+  # TODO: split out sapling logs when the mandatory checkpoint is above NU5 activation
+  follow-logs-checkpoint:
+    name: Log ${{ inputs.test_id }} test (checkpoint)
+    needs: [ follow-logs-sprout ]
+    # If the previous job fails, we also want to run and fail this job,
+    # so that the branch protection rule fails in Mergify and GitHub.
+    if: ${{ !cancelled() }}
+    runs-on: ubuntu-latest
+    permissions:
+      contents: 'read'
+      id-token: 'write'
+    steps:
+      # TODO: can we delete this step and set create_credentials_file to false in Google Cloud?
+      # Or will that break the slug-action variables we use to find the instance?
+      - uses: actions/checkout@v3.0.2
+        with:
+          persist-credentials: false
+          fetch-depth: '2'
+
+      - name: Inject slug/short variables
+        uses: rlespinasse/github-slug-action@v4
+        with:
+          short-length: 7
+
+      - name: Downcase network name for disks
+        run: |
+          NETWORK_CAPS=${{ inputs.network }}
+          echo "NETWORK=${NETWORK_CAPS,,}" >> $GITHUB_ENV
+
+      # Setup gcloud CLI
+      - name: Authenticate to Google Cloud
+        id: auth
+        uses: google-github-actions/auth@v0.8.0
+        with:
+          workload_identity_provider: 'projects/143793276228/locations/global/workloadIdentityPools/github-actions/providers/github-oidc'
+          service_account: 'github-service-account@zealous-zebra.iam.gserviceaccount.com'
+          token_format: 'access_token'
+
+      # Show recent logs, following until the last checkpoint (or the test finishes)
+      - name: Show logs for ${{ inputs.test_id }} test (checkpoint)
+        run: |
+          gcloud compute ssh \
+          ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
+          --zone ${{ env.ZONE }} \
+          --quiet \
+          --ssh-flag="-o ServerAliveInterval=5" \
+          --command \
+          "\
+          set -o pipefail; \
+          docker logs \
+          --tail ${{ env.EXTRA_LOG_LINES }} \
+          --follow \
+          ${{ inputs.test_id }} | \
+          tee /dev/tty | \
+          grep --max-count=1 --extended-regexp --color=always \
+          '(verified final checkpoint)|(test result:.*finished in)' \
+          "
+
+  # follow the logs of the test we just launched, up to the last checkpoint (or the test finishing)
+  follow-logs-end:
+    name: Log ${{ inputs.test_id }} test (end)
+    needs: [ follow-logs-checkpoint ]
+    # If the previous job fails, we also want to run and fail this job,
+    # so that the branch protection rule fails in Mergify and GitHub.
+    if: ${{ !cancelled() }}
+    runs-on: ubuntu-latest
+    permissions:
+      contents: 'read'
+      id-token: 'write'
+    steps:
+      # TODO: can we delete this step and set create_credentials_file to false in Google Cloud?
+      # Or will that break the slug-action variables we use to find the instance?
+      - uses: actions/checkout@v3.0.2
+        with:
+          persist-credentials: false
+          fetch-depth: '2'
+
+      - name: Inject slug/short variables
+        uses: rlespinasse/github-slug-action@v4
+        with:
+          short-length: 7
+
+      - name: Downcase network name for disks
+        run: |
+          NETWORK_CAPS=${{ inputs.network }}
+          echo "NETWORK=${NETWORK_CAPS,,}" >> $GITHUB_ENV
+
+      # Setup gcloud CLI
+      - name: Authenticate to Google Cloud
+        id: auth
+        uses: google-github-actions/auth@v0.8.0
+        with:
+          workload_identity_provider: 'projects/143793276228/locations/global/workloadIdentityPools/github-actions/providers/github-oidc'
+          service_account: 'github-service-account@zealous-zebra.iam.gserviceaccount.com'
+          token_format: 'access_token'
+
+      # Show recent logs, following until the test finishes
+      - name: Show logs for ${{ inputs.test_id }} test (end)
+        run: |
+          gcloud compute ssh \
+          ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
+          --zone ${{ env.ZONE }} \
+          --quiet \
+          --ssh-flag="-o ServerAliveInterval=5" \
+          --command \
+          "\
+          set -o pipefail; \
+          docker logs \
+          --tail ${{ env.EXTRA_LOG_LINES }} \
+          --follow \
+          ${{ inputs.test_id }} | \
+          tee /dev/tty | \
+          grep --max-count=1 --extended-regexp --color=always \
+          'test result:.*finished in' \
+          "
+
+
   # wait for the result of the test
   test-result:
     # TODO: update the job name here, and in the branch protection rules
     name: Run ${{ inputs.test_id }} test
-    needs: [ follow-logs ]
+    needs: [ follow-logs-end ]
     # If the previous job fails, we also want to run and fail this job,
     # so that the branch protection rule fails in Mergify and GitHub.
     if: ${{ !cancelled() }}

From 837cda1db6e8be8d65291d20fca810e8c6631849 Mon Sep 17 00:00:00 2001
From: teor <teor@riseup.net>
Date: Tue, 28 Jun 2022 10:27:40 +1000
Subject: [PATCH 03/13] Make job IDs shorter

---
 .github/workflows/deploy-gcp-tests.yml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml
index 6e491621d8a..54a35da7a6e 100644
--- a/.github/workflows/deploy-gcp-tests.yml
+++ b/.github/workflows/deploy-gcp-tests.yml
@@ -463,7 +463,7 @@ jobs:
 
 
   # follow the logs of the test we just launched, up to Sapling activation (or the test finishing)
-  follow-logs-sprout:
+  logs-sprout:
     name: Log ${{ inputs.test_id }} test (sprout)
     needs: [ launch-with-cached-state, launch-without-cached-state ]
     # We run exactly one of without-cached-state or with-cached-state, and we always skip the other one.
@@ -524,9 +524,9 @@ jobs:
 
   # follow the logs of the test we just launched, up to the last checkpoint (or the test finishing)
   # TODO: split out sapling logs when the mandatory checkpoint is above NU5 activation
-  follow-logs-checkpoint:
+  logs-checkpoint:
     name: Log ${{ inputs.test_id }} test (checkpoint)
-    needs: [ follow-logs-sprout ]
+    needs: [ logs-sprout ]
     # If the previous job fails, we also want to run and fail this job,
     # so that the branch protection rule fails in Mergify and GitHub.
     if: ${{ !cancelled() }}
@@ -582,9 +582,9 @@ jobs:
           "
 
   # follow the logs of the test we just launched, up to the last checkpoint (or the test finishing)
-  follow-logs-end:
+  logs-end:
     name: Log ${{ inputs.test_id }} test (end)
-    needs: [ follow-logs-checkpoint ]
+    needs: [ logs-checkpoint ]
     # If the previous job fails, we also want to run and fail this job,
     # so that the branch protection rule fails in Mergify and GitHub.
     if: ${{ !cancelled() }}
@@ -644,7 +644,7 @@ jobs:
   test-result:
     # TODO: update the job name here, and in the branch protection rules
     name: Run ${{ inputs.test_id }} test
-    needs: [ follow-logs-end ]
+    needs: [ logs-end ]
     # If the previous job fails, we also want to run and fail this job,
     # so that the branch protection rule fails in Mergify and GitHub.
     if: ${{ !cancelled() }}

From 9fe6ca5ee6d049b079aa98655aea8ff2add66032 Mon Sep 17 00:00:00 2001
From: teor <teor@riseup.net>
Date: Tue, 28 Jun 2022 14:05:56 +1000
Subject: [PATCH 04/13] Use /dev/stderr because docker doesn't have a tty

---
 .github/workflows/deploy-gcp-tests.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml
index 54a35da7a6e..19455d0d78a 100644
--- a/.github/workflows/deploy-gcp-tests.yml
+++ b/.github/workflows/deploy-gcp-tests.yml
@@ -517,7 +517,7 @@ jobs:
           --tail all \
           --follow \
           ${{ inputs.test_id }} | \
-          tee /dev/tty | \
+          tee /dev/stderr | \
           grep --max-count=1 --extended-regexp --color=always \
           '(estimated progress.*network_upgrade.*=.*Sapling)|(test result:.*finished in)' \
           "
@@ -576,7 +576,7 @@ jobs:
           --tail ${{ env.EXTRA_LOG_LINES }} \
           --follow \
           ${{ inputs.test_id }} | \
-          tee /dev/tty | \
+          tee /dev/stderr | \
           grep --max-count=1 --extended-regexp --color=always \
           '(verified final checkpoint)|(test result:.*finished in)' \
           "
@@ -634,7 +634,7 @@ jobs:
           --tail ${{ env.EXTRA_LOG_LINES }} \
           --follow \
           ${{ inputs.test_id }} | \
-          tee /dev/tty | \
+          tee /dev/stderr | \
           grep --max-count=1 --extended-regexp --color=always \
           'test result:.*finished in' \
           "

From a852cb674efa09aa1edf19d23d1a6111929f0348 Mon Sep 17 00:00:00 2001
From: Conrado Gouvea <conrado@zfnd.org>
Date: Tue, 28 Jun 2022 11:24:13 -0300
Subject: [PATCH 05/13] remove pipefail

---
 .github/workflows/deploy-gcp-tests.yml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml
index 19455d0d78a..e58322ca834 100644
--- a/.github/workflows/deploy-gcp-tests.yml
+++ b/.github/workflows/deploy-gcp-tests.yml
@@ -512,7 +512,6 @@ jobs:
           --ssh-flag="-o ServerAliveInterval=5" \
           --command \
           "\
-          set -o pipefail; \
           docker logs \
           --tail all \
           --follow \
@@ -571,7 +570,6 @@ jobs:
           --ssh-flag="-o ServerAliveInterval=5" \
           --command \
           "\
-          set -o pipefail; \
           docker logs \
           --tail ${{ env.EXTRA_LOG_LINES }} \
           --follow \
@@ -629,7 +627,6 @@ jobs:
           --ssh-flag="-o ServerAliveInterval=5" \
           --command \
           "\
-          set -o pipefail; \
           docker logs \
           --tail ${{ env.EXTRA_LOG_LINES }} \
           --follow \

From 4618a780ac99ba5b2aac6fd697b72971361f4cbc Mon Sep 17 00:00:00 2001
From: teor <teor@riseup.net>
Date: Wed, 29 Jun 2022 10:35:48 +1000
Subject: [PATCH 06/13] Revert "remove pipefail"

This reverts commit a7ee37bebdc107a4215e7dd307b189d925969234.
---
 .github/workflows/deploy-gcp-tests.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml
index e58322ca834..19455d0d78a 100644
--- a/.github/workflows/deploy-gcp-tests.yml
+++ b/.github/workflows/deploy-gcp-tests.yml
@@ -512,6 +512,7 @@ jobs:
           --ssh-flag="-o ServerAliveInterval=5" \
           --command \
           "\
+          set -o pipefail; \
           docker logs \
           --tail all \
           --follow \
@@ -570,6 +571,7 @@ jobs:
           --ssh-flag="-o ServerAliveInterval=5" \
           --command \
           "\
+          set -o pipefail; \
           docker logs \
           --tail ${{ env.EXTRA_LOG_LINES }} \
           --follow \
@@ -627,6 +629,7 @@ jobs:
           --ssh-flag="-o ServerAliveInterval=5" \
           --command \
           "\
+          set -o pipefail; \
           docker logs \
           --tail ${{ env.EXTRA_LOG_LINES }} \
           --follow \

From d35e0c40415c12fe520e2372e33df8edd7a409f2 Mon Sep 17 00:00:00 2001
From: teor <teor@riseup.net>
Date: Wed, 29 Jun 2022 10:37:20 +1000
Subject: [PATCH 07/13] Make tee ignore errors writing to a grep pipe

---
 .github/workflows/deploy-gcp-tests.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml
index 19455d0d78a..46276fbeca2 100644
--- a/.github/workflows/deploy-gcp-tests.yml
+++ b/.github/workflows/deploy-gcp-tests.yml
@@ -517,7 +517,7 @@ jobs:
           --tail all \
           --follow \
           ${{ inputs.test_id }} | \
-          tee /dev/stderr | \
+          tee --output-error=exit-nopipe /dev/stderr | \
           grep --max-count=1 --extended-regexp --color=always \
           '(estimated progress.*network_upgrade.*=.*Sapling)|(test result:.*finished in)' \
           "
@@ -576,7 +576,7 @@ jobs:
           --tail ${{ env.EXTRA_LOG_LINES }} \
           --follow \
           ${{ inputs.test_id }} | \
-          tee /dev/stderr | \
+          tee --output-error=exit-nopipe /dev/stderr | \
           grep --max-count=1 --extended-regexp --color=always \
           '(verified final checkpoint)|(test result:.*finished in)' \
           "
@@ -634,7 +634,7 @@ jobs:
           --tail ${{ env.EXTRA_LOG_LINES }} \
           --follow \
           ${{ inputs.test_id }} | \
-          tee /dev/stderr | \
+          tee --output-error=exit-nopipe /dev/stderr | \
           grep --max-count=1 --extended-regexp --color=always \
           'test result:.*finished in' \
           "

From a4857ff13123cd194caed445a63721e8a5935aa5 Mon Sep 17 00:00:00 2001
From: teor <teor@riseup.net>
Date: Wed, 29 Jun 2022 10:48:48 +1000
Subject: [PATCH 08/13] Avoid launching multiple docker instances for duplicate
 jobs

---
 .github/workflows/deploy-gcp-tests.yml | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml
index 46276fbeca2..31b7609639e 100644
--- a/.github/workflows/deploy-gcp-tests.yml
+++ b/.github/workflows/deploy-gcp-tests.yml
@@ -162,9 +162,8 @@ jobs:
   launch-without-cached-state:
     name: Launch ${{ inputs.test_id }} test
     needs: [ setup-without-cached-state ]
-    # If the previous job fails, we also want to run and fail this job,
-    # so that the branch protection rule fails in Mergify and GitHub.
-    if: ${{ !cancelled() && !inputs.needs_zebra_state }}
+    # If creating the Google Cloud instance fails, we don't want to launch another docker instance.
+    if: ${{ !cancelled() && !failure() && !inputs.needs_zebra_state }}
     runs-on: ubuntu-latest
     permissions:
       contents: 'read'
@@ -339,9 +338,8 @@ jobs:
   launch-with-cached-state:
     name: Launch ${{ inputs.test_id }} test
     needs: [ setup-with-cached-state ]
-    # If the previous job fails, we also want to run and fail this job,
-    # so that the branch protection rule fails in Mergify and GitHub.
-    if: ${{ !cancelled() && inputs.needs_zebra_state }}
+    # If creating the Google Cloud instance fails, we don't want to launch another docker instance.
+    if: ${{ !cancelled() && !failure() && inputs.needs_zebra_state }}
     runs-on: ubuntu-latest
     permissions:
       contents: 'read'
@@ -465,10 +463,9 @@ jobs:
   # follow the logs of the test we just launched, up to Sapling activation (or the test finishing)
   logs-sprout:
     name: Log ${{ inputs.test_id }} test (sprout)
-    needs: [ launch-with-cached-state, launch-without-cached-state ]
     # We run exactly one of without-cached-state or with-cached-state, and we always skip the other one.
-    # If the previous job fails, we also want to run and fail this job,
-    # so that the branch protection rule fails in Mergify and GitHub.
+    needs: [ launch-with-cached-state, launch-without-cached-state ]
+    # If the previous job fails, we still want to show the logs.
     if: ${{ !cancelled() }}
     runs-on: ubuntu-latest
     permissions:
@@ -527,8 +524,7 @@ jobs:
   logs-checkpoint:
     name: Log ${{ inputs.test_id }} test (checkpoint)
     needs: [ logs-sprout ]
-    # If the previous job fails, we also want to run and fail this job,
-    # so that the branch protection rule fails in Mergify and GitHub.
+    # If the previous job fails, we still want to show the logs.
     if: ${{ !cancelled() }}
     runs-on: ubuntu-latest
     permissions:
@@ -585,8 +581,7 @@ jobs:
   logs-end:
     name: Log ${{ inputs.test_id }} test (end)
     needs: [ logs-checkpoint ]
-    # If the previous job fails, we also want to run and fail this job,
-    # so that the branch protection rule fails in Mergify and GitHub.
+    # If the previous job fails, we still want to show the logs.
     if: ${{ !cancelled() }}
     runs-on: ubuntu-latest
     permissions:

From d04dd661217b98fc4a3a06ecc6f94b84536271b9 Mon Sep 17 00:00:00 2001
From: teor <teor@riseup.net>
Date: Wed, 29 Jun 2022 12:42:09 +1000
Subject: [PATCH 09/13] Ignore broken pipe error messages and statuses

---
 .github/workflows/deploy-gcp-tests.yml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml
index 31b7609639e..6710ba3e5c9 100644
--- a/.github/workflows/deploy-gcp-tests.yml
+++ b/.github/workflows/deploy-gcp-tests.yml
@@ -499,7 +499,10 @@ jobs:
           token_format: 'access_token'
 
       # Show all the logs since the container launched,
-      # following until Sapling activation (or the test finishes)
+      # following until Sapling activation (or the test finishes).
+      #
+      # The log pipeline ignores the exit status of `docker logs`.
+      # Errors in the tests are caught by the final test status job.
       - name: Show logs for ${{ inputs.test_id }} test (sprout)
         run: |
           gcloud compute ssh \
@@ -509,12 +512,11 @@ jobs:
           --ssh-flag="-o ServerAliveInterval=5" \
           --command \
           "\
-          set -o pipefail; \
           docker logs \
           --tail all \
           --follow \
           ${{ inputs.test_id }} | \
-          tee --output-error=exit-nopipe /dev/stderr | \
+          tee --output-error=exit /dev/stderr | \
           grep --max-count=1 --extended-regexp --color=always \
           '(estimated progress.*network_upgrade.*=.*Sapling)|(test result:.*finished in)' \
           "
@@ -567,12 +569,11 @@ jobs:
           --ssh-flag="-o ServerAliveInterval=5" \
           --command \
           "\
-          set -o pipefail; \
           docker logs \
           --tail ${{ env.EXTRA_LOG_LINES }} \
           --follow \
           ${{ inputs.test_id }} | \
-          tee --output-error=exit-nopipe /dev/stderr | \
+          tee --output-error=exit /dev/stderr | \
           grep --max-count=1 --extended-regexp --color=always \
           '(verified final checkpoint)|(test result:.*finished in)' \
           "
@@ -624,12 +625,11 @@ jobs:
           --ssh-flag="-o ServerAliveInterval=5" \
           --command \
           "\
-          set -o pipefail; \
           docker logs \
           --tail ${{ env.EXTRA_LOG_LINES }} \
           --follow \
           ${{ inputs.test_id }} | \
-          tee --output-error=exit-nopipe /dev/stderr | \
+          tee --output-error=exit /dev/stderr | \
           grep --max-count=1 --extended-regexp --color=always \
           'test result:.*finished in' \
           "

From f8d417639bcabb62ef6f7f297f0840a7cf70a660 Mon Sep 17 00:00:00 2001
From: Gustavo Valverde <gustavo@iterativo.do>
Date: Wed, 29 Jun 2022 16:46:02 -0400
Subject: [PATCH 10/13] fix(ci): docker wait not finding container

We had this issue before, I can't recall if this was a parsing error between GitHub Actions and gcloud `--command` parsing, but we had to change this into two pieces.

This implementation keeps it how we did it before https://github.com/ZcashFoundation/zebra/blob/9b9578c99975952a291006dde8d2828fd3e97799/.github/workflows/test.yml#L235-L243
---
 .github/workflows/deploy-gcp-tests.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml
index 6710ba3e5c9..dd75b76870d 100644
--- a/.github/workflows/deploy-gcp-tests.yml
+++ b/.github/workflows/deploy-gcp-tests.yml
@@ -680,15 +680,15 @@ jobs:
       # `docker wait` can also wait for multiple containers, but we only ever wait for a single container.
       - name: Result of ${{ inputs.test_id }} test
         run: |
+          EXIT_CODE=$(\
           gcloud compute ssh \
           ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
           --zone ${{ env.ZONE }} \
           --quiet \
           --ssh-flag="-o ServerAliveInterval=5" \
-          --command \
-          "\
-          exit $(docker wait ${{ inputs.test_id }}) \
-          "
+          --command="docker wait ${{ inputs.test_id }}")
+
+          exit ${EXIT_CODE}
 
 
   # create a state image from the instance's state disk, if requested by the caller

From 3ae36af4633c479c31c0f7b47a6fc4727c77ba91 Mon Sep 17 00:00:00 2001
From: Gustavo Valverde <gustavo@iterativo.do>
Date: Wed, 29 Jun 2022 16:57:06 -0400
Subject: [PATCH 11/13] docs: remove pending TODO

We can't remove  `actions/checkout` nor set `create_credentials_file` to `false` as next steps won't be able to authenticate to GCP.

We can surely remove `actions/checkout` and leave `create_credentials_file` as `true`, but this will raise a warning on each step, and there's no benefit of doing so.
---
 .github/workflows/deploy-gcp-tests.yml | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml
index dd75b76870d..bbfddc1cf5a 100644
--- a/.github/workflows/deploy-gcp-tests.yml
+++ b/.github/workflows/deploy-gcp-tests.yml
@@ -100,8 +100,6 @@ jobs:
       contents: 'read'
       id-token: 'write'
     steps:
-      # TODO: can we delete this step and set create_credentials_file to false in Google Cloud?
-      # Or will that break the slug-action variables we use to find the instance?
       - uses: actions/checkout@v3.0.2
         with:
           persist-credentials: false
@@ -169,8 +167,6 @@ jobs:
       contents: 'read'
       id-token: 'write'
     steps:
-      # TODO: can we delete this step and set create_credentials_file to false in Google Cloud?
-      # Or will that break the slug-action variables we use to find the instance?
       - uses: actions/checkout@v3.0.2
         with:
           persist-credentials: false
@@ -345,8 +341,6 @@ jobs:
       contents: 'read'
       id-token: 'write'
     steps:
-      # TODO: can we delete this step and set create_credentials_file to false in Google Cloud?
-      # Or will that break the slug-action variables we use to find the instance?
       - uses: actions/checkout@v3.0.2
         with:
           persist-credentials: false
@@ -472,8 +466,6 @@ jobs:
       contents: 'read'
       id-token: 'write'
     steps:
-      # TODO: can we delete this step and set create_credentials_file to false in Google Cloud?
-      # Or will that break the slug-action variables we use to find the instance?
       - uses: actions/checkout@v3.0.2
         with:
           persist-credentials: false
@@ -533,8 +525,6 @@ jobs:
       contents: 'read'
       id-token: 'write'
     steps:
-      # TODO: can we delete this step and set create_credentials_file to false in Google Cloud?
-      # Or will that break the slug-action variables we use to find the instance?
       - uses: actions/checkout@v3.0.2
         with:
           persist-credentials: false
@@ -589,8 +579,6 @@ jobs:
       contents: 'read'
       id-token: 'write'
     steps:
-      # TODO: can we delete this step and set create_credentials_file to false in Google Cloud?
-      # Or will that break the slug-action variables we use to find the instance?
       - uses: actions/checkout@v3.0.2
         with:
           persist-credentials: false
@@ -648,8 +636,6 @@ jobs:
       contents: 'read'
       id-token: 'write'
     steps:
-      # TODO: can we delete this step and set create_credentials_file to false in Google Cloud?
-      # Or will that break the slug-action variables we use to find the instance?
       - uses: actions/checkout@v3.0.2
         with:
           persist-credentials: false
@@ -796,8 +782,6 @@ jobs:
       contents: 'read'
       id-token: 'write'
     steps:
-      # TODO: can we delete this step and set create_credentials_file to false in Google Cloud?
-      # Or will that break the slug-action variables we use to find the instance?
       - uses: actions/checkout@v3.0.2
         with:
           persist-credentials: false

From ef331624b419efe43bec7da41e03d04560aaa87a Mon Sep 17 00:00:00 2001
From: teor <teor@riseup.net>
Date: Thu, 30 Jun 2022 08:09:05 +1000
Subject: [PATCH 12/13] Show `docker wait` and `gcloud ssh` output

---
 .github/workflows/deploy-gcp-tests.yml | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml
index bbfddc1cf5a..602630e0a6a 100644
--- a/.github/workflows/deploy-gcp-tests.yml
+++ b/.github/workflows/deploy-gcp-tests.yml
@@ -672,9 +672,10 @@ jobs:
           --zone ${{ env.ZONE }} \
           --quiet \
           --ssh-flag="-o ServerAliveInterval=5" \
-          --command="docker wait ${{ inputs.test_id }}")
-
-          exit ${EXIT_CODE}
+          --command="docker wait ${{ inputs.test_id }}" \
+          )
+          echo "$EXIT_CODE"
+          exit "$EXIT_CODE"
 
 
   # create a state image from the instance's state disk, if requested by the caller

From 74a9ddaa64b2f7adfffe1af31bff7b270e7a615b Mon Sep 17 00:00:00 2001
From: teor <teor@riseup.net>
Date: Thu, 30 Jun 2022 11:58:29 +1000
Subject: [PATCH 13/13] If `docker wait` fails, get the exit code using `docker
 inspect`

---
 .github/workflows/deploy-gcp-tests.yml | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml
index 602630e0a6a..fedba1361af 100644
--- a/.github/workflows/deploy-gcp-tests.yml
+++ b/.github/workflows/deploy-gcp-tests.yml
@@ -662,20 +662,28 @@ jobs:
 
       # Wait for the container to finish, then exit with the test's exit status.
       #
-      # `docker wait` prints the container exit status as a string, but we need to exit `ssh` with that status.
-      # `docker wait` can also wait for multiple containers, but we only ever wait for a single container.
+      # If the container has already finished, `docker wait` should return its status.
+      # But sometimes this doesn't work, so we use `docker inspect` as a fallback.
+      #
+      # `docker wait` prints the container exit status as a string, but we need to exit the `ssh` command
+      # with that status.
+      # (`docker wait` can also wait for multiple containers, but we only ever wait for a single container.)
       - name: Result of ${{ inputs.test_id }} test
         run: |
-          EXIT_CODE=$(\
           gcloud compute ssh \
           ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \
           --zone ${{ env.ZONE }} \
           --quiet \
           --ssh-flag="-o ServerAliveInterval=5" \
-          --command="docker wait ${{ inputs.test_id }}" \
-          )
-          echo "$EXIT_CODE"
-          exit "$EXIT_CODE"
+          --command=' \
+          EXIT_STATUS=$( \
+          docker wait ${{ inputs.test_id }} || \
+          docker inspect --format "{{.State.ExitCode}}" ${{ inputs.test_id }} || \
+          echo "missing container, or missing exit status for container" \
+          ); \
+          echo "docker exit status: $EXIT_STATUS"; \
+          exit "$EXIT_STATUS" \
+          '
 
 
   # create a state image from the instance's state disk, if requested by the caller