diff --git a/.github/workflows/deploy-gcp-tests.yml b/.github/workflows/deploy-gcp-tests.yml index e6ae8ed59c7..fedba1361af 100644 --- a/.github/workflows/deploy-gcp-tests.yml +++ b/.github/workflows/deploy-gcp-tests.yml @@ -75,10 +75,19 @@ on: description: 'Application name for Google Cloud instance metadata' env: + # where we get the Docker image from IMAGE_NAME: zebrad-test GAR_BASE: us-docker.pkg.dev/zealous-zebra/zebra + # what kind of Google Cloud instance we want to launch ZONE: us-central1-a MACHINE_TYPE: c2d-standard-16 + # How many previous log lines we show at the start of each new log job. + # Increase this number if some log lines are skipped between jobs + # + # We want to show all the logs since the last job finished, + # but we don't know how long it will be between jobs. + # 200 lines is about 6-15 minutes of sync logs, or one panic log. + EXTRA_LOG_LINES: 200 jobs: # set up the test, if it doesn't use any cached state @@ -94,6 +103,7 @@ jobs: - uses: actions/checkout@v3.0.2 with: persist-credentials: false + fetch-depth: '2' - name: Inject slug/short variables uses: rlespinasse/github-slug-action@v4 @@ -150,9 +160,8 @@ jobs: launch-without-cached-state: name: Launch ${{ inputs.test_id }} test needs: [ setup-without-cached-state ] - # If the previous job fails, we also want to run and fail this job, - # so that the branch protection rule fails in Mergify and GitHub. - if: ${{ !cancelled() && !inputs.needs_zebra_state }} + # If creating the Google Cloud instance fails, we don't want to launch another docker instance. 
+ if: ${{ !cancelled() && !failure() && !inputs.needs_zebra_state }} runs-on: ubuntu-latest permissions: contents: 'read' @@ -161,6 +170,7 @@ jobs: - uses: actions/checkout@v3.0.2 with: persist-credentials: false + fetch-depth: '2' - name: Inject slug/short variables uses: rlespinasse/github-slug-action@v4 @@ -324,9 +334,8 @@ jobs: launch-with-cached-state: name: Launch ${{ inputs.test_id }} test needs: [ setup-with-cached-state ] - # If the previous job fails, we also want to run and fail this job, - # so that the branch protection rule fails in Mergify and GitHub. - if: ${{ !cancelled() && inputs.needs_zebra_state }} + # If creating the Google Cloud instance fails, we don't want to launch another docker instance. + if: ${{ !cancelled() && !failure() && inputs.needs_zebra_state }} runs-on: ubuntu-latest permissions: contents: 'read' @@ -445,13 +454,12 @@ jobs: " - # follow the logs of the test we just launched - follow-logs: - name: Show logs for ${{ inputs.test_id }} test - needs: [ launch-with-cached-state, launch-without-cached-state ] + # follow the logs of the test we just launched, up to Sapling activation (or the test finishing) + logs-sprout: + name: Log ${{ inputs.test_id }} test (sprout) # We run exactly one of without-cached-state or with-cached-state, and we always skip the other one. - # If the previous job fails, we also want to run and fail this job, - # so that the branch protection rule fails in Mergify and GitHub. + needs: [ launch-with-cached-state, launch-without-cached-state ] + # If the previous job fails, we still want to show the logs. 
if: ${{ !cancelled() }} runs-on: ubuntu-latest permissions: @@ -461,6 +469,7 @@ jobs: - uses: actions/checkout@v3.0.2 with: persist-credentials: false + fetch-depth: '2' - name: Inject slug/short variables uses: rlespinasse/github-slug-action@v4 @@ -481,8 +490,12 @@ jobs: service_account: 'github-service-account@zealous-zebra.iam.gserviceaccount.com' token_format: 'access_token' - # Show all the logs since the container launched - - name: Show logs for ${{ inputs.test_id }} test + # Show all the logs since the container launched, + # following until Sapling activation (or the test finishes). + # + # The log pipeline ignores the exit status of `docker logs`. + # Errors in the tests are caught by the final test status job. + - name: Show logs for ${{ inputs.test_id }} test (sprout) run: | gcloud compute ssh \ ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ @@ -494,14 +507,127 @@ jobs: docker logs \ --tail all \ --follow \ - ${{ inputs.test_id }} \ + ${{ inputs.test_id }} | \ + tee --output-error=exit /dev/stderr | \ + grep --max-count=1 --extended-regexp --color=always \ + '(estimated progress.*network_upgrade.*=.*Sapling)|(test result:.*finished in)' \ + " + + # follow the logs of the test we just launched, up to the last checkpoint (or the test finishing) + # TODO: split out sapling logs when the mandatory checkpoint is above NU5 activation + logs-checkpoint: + name: Log ${{ inputs.test_id }} test (checkpoint) + needs: [ logs-sprout ] + # If the previous job fails, we still want to show the logs. 
+ if: ${{ !cancelled() }} + runs-on: ubuntu-latest + permissions: + contents: 'read' + id-token: 'write' + steps: + - uses: actions/checkout@v3.0.2 + with: + persist-credentials: false + fetch-depth: '2' + + - name: Inject slug/short variables + uses: rlespinasse/github-slug-action@v4 + with: + short-length: 7 + + - name: Downcase network name for disks + run: | + NETWORK_CAPS=${{ inputs.network }} + echo "NETWORK=${NETWORK_CAPS,,}" >> $GITHUB_ENV + + # Setup gcloud CLI + - name: Authenticate to Google Cloud + id: auth + uses: google-github-actions/auth@v0.8.0 + with: + workload_identity_provider: 'projects/143793276228/locations/global/workloadIdentityPools/github-actions/providers/github-oidc' + service_account: 'github-service-account@zealous-zebra.iam.gserviceaccount.com' + token_format: 'access_token' + + # Show recent logs, following until the last checkpoint (or the test finishes) + - name: Show logs for ${{ inputs.test_id }} test (checkpoint) + run: | + gcloud compute ssh \ + ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --zone ${{ env.ZONE }} \ + --quiet \ + --ssh-flag="-o ServerAliveInterval=5" \ + --command \ + "\ + docker logs \ + --tail ${{ env.EXTRA_LOG_LINES }} \ + --follow \ + ${{ inputs.test_id }} | \ + tee --output-error=exit /dev/stderr | \ + grep --max-count=1 --extended-regexp --color=always \ + '(verified final checkpoint)|(test result:.*finished in)' \ + " + + # follow the logs of the test we just launched, until the test finishes + logs-end: + name: Log ${{ inputs.test_id }} test (end) + needs: [ logs-checkpoint ] + # If the previous job fails, we still want to show the logs. 
+ if: ${{ !cancelled() }} + runs-on: ubuntu-latest + permissions: + contents: 'read' + id-token: 'write' + steps: + - uses: actions/checkout@v3.0.2 + with: + persist-credentials: false + fetch-depth: '2' + + - name: Inject slug/short variables + uses: rlespinasse/github-slug-action@v4 + with: + short-length: 7 + + - name: Downcase network name for disks + run: | + NETWORK_CAPS=${{ inputs.network }} + echo "NETWORK=${NETWORK_CAPS,,}" >> $GITHUB_ENV + + # Setup gcloud CLI + - name: Authenticate to Google Cloud + id: auth + uses: google-github-actions/auth@v0.8.0 + with: + workload_identity_provider: 'projects/143793276228/locations/global/workloadIdentityPools/github-actions/providers/github-oidc' + service_account: 'github-service-account@zealous-zebra.iam.gserviceaccount.com' + token_format: 'access_token' + + # Show recent logs, following until the test finishes + - name: Show logs for ${{ inputs.test_id }} test (end) + run: | + gcloud compute ssh \ + ${{ inputs.test_id }}-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }} \ + --zone ${{ env.ZONE }} \ + --quiet \ + --ssh-flag="-o ServerAliveInterval=5" \ + --command \ + "\ + docker logs \ + --tail ${{ env.EXTRA_LOG_LINES }} \ + --follow \ + ${{ inputs.test_id }} | \ + tee --output-error=exit /dev/stderr | \ + grep --max-count=1 --extended-regexp --color=always \ + 'test result:.*finished in' \ " + # wait for the result of the test test-result: # TODO: update the job name here, and in the branch protection rules name: Run ${{ inputs.test_id }} test - needs: [ follow-logs ] + needs: [ logs-end ] # If the previous job fails, we also want to run and fail this job, # so that the branch protection rule fails in Mergify and GitHub. 
if: ${{ !cancelled() }} @@ -513,6 +639,7 @@ jobs: - uses: actions/checkout@v3.0.2 with: persist-credentials: false + fetch-depth: '2' - name: Inject slug/short variables uses: rlespinasse/github-slug-action@v4 @@ -535,8 +662,12 @@ jobs: # Wait for the container to finish, then exit with the test's exit status. # - # `docker wait` prints the container exit status as a string, but we need to exit `ssh` with that status. - # `docker wait` can also wait for multiple containers, but we only ever wait for a single container. + # If the container has already finished, `docker wait` should return its status. + # But sometimes this doesn't work, so we use `docker inspect` as a fallback. + # + # `docker wait` prints the container exit status as a string, but we need to exit the `ssh` command + # with that status. + # (`docker wait` can also wait for multiple containers, but we only ever wait for a single container.) - name: Result of ${{ inputs.test_id }} test run: | gcloud compute ssh \ @@ -544,10 +675,15 @@ jobs: --zone ${{ env.ZONE }} \ --quiet \ --ssh-flag="-o ServerAliveInterval=5" \ - --command \ - "\ - exit $(docker wait ${{ inputs.test_id }}) \ - " + --command=' \ + EXIT_STATUS=$( \ + docker wait ${{ inputs.test_id }} || \ + docker inspect --format "{{.State.ExitCode}}" ${{ inputs.test_id }} || \ + echo "missing container, or missing exit status for container" \ + ); \ + echo "docker exit status: $EXIT_STATUS"; \ + exit "$EXIT_STATUS" \ + ' # create a state image from the instance's state disk, if requested by the caller @@ -563,6 +699,11 @@ jobs: contents: 'read' id-token: 'write' steps: + - uses: actions/checkout@v3.0.2 + with: + persist-credentials: false + fetch-depth: '2' + - name: Inject slug/short variables uses: rlespinasse/github-slug-action@v4 with: @@ -650,6 +791,11 @@ jobs: contents: 'read' id-token: 'write' steps: + - uses: actions/checkout@v3.0.2 + with: + persist-credentials: false + fetch-depth: '2' + - name: Inject slug/short variables uses: 
rlespinasse/github-slug-action@v4 with: