diff --git a/.github/workflows/test-full-sync.yml b/.github/workflows/test-full-sync.yml
index 418bcc3c95b..12dd967ada8 100644
--- a/.github/workflows/test-full-sync.yml
+++ b/.github/workflows/test-full-sync.yml
@@ -55,11 +55,46 @@ env:
   MACHINE_TYPE: c2d-standard-16
 
 jobs:
+  # Fails the workflow early when the local state version is lower than the newest cached tip state from main.
+  validate-state:
+    name: Validate local state version vs cached state
+    runs-on: ubuntu-latest
+    permissions:
+      contents: 'read'
+      id-token: 'write'
+    steps:
+      - uses: actions/checkout@v3.0.2
+        with:
+          persist-credentials: false
+
+      - name: Authenticate to Google Cloud
+        id: auth
+        uses: google-github-actions/auth@v0.7.1
+        with:
+          workload_identity_provider: 'projects/143793276228/locations/global/workloadIdentityPools/github-actions/providers/github-oidc'
+          service_account: 'github-service-account@zealous-zebra.iam.gserviceaccount.com'
+          token_format: 'access_token'
+
+      # Before executing any further steps, validate the local state and remote version are the same,
+      # or at least that the local state version is greater than the available cached state version from main.
+      - name: Validate constants.rs version vs cached state version
+        id: validate-state-version
+        run: |
+          LOCAL_STATE_VERSION=$(grep -oE "DATABASE_FORMAT_VERSION: .* [0-9]+" "$GITHUB_WORKSPACE/zebra-state/src/constants.rs" | grep -oE "[0-9]+" | tail -n1)
+          echo "LOCAL_STATE_VERSION: $LOCAL_STATE_VERSION"
+
+          GCP_STATE_DISK=$(gcloud compute images list --filter="name~zebrad-cache-main AND name~-tip" --format="value(NAME)" --sort-by=~creationTimestamp --limit=1)
+          GCP_STATE_VERSION=$(echo "$GCP_STATE_DISK" | grep -oE "v[0-9]+" | grep -oE "[0-9]+")
+          echo "GCP_STATE_VERSION: $GCP_STATE_VERSION"
+
+          if [[ "$LOCAL_STATE_VERSION" -lt "$GCP_STATE_VERSION" ]]; then echo "Local version is lower than cached version" && exit 1; fi
+
   build:
     # TODO add `startsWith(github.head_ref, 'mergify/merge-queue/')` to the condition to
     # only run on Mergify head branches, and on manual dispatch:
     # https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#running-your-workflow-based-on-the-head-or-base-branch-of-a-pull-request-1
     if: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }}
+    needs: validate-state
     uses: ./.github/workflows/docker-image-build.yml
     with:
       dockerfile_path: ./docker/Dockerfile
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 3c92896cc4a..2e236bc02a8 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -61,7 +61,58 @@ env:
   MACHINE_TYPE: c2d-standard-4
 
 jobs:
+  validate-state:
+    name: Validate local state version vs cached state
+    runs-on: ubuntu-latest
+    outputs:
+      any_changed: ${{ steps.changed-files-specific.outputs.any_changed }}
+    permissions:
+      contents: 'read'
+      id-token: 'write'
+    steps:
+      - uses: actions/checkout@v3.0.2
+        with:
+          persist-credentials: false
+          # tj-actions/changed-files needs the previous commit to diff against on push events,
+          # so keep the same history depth the regenerate-stateful-disks checkout uses.
+          fetch-depth: '2'
+
+      - name: Authenticate to Google Cloud
+        id: auth
+        uses: google-github-actions/auth@v0.7.1
+        with:
+          workload_identity_provider: 'projects/143793276228/locations/global/workloadIdentityPools/github-actions/providers/github-oidc'
+          service_account: 'github-service-account@zealous-zebra.iam.gserviceaccount.com'
+          token_format: 'access_token'
+
+      # If we change the version we want to rebuild the cached state disks,
+      # so we want to run the regenerate-stateful-disks to use the newer state in tests.
+      #
+      # If we change the state path without changing the version,
+      # the regenerate-stateful-disks job will take a few hours because it will do a full rebuild.
+      - name: Get specific changed files
+        id: changed-files-specific
+        uses: tj-actions/changed-files@v19
+        with:
+          files: |
+            zebra-state/**/constants.rs
+
+      # Before executing any further steps, validate the local state and remote version are the same,
+      # or at least that the local state version is greater than the available cached state version from main.
+      - name: Validate constants.rs version vs cached state version
+        id: validate-state-version
+        run: |
+          LOCAL_STATE_VERSION=$(grep -oE "DATABASE_FORMAT_VERSION: .* [0-9]+" "$GITHUB_WORKSPACE/zebra-state/src/constants.rs" | grep -oE "[0-9]+" | tail -n1)
+          echo "LOCAL_STATE_VERSION: $LOCAL_STATE_VERSION"
+
+          GCP_STATE_DISK=$(gcloud compute images list --filter="name~zebrad-cache-main AND name~-checkpoint" --format="value(NAME)" --sort-by=~creationTimestamp --limit=1)
+          GCP_STATE_VERSION=$(echo "$GCP_STATE_DISK" | grep -oE "v[0-9]+" | grep -oE "[0-9]+")
+          echo "GCP_STATE_VERSION: $GCP_STATE_VERSION"
+
+          if [[ "$LOCAL_STATE_VERSION" -lt "$GCP_STATE_VERSION" ]]; then echo "Local version is lower than cached version" && exit 1; fi
+
   build:
+    needs: validate-state
     uses: ./.github/workflows/docker-image-build.yml
     with:
       dockerfile_path: ./docker/Dockerfile
@@ -153,9 +204,7 @@ jobs:
   regenerate-stateful-disks:
     name: Regenerate stateful disks
     runs-on: ubuntu-latest
-    needs: build
-    outputs:
-      any_changed: ${{ steps.changed-files-specific.outputs.any_changed }}
+    needs: [build, validate-state]
     permissions:
       contents: 'read'
       id-token: 'write'
@@ -165,21 +214,6 @@ jobs:
           persist-credentials: false
           fetch-depth: '2'
 
-      # TODO move the `changed-files-specific` step to the build job for a better dependency tree
-      # Only run this job if the database format version has (likely) changed.
-      #
-      # If we have accidentally changed the format, but not changed the version,
-      # we want to run with the old cached state, so this job fails.
-      #
-      # If we change the state path without changing the version,
-      # this job will take a few hours, because it will do a full rebuild.
-      - name: Get specific changed files
-        id: changed-files-specific
-        uses: tj-actions/changed-files@v19
-        with:
-          files: |
-            zebra-state/**/constants.rs
-
       - name: Inject slug/short variables
         uses: rlespinasse/github-slug-action@v4
         with:
@@ -200,8 +234,8 @@ jobs:
           token_format: 'access_token'
 
       - name: Create GCP compute instance
+        if: ${{ needs.validate-state.outputs.any_changed == 'true' || github.event.inputs.regenerate-disks == 'true' }}
         id: create-instance
-        if: ${{ steps.changed-files-specific.outputs.any_changed == 'true' || github.event.inputs.regenerate-disks == 'true' || github.event_name == 'push'}}
         run: |
           gcloud compute instances create-with-container "regenerate-disk-${{ env.GITHUB_REF_SLUG_URL }}-${{ env.GITHUB_SHA_SHORT }}" \
           --boot-disk-size 100GB \
@@ -325,7 +359,6 @@ jobs:
             gcloud compute instances delete "${INSTANCE}" --zone "${{ env.ZONE }}" --delete-disks all --quiet
           fi
 
-
   # Test that Zebra syncs and fully validates a few thousand blocks from a cached post-checkpoint state
   test-stateful-sync:
     name: Test full validation sync from cached state
@@ -354,10 +387,40 @@ jobs:
           service_account: 'github-service-account@zealous-zebra.iam.gserviceaccount.com'
           token_format: 'access_token'
 
-      - name: Get disk state name from gcloud
-        id: get-disk-name
+      # If a triggering file for a rebuild was changed on a PR or we manually triggered a rebuild,
+      # use the latest available disk state, as the regenerate-stateful-disks job was executed.
+      #
+      # In the regenerate-stateful-disks job we don't validate the github.event_name because we
+      # want to ensure the disks are regenerated on PR syncs and push events (to main), and when
+      # manually triggered.
+      #
+      # In this scenario we also validate if a file like constants.rs was changed and a disk
+      # rebuild was triggered. If this happened on a PR we want to use the cached disk from
+      # the PR and not the cached disk from main. Otherwise we move to the next step, which is
+      # get-main-disk-name, as this was most likely triggered by a push in main, or constants.rs
+      # was not changed, which allows us to use the disk from main, as a disk rebuild did not happen,
+      # and if the rebuild happened then it was triggered on main and we want to use main's reference.
+      #
+      # NOTE(review): the `needs` context only exposes jobs listed in this job's own `needs:`.
+      # Confirm that test-stateful-sync depends on validate-state; otherwise any_changed is
+      # always empty here and only the manual regenerate-disks input can enable this step.
+      - name: Get latest state disk name from gcloud
+        id: get-latest-disk-name
+        if: ${{ (needs.validate-state.outputs.any_changed == 'true' && github.event_name == 'pull_request') || github.event.inputs.regenerate-disks == 'true' }}
+        run: |
+          GCP_STATE_DISK=$(gcloud compute images list --filter="name~zebrad-cache-${{ env.GITHUB_REF_SLUG_URL }} AND name~-checkpoint" --format="value(NAME)" --sort-by=~creationTimestamp --limit=1)
+          echo "Disk: $GCP_STATE_DISK"
+          echo "Description: $(gcloud compute images describe $GCP_STATE_DISK --format='value(DESCRIPTION)')"
+
+          echo "CACHED_DISK_NAME=$GCP_STATE_DISK" >> $GITHUB_ENV
+
+      # If the conditions of the get-latest-disk-name step were not met, that step is skipped
+      # (for example on a push event), and we need to use the cached state from the main branch.
+      - name: Get main state disk name from gcloud
+        id: get-main-disk-name
+        if: ${{ steps.get-latest-disk-name.outcome == 'skipped' }}
         run: |
-          GCP_STATE_DISK=$(gcloud compute images list --filter="name~zebrad-cache AND name~-checkpoint" --format="value(NAME)" --sort-by=~creationTimestamp --limit=1)
+          GCP_STATE_DISK=$(gcloud compute images list --filter="name~zebrad-cache-main AND name~-checkpoint" --format="value(NAME)" --sort-by=~creationTimestamp --limit=1)
           echo "Disk: $GCP_STATE_DISK"
           echo "Description: $(gcloud compute images describe $GCP_STATE_DISK --format='value(DESCRIPTION)')"
 