diff --git a/.github/workflows/docker-build-test.yaml b/.github/workflows/docker-build-test.yaml
index 10efb527cc79f..532773dcd8aaa 100644
--- a/.github/workflows/docker-build-test.yaml
+++ b/.github/workflows/docker-build-test.yaml
@@ -277,6 +277,7 @@ jobs:
       # by this GHA. If there is a Forge namespace collision, Forge will pre-empt the existing test running in the namespace.
       FORGE_NAMESPACE: forge-e2e-${{ needs.determine-docker-build-metadata.outputs.targetCacheId }}
       SKIP_JOB: ${{ needs.file_change_determinator.outputs.only_docs_changed == 'true' }}
+      SEND_RESULTS_TO_TRUNK: true

   # This job determines the last released docker image tag, which is used by forge compat test.
   fetch-last-released-docker-image-tag:
@@ -356,6 +357,7 @@ jobs:
       COMMENT_HEADER: forge-compat
       FORGE_NAMESPACE: forge-compat-${{ needs.determine-docker-build-metadata.outputs.targetCacheId }}
       SKIP_JOB: ${{ needs.file_change_determinator.outputs.only_docs_changed == 'true' }}
+      SEND_RESULTS_TO_TRUNK: true

   # Run forge framework upgradability test. This is a PR required job.
   forge-framework-upgrade-test:
@@ -385,6 +387,7 @@ jobs:
       COMMENT_HEADER: forge-framework-upgrade
       FORGE_NAMESPACE: forge-framework-upgrade-${{ needs.determine-docker-build-metadata.outputs.targetCacheId }}
       SKIP_JOB: ${{ !contains(github.event.pull_request.labels.*.name, 'CICD:run-framework-upgrade-test') && (needs.test-target-determinator.outputs.run_framework_upgrade_test == 'false') }}
+      SEND_RESULTS_TO_TRUNK: true

   forge-consensus-only-perf-test:
     needs:
diff --git a/.github/workflows/forge-stable.yaml b/.github/workflows/forge-stable.yaml
index 73947c9936a33..e9841704cff90 100644
--- a/.github/workflows/forge-stable.yaml
+++ b/.github/workflows/forge-stable.yaml
@@ -8,10 +8,6 @@ permissions:
   id-token: write
   actions: write #required for workflow cancellation via check-aptos-core

-concurrency:
-  group: forge-stable-${{ github.ref_name }}
-  cancel-in-progress: true
-
 on:
   # Allow triggering manually
   workflow_dispatch:
@@ -24,6 +20,35 @@ on:
         required: false
         type: string
         description: The git SHA1 to checkout. This affects the Forge test runner that is used. If not specified, the latest main will be used
+      TEST_NAME:
+        required: true
+        type: choice
+        description: The specific stable test to run. If 'all', all stable tests will be run
+        default: 'all'
+        options:
+          - all
+          - framework-upgrade-test
+          - realistic-env-load-sweep
+          - realistic-env-workload-sweep
+          - realistic-env-graceful-overload
+          - realistic-env-graceful-workload-sweep
+          - realistic-env-fairness-workload-sweep
+          - realistic-network-tuned-for-throughput
+          - consensus-stress-test
+          - workload-mix-test
+          - single-vfn-perf
+          - fullnode-reboot-stress-test
+          - compat
+          - changing-working-quorum-test
+          - changing-working-quorum-test-high-load
+          - pfn-const-tps-realistic-env
+          - realistic-env-max-load-long
+      JOB_PARALLELISM:
+        required: false
+        type: number
+        description: The number of test jobs to run in parallel. If not specified, defaults to 1
+        default: 1
+
   # NOTE: to support testing different branches on different schedules, you need to specify the cron schedule in the 'determine-test-branch' step as well below
   # Reference: https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows#schedule
   schedule:
@@ -33,6 +58,10 @@ on:
       - ".github/workflows/forge-stable.yaml"
       - "testsuite/find_latest_image.py"

+concurrency:
+  group: forge-stable-${{ format('{0}-{1}-{2}-{3}', github.ref_name, inputs.GIT_SHA, inputs.IMAGE_TAG, inputs.TEST_NAME) }}
+  cancel-in-progress: true
+
 env:
   AWS_ACCOUNT_NUM: ${{ secrets.ENV_ECR_AWS_ACCOUNT_NUM }}
   AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
@@ -41,10 +70,71 @@ env:
   AWS_REGION: us-west-2

 jobs:
+  generate-matrix:
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+      imageVariants: ${{ steps.set-matrix.outputs.imageVariants }}
+    steps:
+      - name: Compute matrix
+        id: set-matrix
+        uses: actions/github-script@v7
+        env:
+          TEST_NAME: ${{ inputs.TEST_NAME }}
+        with:
+          result-encoding: string
+          script: |
+            const testName = process.env.TEST_NAME || 'all';
+            console.log(`Running job: ${testName}`);
+            const tests = [
+              { TEST_NAME: 'framework-upgrade-test', FORGE_RUNNER_DURATION_SECS: 7200, FORGE_TEST_SUITE: 'framework_upgrade' },
+              { TEST_NAME: 'realistic-env-load-sweep', FORGE_RUNNER_DURATION_SECS: 1800, FORGE_TEST_SUITE: 'realistic_env_load_sweep' },
+              { TEST_NAME: 'realistic-env-workload-sweep', FORGE_RUNNER_DURATION_SECS: 2000, FORGE_TEST_SUITE: 'realistic_env_workload_sweep' },
+              { TEST_NAME: 'realistic-env-graceful-overload', FORGE_RUNNER_DURATION_SECS: 1200, FORGE_TEST_SUITE: 'realistic_env_graceful_overload' },
+              { TEST_NAME: 'realistic-env-graceful-workload-sweep', FORGE_RUNNER_DURATION_SECS: 2100, FORGE_TEST_SUITE: 'realistic_env_graceful_workload_sweep' },
+              { TEST_NAME: 'realistic-env-fairness-workload-sweep', FORGE_RUNNER_DURATION_SECS: 900, FORGE_TEST_SUITE: 'realistic_env_fairness_workload_sweep' },
+              { TEST_NAME: 'realistic-network-tuned-for-throughput', FORGE_RUNNER_DURATION_SECS: 900, FORGE_TEST_SUITE: 'realistic_network_tuned_for_throughput', FORGE_ENABLE_PERFORMANCE: true },
+              { TEST_NAME: 'consensus-stress-test', FORGE_RUNNER_DURATION_SECS: 2400, FORGE_TEST_SUITE: 'consensus_stress_test' },
+              { TEST_NAME: 'workload-mix-test', FORGE_RUNNER_DURATION_SECS: 900, FORGE_TEST_SUITE: 'workload_mix' },
+              { TEST_NAME: 'single-vfn-perf', FORGE_RUNNER_DURATION_SECS: 480, FORGE_TEST_SUITE: 'single_vfn_perf' },
+              { TEST_NAME: 'fullnode-reboot-stress-test', FORGE_RUNNER_DURATION_SECS: 1800, FORGE_TEST_SUITE: 'fullnode_reboot_stress_test' },
+              { TEST_NAME: 'compat', FORGE_RUNNER_DURATION_SECS: 300, FORGE_TEST_SUITE: 'compat' },
+              { TEST_NAME: 'changing-working-quorum-test', FORGE_RUNNER_DURATION_SECS: 1200, FORGE_TEST_SUITE: 'changing_working_quorum_test', FORGE_ENABLE_FAILPOINTS: true },
+              { TEST_NAME: 'changing-working-quorum-test-high-load', FORGE_RUNNER_DURATION_SECS: 900, FORGE_TEST_SUITE: 'changing_working_quorum_test_high_load', FORGE_ENABLE_FAILPOINTS: true },
+              { TEST_NAME: 'pfn-const-tps-realistic-env', FORGE_RUNNER_DURATION_SECS: 900, FORGE_TEST_SUITE: 'pfn_const_tps_with_realistic_env' },
+              { TEST_NAME: 'realistic-env-max-load-long', FORGE_RUNNER_DURATION_SECS: 7200, FORGE_TEST_SUITE: 'realistic_env_max_load_large' }
+            ];
+
+            const matrix = testName != "all" ? tests.filter(test => test.TEST_NAME === testName) : tests;
+            core.debug(`Matrix: ${JSON.stringify(matrix)}`);
+
+            core.summary.addHeading('Forge Stable Run');
+
+            const testsToRunNames = matrix.map(test => `${test.TEST_NAME} (Needs Failpoint: ${test.FORGE_ENABLE_FAILPOINTS || false}, Needs Performance: ${test.FORGE_ENABLE_PERFORMANCE || false})`);
+            core.summary.addRaw("The following tests will be run:", true);
+            core.summary.addList(testsToRunNames);
+
+            core.summary.write();
+
+            const needsFailpoints = matrix.some(test => test.FORGE_ENABLE_FAILPOINTS);
+            const needsPerformance = matrix.some(test => test.FORGE_ENABLE_PERFORMANCE);
+
+            let requiredImageVariants = [];
+            if (needsFailpoints) {
+              requiredImageVariants.push('failpoints');
+            }
+            if (needsPerformance) {
+              requiredImageVariants.push('performance');
+            }
+
+            core.setOutput('matrix', JSON.stringify({ include: matrix }));
+            core.setOutput('imageVariants', requiredImageVariants.join(' '));
+
   # This job determines the image tag and branch to test, and passes them to the other jobs
   # NOTE: this may be better as a separate workflow as the logic is quite complex but generalizable
   determine-test-metadata:
     runs-on: ubuntu-latest
+    needs: ["generate-matrix"]
     outputs:
       IMAGE_TAG: ${{ steps.get-docker-image-tag.outputs.IMAGE_TAG }}
       IMAGE_TAG_FOR_COMPAT_TEST: ${{ steps.get-last-released-image-tag-for-compat-test.outputs.IMAGE_TAG }}
@@ -102,13 +192,13 @@ jobs:
         id: get-docker-image-tag
         with:
           branch: ${{ steps.determine-test-branch.outputs.BRANCH }}
-          variants: "failpoints performance"
+          variants: ${{ needs.generate-matrix.outputs.imageVariants }}

       - uses: aptos-labs/aptos-core/.github/actions/determine-or-use-target-branch-and-get-last-released-image@main
         id: get-last-released-image-tag-for-compat-test
         with:
           base-branch: ${{ steps.determine-test-branch.outputs.BRANCH }}
-          variants: "failpoints performance"
+          variants: ${{ needs.generate-matrix.outputs.imageVariants }}

       - name: Write summary
         run: |
@@ -122,212 +212,22 @@ jobs:
           echo "IMAGE_TAG: [${IMAGE_TAG}](https://github.com/${{ github.repository }}/commit/${IMAGE_TAG})" >> $GITHUB_STEP_SUMMARY
           echo "To cancel this job, do `pnpm infra ci cancel-workflow ${{ github.run_id }}` from internal-ops" >> $GITHUB_STEP_SUMMARY

-  ### Real-world-network tests.
-  # Run forge framework upgradability test. This is a PR required job.
-  run-forge-framework-upgrade-test:
+  run:
+    name: forge-${{ matrix.TEST_NAME }}
+    needs: [determine-test-metadata, generate-matrix]
     if: ${{ github.event_name != 'pull_request' }}
-    needs:
-      - determine-test-metadata
-    uses: aptos-labs/aptos-core/.github/workflows/workflow-run-forge.yaml@main
-    secrets: inherit
-    with:
-      IMAGE_TAG: ${{ needs.determine-test-metadata.outputs.IMAGE_TAG_FOR_COMPAT_TEST }}
-      FORGE_NAMESPACE: forge-framework-upgrade-${{ needs.determine-test-metadata.outputs.BRANCH_HASH }}
-      FORGE_RUNNER_DURATION_SECS: 7200 # Run for 2 hours
-      FORGE_TEST_SUITE: framework_upgrade
-      POST_TO_SLACK: true
-
-  run-forge-realistic-env-load-sweep:
-    if: ${{ github.event_name != 'pull_request' && always() }}
-    needs: [determine-test-metadata, run-forge-framework-upgrade-test] # Only run after the previous job completes
-    uses: aptos-labs/aptos-core/.github/workflows/workflow-run-forge.yaml@main
-    secrets: inherit
-    with:
-      IMAGE_TAG: ${{ needs.determine-test-metadata.outputs.IMAGE_TAG }}
-      FORGE_NAMESPACE: forge-realistic-env-load-sweep-${{ needs.determine-test-metadata.outputs.BRANCH_HASH }}
-      FORGE_RUNNER_DURATION_SECS: 1800 # Run for 30 minutes (6 tests, each for 300 seconds)
-      FORGE_TEST_SUITE: realistic_env_load_sweep
-      POST_TO_SLACK: true
-
-  run-forge-realistic-env-workload-sweep:
-    if: ${{ github.event_name != 'pull_request' && always() }}
-    needs: [determine-test-metadata, run-forge-realistic-env-load-sweep] # Only run after the previous job completes
-    uses: aptos-labs/aptos-core/.github/workflows/workflow-run-forge.yaml@main
-    secrets: inherit
-    with:
-      IMAGE_TAG: ${{ needs.determine-test-metadata.outputs.IMAGE_TAG }}
-      FORGE_NAMESPACE: forge-realistic-env-workload-sweep-${{ needs.determine-test-metadata.outputs.BRANCH_HASH }}
-      FORGE_RUNNER_DURATION_SECS: 2000 # Run for 33 minutes (5 tests, each for 400 seconds)
-      FORGE_TEST_SUITE: realistic_env_workload_sweep
-      POST_TO_SLACK: true
-
-  run-forge-realistic-env-graceful-overload:
-    if: ${{ github.event_name != 'pull_request' && always() }}
-    needs: [determine-test-metadata, run-forge-realistic-env-workload-sweep] # Only run after the previous job completes
-    uses: aptos-labs/aptos-core/.github/workflows/workflow-run-forge.yaml@main
-    secrets: inherit
-    with:
-      IMAGE_TAG: ${{ needs.determine-test-metadata.outputs.IMAGE_TAG }}
-      FORGE_NAMESPACE: forge-realistic-env-graceful-overload-${{ needs.determine-test-metadata.outputs.BRANCH_HASH }}
-      FORGE_RUNNER_DURATION_SECS: 1200 # Run for 20 minutes
-      FORGE_TEST_SUITE: realistic_env_graceful_overload
-      POST_TO_SLACK: true
-
-  run-forge-realistic-env-graceful-workload-sweep:
-    if: ${{ github.event_name != 'pull_request' && always() }}
-    needs: [determine-test-metadata, run-forge-realistic-env-graceful-overload] # Only run after the previous job completes
-    uses: aptos-labs/aptos-core/.github/workflows/workflow-run-forge.yaml@main
-    secrets: inherit
-    with:
-      IMAGE_TAG: ${{ needs.determine-test-metadata.outputs.IMAGE_TAG }}
-      FORGE_NAMESPACE: forge-realistic-env-graceful-workload-sweep-${{ needs.determine-test-metadata.outputs.BRANCH_HASH }}
-      FORGE_RUNNER_DURATION_SECS: 2100 # Run for 5 minutes per test, 7 tests.
-      FORGE_TEST_SUITE: realistic_env_graceful_workload_sweep
-      POST_TO_SLACK: true
-
-  run-forge-realistic-env-fairness-workload-sweep:
-    if: ${{ github.event_name != 'pull_request' && always() }}
-    needs: [determine-test-metadata, run-forge-realistic-env-graceful-workload-sweep] # Only run after the previous job completes
-    uses: aptos-labs/aptos-core/.github/workflows/workflow-run-forge.yaml@main
-    secrets: inherit
-    with:
-      IMAGE_TAG: ${{ needs.determine-test-metadata.outputs.IMAGE_TAG }}
-      FORGE_NAMESPACE: forge-realistic-env-fairness-workload-sweep-${{ needs.determine-test-metadata.outputs.BRANCH_HASH }}
-      FORGE_RUNNER_DURATION_SECS: 900 # Run for 5 minutes per test, 3 tests.
-      FORGE_TEST_SUITE: realistic_env_fairness_workload_sweep
-      POST_TO_SLACK: true
-
-  run-forge-realistic-network-tuned-for-throughput:
-    if: ${{ github.event_name != 'pull_request' && always() }}
-    needs: [ determine-test-metadata, run-forge-realistic-env-fairness-workload-sweep ] # Only run after the previous job completes
-    uses: aptos-labs/aptos-core/.github/workflows/workflow-run-forge.yaml@main
-    secrets: inherit
-    with:
-      IMAGE_TAG: ${{ needs.determine-test-metadata.outputs.IMAGE_TAG }}
-      FORGE_NAMESPACE: forge-realistic-network-tuned-for-throughput-${{ needs.determine-test-metadata.outputs.BRANCH_HASH }}
-      FORGE_RUNNER_DURATION_SECS: 900 # Run for 15 minutes
-      FORGE_TEST_SUITE: realistic_network_tuned_for_throughput
-      FORGE_ENABLE_PERFORMANCE: true
-      POST_TO_SLACK: true
-
-  ### Forge Correctness/Componenet/Stress tests
-
-  run-forge-consensus-stress-test:
-    if: ${{ github.event_name != 'pull_request' && always() }}
-    needs: [determine-test-metadata, run-forge-realistic-network-tuned-for-throughput] # Only run after the previous job completes
-    uses: aptos-labs/aptos-core/.github/workflows/workflow-run-forge.yaml@main
-    secrets: inherit
-    with:
-      IMAGE_TAG: ${{ needs.determine-test-metadata.outputs.IMAGE_TAG }}
-      FORGE_NAMESPACE: forge-consensus-stress-test-${{ needs.determine-test-metadata.outputs.BRANCH_HASH }}
-      FORGE_RUNNER_DURATION_SECS: 2400 # Run for 40 minutes
-      FORGE_TEST_SUITE: consensus_stress_test
-      POST_TO_SLACK: true
-
-  run-forge-workload-mix-test:
-    if: ${{ github.event_name != 'pull_request' && always() }}
-    needs: [determine-test-metadata, run-forge-consensus-stress-test] # Only run after the previous job completes
-    uses: aptos-labs/aptos-core/.github/workflows/workflow-run-forge.yaml@main
-    secrets: inherit
-    with:
-      IMAGE_TAG: ${{ needs.determine-test-metadata.outputs.IMAGE_TAG }}
-      FORGE_NAMESPACE: forge-workload-mix-test-${{ needs.determine-test-metadata.outputs.BRANCH_HASH }}
-      FORGE_RUNNER_DURATION_SECS: 900 # Run for 15 minutes
-      FORGE_TEST_SUITE: workload_mix
-      POST_TO_SLACK: true
-
-  run-forge-single-vfn-perf:
-    if: ${{ github.event_name != 'pull_request' && always() }}
-    needs: [determine-test-metadata, run-forge-workload-mix-test] # Only run after the previous job completes
-    uses: aptos-labs/aptos-core/.github/workflows/workflow-run-forge.yaml@main
-    secrets: inherit
-    with:
-      IMAGE_TAG: ${{ needs.determine-test-metadata.outputs.IMAGE_TAG }}
-      FORGE_NAMESPACE: forge-continuous-e2e-single-vfn-${{ needs.determine-test-metadata.outputs.BRANCH_HASH }}
-      FORGE_RUNNER_DURATION_SECS: 480 # Run for 8 minutes
-      FORGE_TEST_SUITE: single_vfn_perf
-      POST_TO_SLACK: true
-
-  run-forge-fullnode-reboot-stress-test:
-    if: ${{ github.event_name != 'pull_request' && always() }}
-    needs: [determine-test-metadata, run-forge-single-vfn-perf] # Only run after the previous job completes
-    uses: aptos-labs/aptos-core/.github/workflows/workflow-run-forge.yaml@main
-    secrets: inherit
-    with:
-      IMAGE_TAG: ${{ needs.determine-test-metadata.outputs.IMAGE_TAG }}
-      FORGE_NAMESPACE: forge-fullnode-reboot-stress-${{ needs.determine-test-metadata.outputs.BRANCH_HASH }}
-      FORGE_RUNNER_DURATION_SECS: 1800 # Run for 30 minutes
-      FORGE_TEST_SUITE: fullnode_reboot_stress_test
-      POST_TO_SLACK: true
-
-  ### Compatibility Forge tests
-
-  run-forge-compat:
-    if: ${{ github.event_name != 'pull_request' && always() }}
-    needs: [determine-test-metadata, run-forge-fullnode-reboot-stress-test] # Only run after the previous job completes
-    uses: aptos-labs/aptos-core/.github/workflows/workflow-run-forge.yaml@main
-    secrets: inherit
-    with:
-      FORGE_NAMESPACE: forge-compat-${{ needs.determine-test-metadata.outputs.BRANCH_HASH }}
-      FORGE_RUNNER_DURATION_SECS: 300 # Run for 5 minutes
-      # This will upgrade from testnet branch to the latest main
-      FORGE_TEST_SUITE: compat
-      IMAGE_TAG: ${{ needs.determine-test-metadata.outputs.IMAGE_TAG_FOR_COMPAT_TEST }}
-      GIT_SHA: ${{ needs.determine-test-metadata.outputs.IMAGE_TAG }} # this is the git ref to checkout
-      POST_TO_SLACK: true
-
-  ### Changing working quorum Forge tests
-
-  run-forge-changing-working-quorum-test:
-    if: ${{ github.event_name != 'pull_request' && always() }}
-    needs: [determine-test-metadata, run-forge-compat] # Only run after the previous job completes
-    uses: aptos-labs/aptos-core/.github/workflows/workflow-run-forge.yaml@main
-    secrets: inherit
-    with:
-      IMAGE_TAG: ${{ needs.determine-test-metadata.outputs.IMAGE_TAG }}
-      FORGE_NAMESPACE: forge-changing-working-quorum-test-${{ needs.determine-test-metadata.outputs.BRANCH_HASH }}
-      FORGE_RUNNER_DURATION_SECS: 1200 # Run for 20 minutes
-      FORGE_TEST_SUITE: changing_working_quorum_test
-      POST_TO_SLACK: true
-      FORGE_ENABLE_FAILPOINTS: true
-
-  run-forge-changing-working-quorum-test-high-load:
-    if: ${{ github.event_name != 'pull_request' && always() }}
-    needs: [determine-test-metadata, run-forge-changing-working-quorum-test] # Only run after the previous job completes
-    uses: aptos-labs/aptos-core/.github/workflows/workflow-run-forge.yaml@main
-    secrets: inherit
-    with:
-      IMAGE_TAG: ${{ needs.determine-test-metadata.outputs.IMAGE_TAG }}
-      FORGE_NAMESPACE: forge-changing-working-quorum-test-high-load-${{ needs.determine-test-metadata.outputs.BRANCH_HASH }}
-      FORGE_RUNNER_DURATION_SECS: 900 # Run for 15 minutes
-      FORGE_TEST_SUITE: changing_working_quorum_test_high_load
-      POST_TO_SLACK: true
-      FORGE_ENABLE_FAILPOINTS: true
-
-  # Measures PFN latencies with a constant TPS (with a realistic environment)
-  run-forge-pfn-const-tps-realistic-env:
-    if: ${{ github.event_name != 'pull_request' && always() }}
-    needs: [determine-test-metadata, run-forge-changing-working-quorum-test-high-load] # Only run after the previous job completes
-    uses: aptos-labs/aptos-core/.github/workflows/workflow-run-forge.yaml@main
-    secrets: inherit
-    with:
-      IMAGE_TAG: ${{ needs.determine-test-metadata.outputs.IMAGE_TAG }}
-      FORGE_NAMESPACE: forge-pfn-const-tps-with-realistic-env-${{ needs.determine-test-metadata.outputs.BRANCH_HASH }}
-      FORGE_RUNNER_DURATION_SECS: 900 # Run for 15 minutes
-      FORGE_TEST_SUITE: pfn_const_tps_with_realistic_env
-      POST_TO_SLACK: true
-
-
-  # longest test for last, to get useful signal from short tests first
-
-  run-forge-realistic-env-max-load-long:
-    if: ${{ github.event_name != 'pull_request' && always() }}
-    needs: [determine-test-metadata, run-forge-pfn-const-tps-realistic-env] # Only run after the previous job completes
+    strategy:
+      fail-fast: false
+      max-parallel: ${{ inputs.JOB_PARALLELISM && fromJson(inputs.JOB_PARALLELISM) || 1 }}
+      matrix: ${{ fromJson(needs.generate-matrix.outputs.matrix) }}
     uses: aptos-labs/aptos-core/.github/workflows/workflow-run-forge.yaml@main
     secrets: inherit
     with:
-      IMAGE_TAG: ${{ needs.determine-test-metadata.outputs.IMAGE_TAG }}
-      FORGE_NAMESPACE: forge-realistic-env-max-load-long-${{ needs.determine-test-metadata.outputs.BRANCH_HASH }}
-      FORGE_RUNNER_DURATION_SECS: 7200 # Run for 2 hours
-      FORGE_TEST_SUITE: realistic_env_max_load_large
+      IMAGE_TAG: ${{ ((matrix.FORGE_TEST_SUITE == 'compat' && needs.determine-test-metadata.outputs.IMAGE_TAG_FOR_COMPAT_TEST) || needs.determine-test-metadata.outputs.IMAGE_TAG) }}
+      FORGE_NAMESPACE: forge-${{ matrix.TEST_NAME }}-${{ needs.determine-test-metadata.outputs.BRANCH_HASH }}
+      FORGE_TEST_SUITE: ${{ matrix.FORGE_TEST_SUITE }}
+      FORGE_RUNNER_DURATION_SECS: ${{ matrix.FORGE_RUNNER_DURATION_SECS }}
+      FORGE_ENABLE_PERFORMANCE: ${{ matrix.FORGE_ENABLE_PERFORMANCE || false }}
+      FORGE_ENABLE_FAILPOINTS: ${{ matrix.FORGE_ENABLE_FAILPOINTS || false }}
       POST_TO_SLACK: true
+      SEND_RESULTS_TO_TRUNK: true
\ No newline at end of file
diff --git a/.github/workflows/workflow-run-forge.yaml b/.github/workflows/workflow-run-forge.yaml
index 05d0251194f8c..bf085906a15e0 100644
--- a/.github/workflows/workflow-run-forge.yaml
+++ b/.github/workflows/workflow-run-forge.yaml
@@ -87,6 +87,10 @@ on:
         required: false
         type: string
        description: The deployer profile used to spin up and configure forge infrastructure
+      SEND_RESULTS_TO_TRUNK:
+        required: false
+        type: boolean
+        description: Send forge results to trunk.io

 env:
   AWS_ACCOUNT_NUM: ${{ secrets.ENV_ECR_AWS_ACCOUNT_NUM }}
@@ -118,6 +122,7 @@ env:
   VERBOSE: true
   FORGE_NUM_VALIDATORS: ${{ inputs.FORGE_NUM_VALIDATORS }}
   FORGE_NUM_VALIDATOR_FULLNODES: ${{ inputs.FORGE_NUM_VALIDATOR_FULLNODES }}
+  FORGE_JUNIT_XML_PATH: ${{ inputs.SEND_RESULTS_TO_TRUNK && '/tmp/test.xml' || '' }}

 # TODO: should we migrate this to a composite action, so that we can skip it
 # at the call site, and don't need to wrap each step in an if statement?
@@ -228,3 +233,14 @@ jobs:

       # Print out whether the job was skipped.
       - run: echo "Skipping forge test!"
        if: ${{ inputs.SKIP_JOB }}
+
+      - name: Upload results
+        # Run this step even if the test step ahead fails
+        if: ${{ !inputs.SKIP_JOB && inputs.SEND_RESULTS_TO_TRUNK && !cancelled() }}
+        uses: trunk-io/analytics-uploader@main
+        with:
+          # Configured in the nextest.toml file
+          junit-paths: ${{ env.FORGE_JUNIT_XML_PATH }}
+          org-slug: aptoslabs
+          token: ${{ secrets.TRUNK_API_TOKEN }}
+        continue-on-error: true
diff --git a/Cargo.lock b/Cargo.lock
index 91043badffca3..3281279bd27ff 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1767,6 +1767,7 @@ dependencies = [
  "num_cpus",
  "once_cell",
  "prometheus-http-query",
+ "quick-junit",
  "rand 0.7.3",
  "regex",
  "reqwest 0.11.23",
@@ -1774,11 +1775,13 @@ dependencies = [
  "serde_json",
  "serde_merge",
  "serde_yaml 0.8.26",
+ "sugars",
  "tempfile",
  "termcolor",
  "thiserror",
  "tokio",
  "url",
+ "uuid",
 ]

 [[package]]
@@ -1805,6 +1808,7 @@ dependencies = [
  "reqwest 0.11.23",
  "serde_json",
  "serde_yaml 0.8.26",
+ "sugars",
  "tokio",
  "url",
 ]
@@ -8873,7 +8877,7 @@ dependencies = [
  "fixedbitset 0.4.2",
  "guppy-summaries",
  "guppy-workspace-hack",
- "indexmap 2.2.5",
+ "indexmap 2.6.0",
  "itertools 0.12.1",
  "nested",
  "once_cell",
@@ -8922,7 +8926,7 @@ dependencies = [
  "futures-sink",
  "futures-util",
  "http 0.2.11",
- "indexmap 2.2.5",
+ "indexmap 2.6.0",
  "slab",
  "tokio",
  "tokio-util 0.7.10",
@@ -8941,7 +8945,7 @@ dependencies = [
  "futures-core",
  "futures-sink",
  "http 1.1.0",
- "indexmap 2.2.5",
+ "indexmap 2.6.0",
  "slab",
  "tokio",
  "tokio-util 0.7.10",
@@ -9021,6 +9025,12 @@ dependencies = [
  "allocator-api2",
 ]

+[[package]]
+name = "hashbrown"
+version = "0.15.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb"
+
 [[package]]
 name = "hdrhistogram"
 version = "7.5.4"
@@ -9714,12 +9724,12 @@ dependencies = [

 [[package]]
 name = "indexmap"
-version = "2.2.5"
+version = "2.6.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7b0b929d511467233429c45a44ac1dcaa21ba0f5ba11e4879e6ed28ddb4f9df4"
+checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da"
 dependencies = [
  "equivalent",
- "hashbrown 0.14.3",
+ "hashbrown 0.15.0",
  "serde",
 ]

@@ -9766,7 +9776,7 @@ dependencies = [
  "crossbeam-utils",
  "dashmap",
  "env_logger",
- "indexmap 2.2.5",
+ "indexmap 2.6.0",
  "is-terminal",
  "itoa",
  "log",
@@ -11898,6 +11908,15 @@ version = "1.0.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e4a24736216ec316047a1fc4252e27dabb04218aa4a3f37c6e7ddbf1f9782b54"

+[[package]]
+name = "newtype-uuid"
+version = "1.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4f4933943834e236c864a48aefdc2da43885dbd5eb77bff3ab20f31e0c3146f5"
+dependencies = [
+ "uuid",
+]
+
 [[package]]
 name = "nix"
 version = "0.26.4"
@@ -12600,7 +12619,7 @@ dependencies = [
  "ciborium",
  "coset",
  "data-encoding",
- "indexmap 2.2.5",
+ "indexmap 2.6.0",
  "rand 0.8.5",
  "serde",
  "serde_json",
@@ -12786,7 +12805,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db"
 dependencies = [
  "fixedbitset 0.4.2",
- "indexmap 2.2.5",
+ "indexmap 2.6.0",
 ]

 [[package]]
@@ -13058,7 +13077,7 @@ dependencies = [
  "bytes",
  "derive_more",
  "futures-util",
- "indexmap 2.2.5",
+ "indexmap 2.6.0",
  "mime",
  "num-traits",
  "poem",
@@ -13081,7 +13100,7 @@ source = "git+https://github.com/poem-web/poem.git?rev=809b2816d3504beeba140fef3
 dependencies = [
  "darling 0.20.9",
  "http 1.1.0",
- "indexmap 2.2.5",
+ "indexmap 2.6.0",
  "mime",
  "proc-macro-crate 3.1.0",
  "proc-macro2",
@@ -13830,6 +13849,21 @@ version = "1.2.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0"

+[[package]]
+name = "quick-junit"
+version = "0.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "62ffd2f9a162cfae131bed6d9d1ed60adced33be340a94f96952897d7cb0c240"
+dependencies = [
+ "chrono",
+ "indexmap 2.6.0",
+ "newtype-uuid",
+ "quick-xml 0.36.2",
+ "strip-ansi-escapes",
+ "thiserror",
+ "uuid",
+]
+
 [[package]]
 name = "quick-xml"
 version = "0.23.1"
@@ -13858,6 +13892,15 @@ dependencies = [
  "serde",
 ]

+[[package]]
+name = "quick-xml"
+version = "0.36.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f7649a7b4df05aed9ea7ec6f628c67c9953a43869b8bc50929569b2999d443fe"
+dependencies = [
+ "memchr",
+]
+
 [[package]]
 name = "quick_cache"
 version = "0.5.1"
@@ -15117,7 +15160,7 @@ version = "1.0.114"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c5f09b1bd632ef549eaa9f60a1f8de742bdbc698e6cee2095fc84dde5f549ae0"
 dependencies = [
- "indexmap 2.2.5",
+ "indexmap 2.6.0",
  "itoa",
  "ryu",
  "serde",
@@ -15196,7 +15239,7 @@ dependencies = [
  "chrono",
  "hex",
  "indexmap 1.9.3",
- "indexmap 2.2.5",
+ "indexmap 2.6.0",
  "serde",
  "serde_json",
  "serde_with_macros",
@@ -15233,7 +15276,7 @@ version = "0.9.30"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b1bf28c79a99f70ee1f1d83d10c875d2e70618417fda01ad1785e027579d9d38"
 dependencies = [
- "indexmap 2.2.5",
+ "indexmap 2.6.0",
  "itoa",
  "ryu",
  "serde",
@@ -15817,6 +15860,15 @@ dependencies = [
  "unicode-normalization",
 ]

+[[package]]
+name = "strip-ansi-escapes"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "55ff8ef943b384c414f54aefa961dd2bd853add74ec75e7ac74cf91dba62bcfa"
+dependencies = [
+ "vte",
+]
+
 [[package]]
 name = "strsim"
 version = "0.8.0"
@@ -15943,6 +15995,12 @@ version = "2.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "734676eb262c623cec13c3155096e08d1f8f29adce39ba17948b18dad1e54142"

+[[package]]
+name = "sugars"
+version = "3.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "cc0db74f9ee706e039d031a560bd7d110c7022f016051b3d33eeff9583e3e67a"
+
 [[package]]
 name = "symbolic-common"
 version = "10.2.1"
@@ -16267,18 +16325,18 @@ checksum = "222a222a5bfe1bba4a77b45ec488a741b3cb8872e5e499451fd7d0129c9c7c3d"

 [[package]]
 name = "thiserror"
-version = "1.0.61"
+version = "1.0.64"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709"
+checksum = "d50af8abc119fb8bb6dbabcfa89656f46f84aa0ac7688088608076ad2b459a84"
 dependencies = [
  "thiserror-impl",
 ]

 [[package]]
 name = "thiserror-impl"
-version = "1.0.61"
+version = "1.0.64"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533"
+checksum = "08904e7672f5eb876eaaf87e0ce17857500934f4981c4a0ab2b4aa98baac7fc3"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -16652,7 +16710,7 @@ version = "0.19.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1b5bb770da30e5cbfde35a2d7b9b8a2c4b8ef89548a7a6aeab5c9a576e3e7421"
 dependencies = [
- "indexmap 2.2.5",
+ "indexmap 2.6.0",
  "serde",
  "serde_spanned",
  "toml_datetime",
@@ -16665,7 +16723,7 @@ version = "0.20.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "396e4d48bbb2b7554c944bde63101b5ae446cff6ec4a24227428f15eb72ef338"
 dependencies = [
- "indexmap 2.2.5",
+ "indexmap 2.6.0",
  "toml_datetime",
  "winnow",
 ]
@@ -16676,7 +16734,7 @@ version = "0.21.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6a8534fd7f78b5405e860340ad6575217ce99f38d4d5c8f2442cb5ecb50090e1"
 dependencies = [
- "indexmap 2.2.5",
+ "indexmap 2.6.0",
  "toml_datetime",
  "winnow",
 ]
@@ -17361,9 +17419,9 @@ checksum = "711b9620af191e0cdc7468a8d14e709c3dcdb115b36f838e601583af800a370a"

 [[package]]
 name = "uuid"
-version = "1.9.1"
+version = "1.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "5de17fd2f7da591098415cff336e12965a28061ddace43b59cb3c430179c9439"
+checksum = "f8c5f0a0af699448548ad1a2fbf920fb4bee257eae39953ba95cb84891a0446a"
 dependencies = [
  "getrandom 0.2.11",
  "serde",
@@ -17415,6 +17473,26 @@ version = "0.9.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"

+[[package]]
+name = "vte"
+version = "0.11.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f5022b5fbf9407086c180e9557be968742d839e68346af7792b8592489732197"
+dependencies = [
+ "utf8parse",
+ "vte_generate_state_changes",
+]
+
+[[package]]
+name = "vte_generate_state_changes"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2e369bee1b05d510a7b4ed645f5faa90619e05437111783ea5848f28d97d3c2e"
+dependencies = [
+ "proc-macro2",
+ "quote",
+]
+
 [[package]]
 name = "wait-timeout"
 version = "0.2.0"
diff --git a/Cargo.toml b/Cargo.toml
index 2156a478de562..bac2fee860de1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -521,7 +521,12 @@ cfg_block = "0.1.1"
 cfg-if = "1.0.0"
 ciborium = "0.2"
 claims = "0.7"
-clap = { version = "4.3.9", features = ["derive", "env", "unstable-styles", "wrap_help"] }
+clap = { version = "4.3.9", features = [
+    "derive",
+    "env",
+    "unstable-styles",
+    "wrap_help",
+] }
 clap-verbosity-flag = "2.1.1"
 clap_complete = "4.4.1"
 cloud-storage = { version = "0.11.1", features = [
@@ -677,8 +682,14 @@ petgraph = "0.6.5"
 pin-project = "1.0.10"
 plotters = { version = "0.3.5", default-features = false }
 # We're using git deps until https://github.com/poem-web/poem/pull/829 gets formally released.
-poem = { git = "https://github.com/poem-web/poem.git", rev = "809b2816d3504beeba140fef3fdfe9432d654c5b", features = ["anyhow", "rustls"] }
-poem-openapi = { git = "https://github.com/poem-web/poem.git", rev = "809b2816d3504beeba140fef3fdfe9432d654c5b", features = ["swagger-ui", "url"] }
+poem = { git = "https://github.com/poem-web/poem.git", rev = "809b2816d3504beeba140fef3fdfe9432d654c5b", features = [
+    "anyhow",
+    "rustls",
+] }
+poem-openapi = { git = "https://github.com/poem-web/poem.git", rev = "809b2816d3504beeba140fef3fdfe9432d654c5b", features = [
+    "swagger-ui",
+    "url",
+] }
 poem-openapi-derive = { git = "https://github.com/poem-web/poem.git", rev = "809b2816d3504beeba140fef3fdfe9432d654c5b" }
 poseidon-ark = { git = "https://github.com/arnaucube/poseidon-ark.git", rev = "6d2487aa1308d9d3860a2b724c485d73095c1c68" }
 pprof = { version = "0.11", features = ["flamegraph", "protobuf-codec"] }
@@ -696,6 +707,7 @@ prost = { version = "0.12.3", features = ["no-recursion-limit"] }
 prost-types = "0.12.3"
 quanta = "0.10.1"
 quick_cache = "0.5.1"
+quick-junit = "0.5.0"
 quote = "1.0.18"
 rand = "0.7.3"
 rand_core = "0.5.1"
@@ -758,6 +770,7 @@ stats_alloc = "0.1.8"
 status-line = "0.2.0"
 strum = "0.24.1"
 strum_macros = "0.24.2"
+sugars = "3.0.1"
 syn = { version = "1.0.92", features = ["derive", "extra-traits"] }
 sysinfo = "0.28.4"
 tar = "0.4.40"
diff --git a/aptos-move/aptos-transactional-test-harness/src/aptos_test_harness.rs b/aptos-move/aptos-transactional-test-harness/src/aptos_test_harness.rs
index b9df74dea7734..1012f4307c152 100644
--- a/aptos-move/aptos-transactional-test-harness/src/aptos_test_harness.rs
+++ b/aptos-move/aptos-transactional-test-harness/src/aptos_test_harness.rs
@@ -519,11 +519,11 @@ impl<'a> AptosTestAdapter<'a> {
         let txn_block = vec![txn];
         let sig_verified_block = into_signature_verified_block(txn_block);
         let txn_provider = Arc::new(DefaultTxnProvider::new(sig_verified_block));
-        let onchain_config = BlockExecutorConfigFromOnchain {
+        let onchain_config = BlockExecutorConfigFromOnchain::new(
             // TODO fetch values from state?
             // Or should we just use execute_block_no_limit ?
-            block_gas_limit_type: BlockGasLimitType::Limit(30000),
-        };
+            BlockGasLimitType::Limit(30000),
+        );
         let (mut outputs, _) =
             AptosVM::execute_block(txn_provider, &self.storage.clone(), onchain_config)?
                 .into_inner();
diff --git a/aptos-move/block-executor/src/executor.rs b/aptos-move/block-executor/src/executor.rs
index f9339ac61a0a9..b837c661b5f7c 100644
--- a/aptos-move/block-executor/src/executor.rs
+++ b/aptos-move/block-executor/src/executor.rs
@@ -948,6 +948,7 @@ where
         let shared_commit_state = ExplicitSyncWrapper::new(BlockGasLimitProcessor::new(
             self.config.onchain.block_gas_limit_type.clone(),
+            self.config.onchain.block_gas_limit_override,
             num_txns,
         ));
         let shared_maybe_error = AtomicBool::new(false);
@@ -1120,6 +1121,7 @@ where
         let mut ret = Vec::with_capacity(num_txns);
         let mut block_limit_processor = BlockGasLimitProcessor::::new(
             self.config.onchain.block_gas_limit_type.clone(),
+            self.config.onchain.block_gas_limit_override,
             num_txns,
         );
diff --git a/aptos-move/block-executor/src/limit_processor.rs b/aptos-move/block-executor/src/limit_processor.rs
index b687da43e1e39..cf3d86115c547 100644
--- a/aptos-move/block-executor/src/limit_processor.rs
+++ b/aptos-move/block-executor/src/limit_processor.rs
@@ -13,6 +13,7 @@ use std::time::Instant;

 pub struct BlockGasLimitProcessor {
     block_gas_limit_type: BlockGasLimitType,
+    block_gas_limit_override: Option<u64>,
     accumulated_effective_block_gas: u64,
     accumulated_approx_output_size: u64,
     accumulated_fee_statement: FeeStatement,
@@ -23,9 +24,14 @@ pub struct BlockGasLimitProcessor {
 }

 impl BlockGasLimitProcessor {
-    pub fn new(block_gas_limit_type: BlockGasLimitType, init_size: usize) -> Self {
+    pub fn new(
+        block_gas_limit_type: BlockGasLimitType,
+        block_gas_limit_override: Option<u64>,
+        init_size: usize,
+    ) -> Self {
         Self {
             block_gas_limit_type,
+            block_gas_limit_override,
             accumulated_effective_block_gas: 0,
             accumulated_approx_output_size: 0,
             accumulated_fee_statement: FeeStatement::zero(),
@@ -118,8 +124,16 @@ impl BlockGasLimitProcessor {
         self.module_rw_conflict = true;
     }

+    fn block_gas_limit(&self) -> Option<u64> {
+        if self.block_gas_limit_override.is_some() {
+            self.block_gas_limit_override
+        } else {
+            self.block_gas_limit_type.block_gas_limit()
+        }
+    }
+
     fn should_end_block(&mut self, mode: &str) -> bool {
-        if let Some(per_block_gas_limit) = self.block_gas_limit_type.block_gas_limit() {
+        if let Some(per_block_gas_limit) = self.block_gas_limit() {
             // When the accumulated block gas of the committed txns exceeds
             // PER_BLOCK_GAS_LIMIT, early halt BlockSTM.
             let accumulated_block_gas = self.get_effective_accumulated_block_gas();
@@ -210,8 +224,8 @@ impl BlockGasLimitProcessor {
         info!(
             effective_block_gas = accumulated_effective_block_gas,
             block_gas_limit = self.block_gas_limit_type.block_gas_limit().unwrap_or(0),
+            block_gas_limit_override = self.block_gas_limit_override.unwrap_or(0),
             block_gas_limit_exceeded = self
-                .block_gas_limit_type
                 .block_gas_limit()
                 .map_or(false, |limit| accumulated_effective_block_gas >= limit),
             approx_output_size = accumulated_approx_output_size,
@@ -255,7 +269,6 @@ impl BlockGasLimitProcessor {
     pub(crate) fn get_block_end_info(&self) -> BlockEndInfo {
         BlockEndInfo::V0 {
             block_gas_limit_reached: self
-                .block_gas_limit_type
                 .block_gas_limit()
                 .map(|per_block_gas_limit| {
                     self.get_effective_accumulated_block_gas() >= per_block_gas_limit
@@ -302,7 +315,7 @@ mod test {

     #[test]
     fn test_output_limit_not_used() {
-        let mut processor = BlockGasLimitProcessor::::new(DEFAULT_COMPLEX_LIMIT, 10);
+        let mut processor = BlockGasLimitProcessor::::new(DEFAULT_COMPLEX_LIMIT, None, 10);

         // Assert passing none here doesn't panic.
         processor.accumulate_fee_statement(FeeStatement::zero(), None, None);
         assert!(!processor.should_end_block_parallel());
@@ -326,7 +339,7 @@ mod test {
             use_granular_resource_group_conflicts: false,
         };

-        let mut processor = BlockGasLimitProcessor::::new(block_gas_limit, 10);
+        let mut processor = BlockGasLimitProcessor::::new(block_gas_limit, None, 10);

         processor.accumulate_fee_statement(execution_fee(10), None, None);
         assert!(!processor.should_end_block_parallel());
@@ -350,7 +363,7 @@ mod test {
             use_granular_resource_group_conflicts: false,
         };

-        let mut processor = BlockGasLimitProcessor::::new(block_gas_limit, 10);
+        let mut processor = BlockGasLimitProcessor::::new(block_gas_limit, None, 10);

         processor.accumulate_fee_statement(FeeStatement::zero(), None, Some(10));
         assert_eq!(processor.accumulated_approx_output_size, 10);
@@ -390,7 +403,7 @@ mod test {
             use_granular_resource_group_conflicts: false,
         };

-        let mut processor = BlockGasLimitProcessor::::new(block_gas_limit, 10);
+        let mut processor = BlockGasLimitProcessor::::new(block_gas_limit, None, 10);

         processor.accumulate_fee_statement(
             execution_fee(10),
@@ -452,7 +465,7 @@ mod test {
             use_granular_resource_group_conflicts: true,
         };

-        let mut processor = BlockGasLimitProcessor::::new(block_gas_limit, 10);
+        let mut processor = BlockGasLimitProcessor::::new(block_gas_limit, None, 10);

         assert!(!processor.should_end_block_parallel());
         processor.accumulate_fee_statement(
@@ -494,7 +507,7 @@ mod test {
             use_granular_resource_group_conflicts: true,
         };

-        let mut processor = BlockGasLimitProcessor::::new(block_gas_limit, 10);
+        let mut processor = BlockGasLimitProcessor::::new(block_gas_limit, None, 10);

         processor.accumulate_fee_statement(
             execution_fee(10),
             Some(ReadWriteSummary::new(
diff --git a/config/src/config/consensus_config.rs b/config/src/config/consensus_config.rs
index 1747481696145..4fa892f8cb867 100644
--- a/config/src/config/consensus_config.rs
+++ b/config/src/config/consensus_config.rs
@@ -123,9 +123,9 @@ pub struct ExecutionBackpressureConfig {
     pub percentile: f64,
     /// Recalibrating max block size, to target blocks taking this long.
     pub target_block_time_ms: usize,
-    /// A minimal number of transactions per block, even if calibration suggests otherwise
+    /// A minimal block gas limit, even if calibration suggests otherwise
     /// To make sure backpressure doesn't become too aggressive.
-    pub min_calibrated_txns_per_block: u64,
+    pub min_calibrated_block_gas_limit: u64,
     // We compute re-calibrated block size, and use that for `max_txns_in_block`.
     // But after execution pool and cost of overpacking being minimal - we should
     // change so that backpressure sets `max_txns_to_execute` instead
@@ -194,13 +194,13 @@ impl Default for ConsensusConfig {
             vote_back_pressure_limit: 7,
             min_max_txns_in_block_after_filtering_from_backpressure: MIN_BLOCK_TXNS_AFTER_FILTERING,
             execution_backpressure: Some(ExecutionBackpressureConfig {
-                num_blocks_to_look_at: 12,
+                num_blocks_to_look_at: 20,
                 min_blocks_to_activate: 4,
                 percentile: 0.5,
-                target_block_time_ms: 250,
+                target_block_time_ms: 200,
                 min_block_time_ms_to_activate: 100,
-                // allow at least two spreading group from reordering in a single block, to utilize paralellism
-                min_calibrated_txns_per_block: 8,
+                // TODO: appropriate value?
+                min_calibrated_block_gas_limit: 1000,
             }),
             pipeline_backpressure: vec![
                 PipelineBackpressureValues {
@@ -209,21 +209,21 @@ impl Default for ConsensusConfig {
                     // Block enters the pipeline after consensus orders it, and leaves the
                     // pipeline once quorum on execution result among validators has been reached
                     // (so-(badly)-called "commit certificate"), meaning 2f+1 validators have finished execution.
-                    back_pressure_pipeline_latency_limit_ms: 1200,
+                    back_pressure_pipeline_latency_limit_ms: 1500,
                     max_sending_block_txns_after_filtering_override: MAX_SENDING_BLOCK_TXNS_AFTER_FILTERING,
                     max_sending_block_bytes_override: 5 * 1024 * 1024,
                     backpressure_proposal_delay_ms: 50,
                 },
                 PipelineBackpressureValues {
-                    back_pressure_pipeline_latency_limit_ms: 1500,
+                    back_pressure_pipeline_latency_limit_ms: 1900,
                     max_sending_block_txns_after_filtering_override: MAX_SENDING_BLOCK_TXNS_AFTER_FILTERING,
                     max_sending_block_bytes_override: 5 * 1024 * 1024,
                     backpressure_proposal_delay_ms: 100,
                 },
                 PipelineBackpressureValues {
-                    back_pressure_pipeline_latency_limit_ms: 1900,
+                    back_pressure_pipeline_latency_limit_ms: 5000,
                     max_sending_block_txns_after_filtering_override: MAX_SENDING_BLOCK_TXNS_AFTER_FILTERING,
                     max_sending_block_bytes_override: 5 * 1024 * 1024,
@@ -231,25 +231,25 @@ impl Default for ConsensusConfig {
                 },
                 // with execution backpressure, only later start reducing block size
                 PipelineBackpressureValues {
-                    back_pressure_pipeline_latency_limit_ms: 5000,
+                    back_pressure_pipeline_latency_limit_ms: 7000,
                     max_sending_block_txns_after_filtering_override: 1000,
                     max_sending_block_bytes_override: MIN_BLOCK_BYTES_OVERRIDE,
                     backpressure_proposal_delay_ms: 300,
                 },
                 PipelineBackpressureValues {
-                    back_pressure_pipeline_latency_limit_ms: 7000,
+                    back_pressure_pipeline_latency_limit_ms: 9000,
                     max_sending_block_txns_after_filtering_override: 200,
                     max_sending_block_bytes_override: MIN_BLOCK_BYTES_OVERRIDE,
                     backpressure_proposal_delay_ms: 300,
                 },
                 PipelineBackpressureValues {
-                    back_pressure_pipeline_latency_limit_ms: 9000,
+                    back_pressure_pipeline_latency_limit_ms: 12000,
                     max_sending_block_txns_after_filtering_override: 30,
                     max_sending_block_bytes_override: MIN_BLOCK_BYTES_OVERRIDE,
                     backpressure_proposal_delay_ms: 300,
                 },
                 PipelineBackpressureValues {
-                    back_pressure_pipeline_latency_limit_ms: 12000,
+                    back_pressure_pipeline_latency_limit_ms: 15000,
                     // in practice, latencies and delay make it such that ~2 blocks/s is max,
                     // meaning that most aggressively we limit to ~10 TPS
                     // For transactions that are more expensive than that, we should
diff --git a/config/src/config/consensus_observer_config.rs b/config/src/config/consensus_observer_config.rs
index bddc66843d614..65aaa4f54d06c 100644
--- a/config/src/config/consensus_observer_config.rs
+++ b/config/src/config/consensus_observer_config.rs
@@ -9,8 +9,8 @@ use serde::{Deserialize, Serialize};
 use serde_yaml::Value;

 // Useful constants for enabling consensus observer on different node types
-const ENABLE_ON_VALIDATORS: bool = true;
-const ENABLE_ON_VALIDATOR_FULLNODES: bool = true;
+const ENABLE_ON_VALIDATORS: bool = false;
+const ENABLE_ON_VALIDATOR_FULLNODES: bool = false;
 const ENABLE_ON_PUBLIC_FULLNODES: bool = false;

 #[derive(Clone, Copy, Debug, Deserialize, PartialEq, Serialize)]
diff --git a/config/src/config/quorum_store_config.rs b/config/src/config/quorum_store_config.rs
index 53d5e9698025d..e272126929dc0 100644
--- a/config/src/config/quorum_store_config.rs
+++ b/config/src/config/quorum_store_config.rs
@@ -31,7 +31,7 @@ impl Default for QuorumStoreBackPressureConfig {
         QuorumStoreBackPressureConfig {
             // QS will be backpressured if the remaining total txns is more than this number
             // Roughly, target TPS * commit latency seconds
-            backlog_txn_limit_count: 72_000,
+            backlog_txn_limit_count: u64::MAX,
             // QS will create batches at the max rate until this number is reached
             backlog_per_validator_batch_limit_count: 20,
             decrease_duration_ms: 1000,
diff --git a/consensus/consensus-types/src/block.rs b/consensus/consensus-types/src/block.rs
index ebb99828b28c7..84515d429ec93 100644
--- a/consensus/consensus-types/src/block.rs
+++ b/consensus/consensus-types/src/block.rs
@@ -113,7 +113,7 @@ impl Block {
             Payload::InQuorumStore(pos) => pos.proofs.len(),
             Payload::DirectMempool(_txns) => 0,
             Payload::InQuorumStoreWithLimit(pos) => pos.proof_with_data.proofs.len(),
-            Payload::QuorumStoreInlineHybrid(inline_batches, proof_with_data, _) => {
+            Payload::QuorumStoreInlineHybrid(inline_batches, proof_with_data, _, _) => {
                 inline_batches.len() + proof_with_data.proofs.len()
             },
             Payload::OptQuorumStore(opt_quorum_store_payload) => {
diff --git a/consensus/consensus-types/src/common.rs b/consensus/consensus-types/src/common.rs
index 3c86dfcf2bb72..37b0584216899 100644
--- a/consensus/consensus-types/src/common.rs
+++ b/consensus/consensus-types/src/common.rs
@@ -210,38 +210,49 @@ impl ProofWithData {
 }

 #[derive(Deserialize, Serialize, Clone, Debug)]
-pub struct ProofWithDataWithTxnLimit {
+pub struct ProofWithDataWithLimits {
     pub proof_with_data: ProofWithData,
     pub max_txns_to_execute: Option<u64>,
+    pub block_gas_limit: Option<u64>,
 }

-impl PartialEq for ProofWithDataWithTxnLimit {
+impl PartialEq for ProofWithDataWithLimits {
     fn eq(&self, other: &Self) -> bool {
         self.proof_with_data == other.proof_with_data
             && self.max_txns_to_execute == other.max_txns_to_execute
+            && self.block_gas_limit == other.block_gas_limit
     }
 }

-impl Eq for ProofWithDataWithTxnLimit {}
+impl Eq for ProofWithDataWithLimits {}

-impl ProofWithDataWithTxnLimit {
-    pub fn new(proof_with_data: ProofWithData, max_txns_to_execute: Option<u64>) -> Self {
+impl ProofWithDataWithLimits {
+    pub fn new(
+        proof_with_data: ProofWithData,
+        max_txns_to_execute: Option<u64>,
+        block_gas_limit: Option<u64>,
+    ) -> Self {
         Self {
             proof_with_data,
             max_txns_to_execute,
+            block_gas_limit,
         }
     }

-    pub fn extend(&mut self, other: ProofWithDataWithTxnLimit) {
+    pub fn extend(&mut self, other: ProofWithDataWithLimits) {
         self.proof_with_data.extend(other.proof_with_data);
         // InQuorumStoreWithLimit TODO: what is the right logic here ???
         if self.max_txns_to_execute.is_none() {
             self.max_txns_to_execute = other.max_txns_to_execute;
         }
+        if self.block_gas_limit.is_none() {
+            self.block_gas_limit = other.block_gas_limit;
+        }
     }
 }

-fn sum_max_txns_to_execute(m1: Option<u64>, m2: Option<u64>) -> Option<u64> {
+// TODO: does it make sense to extend?
+fn sum_options(m1: Option<u64>, m2: Option<u64>) -> Option<u64> {
     match (m1, m2) {
         (None, _) => m2,
         (_, None) => m1,
@@ -254,26 +265,36 @@ pub enum Payload {
 pub enum Payload {
     DirectMempool(Vec<SignedTransaction>),
     InQuorumStore(ProofWithData),
-    InQuorumStoreWithLimit(ProofWithDataWithTxnLimit),
+    InQuorumStoreWithLimit(ProofWithDataWithLimits),
     QuorumStoreInlineHybrid(
         Vec<(BatchInfo, Arc<Vec<SignedTransaction>>)>,
         ProofWithData,
         Option<u64>,
+        Option<u64>,
     ),
     OptQuorumStore(OptQuorumStorePayload),
 }

 impl Payload {
-    pub fn transform_to_quorum_store_v2(self, max_txns_to_execute: Option<u64>) -> Self {
+    pub fn transform_to_quorum_store_v2(
+        self,
+        max_txns_to_execute: Option<u64>,
+        block_gas_limit: Option<u64>,
+    ) -> Self {
         match self {
-            Payload::InQuorumStore(proof_with_status) => Payload::InQuorumStoreWithLimit(
-                ProofWithDataWithTxnLimit::new(proof_with_status, max_txns_to_execute),
-            ),
-            Payload::QuorumStoreInlineHybrid(inline_batches, proof_with_data, _) => {
+            Payload::InQuorumStore(proof_with_status) => {
+                Payload::InQuorumStoreWithLimit(ProofWithDataWithLimits::new(
+                    proof_with_status,
+                    max_txns_to_execute,
+                    block_gas_limit,
+                ))
+            },
+            Payload::QuorumStoreInlineHybrid(inline_batches, proof_with_data, _, _) => {
                 Payload::QuorumStoreInlineHybrid(
                     inline_batches,
                     proof_with_data,
                     max_txns_to_execute,
+                    block_gas_limit,
                 )
             },
             Payload::InQuorumStoreWithLimit(_) => {
@@ -283,8 +304,9 @@ impl Payload {
                 panic!("Payload is in direct mempool format");
             },
             Payload::OptQuorumStore(mut opt_qs_payload) => {
-                opt_qs_payload.set_execution_limit(PayloadExecutionLimit::max_txns_to_execute(
+                opt_qs_payload.set_execution_limit(PayloadExecutionLimit::new(
                     max_txns_to_execute,
+                    block_gas_limit,
                 ));
                 Payload::OptQuorumStore(opt_qs_payload)
             },
@@ -294,7 +316,12 @@ impl Payload {
     pub fn empty(quorum_store_enabled: bool, allow_batches_without_pos_in_proposal: bool) -> Self {
         if quorum_store_enabled {
             if allow_batches_without_pos_in_proposal {
-                Payload::QuorumStoreInlineHybrid(Vec::new(), ProofWithData::new(Vec::new()), None)
+                Payload::QuorumStoreInlineHybrid(
+                    Vec::new(),
+                    ProofWithData::new(Vec::new()),
+                    None,
+                    None,
+                )
             } else {
                 Payload::InQuorumStore(ProofWithData::new(Vec::new()))
             }
@@ -312,7 +339,7 @@ impl Payload {
                 // where we prepare the block from the payload
                 proof_with_status.proof_with_data.len()
             },
-            Payload::QuorumStoreInlineHybrid(inline_batches, proof_with_data, _) => {
+            Payload::QuorumStoreInlineHybrid(inline_batches, proof_with_data, _, _) => {
                 proof_with_data.len()
                     + inline_batches
                         .iter()
@@ -331,12 +358,13 @@ impl Payload {
                 // here we return the actual length of the payload; limit is considered at the stage
                 // where we prepare the block from the payload
                 (proof_with_status.proof_with_data.len() as u64)
-                    .min(proof_with_status.max_txns_to_execute.unwrap_or(u64::MAX))
+                    .min(proof_with_status.block_gas_limit.unwrap_or(u64::MAX))
             },
             Payload::QuorumStoreInlineHybrid(
                 inline_batches,
                 proof_with_data,
                 max_txns_to_execute,
+                _,
             ) => ((proof_with_data.len()
                 + inline_batches
                     .iter()
@@ -356,7 +384,7 @@ impl Payload {
             Payload::InQuorumStoreWithLimit(proof_with_status) => {
                 proof_with_status.proof_with_data.proofs.is_empty()
             },
-            Payload::QuorumStoreInlineHybrid(inline_batches, proof_with_data, _) => {
+            Payload::QuorumStoreInlineHybrid(inline_batches, proof_with_data, _, _) => {
                 proof_with_data.proofs.is_empty() && inline_batches.is_empty()
             },
             Payload::OptQuorumStore(opt_qs_payload) => opt_qs_payload.is_empty(),
@@ -381,49 +409,58 @@ impl Payload {
                 Payload::InQuorumStoreWithLimit(p3)
             },
             (
-                Payload::QuorumStoreInlineHybrid(b1, p1, m1),
-                Payload::QuorumStoreInlineHybrid(b2, p2, m2),
+                Payload::QuorumStoreInlineHybrid(b1, p1, m1, g1),
+                Payload::QuorumStoreInlineHybrid(b2, p2, m2, g2),
             ) => {
                 let mut b3 = b1;
                 b3.extend(b2);
                 let mut p3 = p1;
                 p3.extend(p2);
                 // TODO: What's the right logic here?
-                let m3 = sum_max_txns_to_execute(m1, m2);
-                Payload::QuorumStoreInlineHybrid(b3, p3, m3)
+                let m3 = sum_options(m1, m2);
+                let g3 = sum_options(g1, g2);
+                Payload::QuorumStoreInlineHybrid(b3, p3, m3, g3)
             },
-            (Payload::QuorumStoreInlineHybrid(b1, p1, m1), Payload::InQuorumStore(p2)) => {
+            (Payload::QuorumStoreInlineHybrid(b1, p1, m1, g1), Payload::InQuorumStore(p2)) => {
                 // TODO: How to update m1?
                 let mut p3 = p1;
                 p3.extend(p2);
-                Payload::QuorumStoreInlineHybrid(b1, p3, m1)
+                Payload::QuorumStoreInlineHybrid(b1, p3, m1, g1)
             },
-            (Payload::QuorumStoreInlineHybrid(b1, p1, m1), Payload::InQuorumStoreWithLimit(p2)) => {
+            (
+                Payload::QuorumStoreInlineHybrid(b1, p1, m1, g1),
+                Payload::InQuorumStoreWithLimit(p2),
+            ) => {
                 // TODO: What's the right logic here?
-                let m3 = sum_max_txns_to_execute(m1, p2.max_txns_to_execute);
+                let m3 = sum_options(m1, p2.max_txns_to_execute);
+                let g3 = sum_options(g1, p2.block_gas_limit);
                 let mut p3 = p1;
                 p3.extend(p2.proof_with_data);
-                Payload::QuorumStoreInlineHybrid(b1, p3, m3)
+                Payload::QuorumStoreInlineHybrid(b1, p3, m3, g3)
             },
-            (Payload::InQuorumStore(p1), Payload::QuorumStoreInlineHybrid(b2, p2, m2)) => {
+            (Payload::InQuorumStore(p1), Payload::QuorumStoreInlineHybrid(b2, p2, m2, g2)) => {
                 let mut p3 = p1;
                 p3.extend(p2);
-                Payload::QuorumStoreInlineHybrid(b2, p3, m2)
+                Payload::QuorumStoreInlineHybrid(b2, p3, m2, g2)
             },
-            (Payload::InQuorumStoreWithLimit(p1), Payload::QuorumStoreInlineHybrid(b2, p2, m2)) => {
+            (
+                Payload::InQuorumStoreWithLimit(p1),
+                Payload::QuorumStoreInlineHybrid(b2, p2, m2, g2),
+            ) => {
                 // TODO: What's the right logic here?
-                let m3 = sum_max_txns_to_execute(p1.max_txns_to_execute, m2);
+                let m3 = sum_options(p1.max_txns_to_execute, m2);
+                let g3 = sum_options(p1.block_gas_limit, g2);
                 let mut p3 = p1.proof_with_data;
                 p3.extend(p2);
-                Payload::QuorumStoreInlineHybrid(b2, p3, m3)
+                Payload::QuorumStoreInlineHybrid(b2, p3, m3, g3)
             },
             (
-                Payload::QuorumStoreInlineHybrid(_inline_batches, _proofs, _limit),
+                Payload::QuorumStoreInlineHybrid(_inline_batches, _proofs, _, _),
                 Payload::OptQuorumStore(_opt_qs),
             )
             | (
                 Payload::OptQuorumStore(_opt_qs),
-                Payload::QuorumStoreInlineHybrid(_inline_batches, _proofs, _limit),
+                Payload::QuorumStoreInlineHybrid(_inline_batches, _proofs, _, _),
             ) => {
                 unimplemented!(
                     "Cannot extend OptQuorumStore with QuorumStoreInlineHybrid or viceversa"
@@ -457,7 +494,7 @@ impl Payload {
             Payload::InQuorumStoreWithLimit(proof_with_status) => {
                 proof_with_status.proof_with_data.num_bytes()
             },
-            Payload::QuorumStoreInlineHybrid(inline_batches, proof_with_data, _) => {
+            Payload::QuorumStoreInlineHybrid(inline_batches, proof_with_data, _, _) => {
                 proof_with_data.num_bytes()
                     + inline_batches
                         .iter()
@@ -504,7 +541,7 @@ impl Payload {
                 validator,
                 proof_cache,
             ),
-            (true, Payload::QuorumStoreInlineHybrid(inline_batches, proof_with_data, _)) => {
+            (true, Payload::QuorumStoreInlineHybrid(inline_batches, proof_with_data, _, _)) => {
                 Self::verify_with_cache(&proof_with_data.proofs, validator, proof_cache)?;
                 for (batch, payload) in inline_batches.iter() {
                     // TODO: Can cloning be avoided here?
@@ -548,7 +585,7 @@ impl fmt::Display for Payload { proof_with_status.proof_with_data.proofs.len() ) }, - Payload::QuorumStoreInlineHybrid(inline_batches, proof_with_data, _) => { + Payload::QuorumStoreInlineHybrid(inline_batches, proof_with_data, _, _) => { write!( f, "Inline txns: {}, InMemory proofs: {}", @@ -622,20 +659,22 @@ impl BatchPayload { #[derive(Deserialize, Serialize, Clone, Debug, PartialEq, Eq)] pub enum PayloadFilter { DirectMempool(Vec), - InQuorumStore(HashSet), + InQuorumStore(HashSet<(Round, BatchInfo)>), Empty, } -impl From<&Vec<&Payload>> for PayloadFilter { - fn from(exclude_payloads: &Vec<&Payload>) -> Self { +impl From<&Vec<(Round, &Payload)>> for PayloadFilter { + fn from(exclude_payloads: &Vec<(Round, &Payload)>) -> Self { if exclude_payloads.is_empty() { return PayloadFilter::Empty; } - let direct_mode = exclude_payloads.iter().any(|payload| payload.is_direct()); + let direct_mode = exclude_payloads + .iter() + .any(|(_, payload)| payload.is_direct()); if direct_mode { let mut exclude_txns = Vec::new(); - for payload in exclude_payloads { + for (_, payload) in exclude_payloads { if let Payload::DirectMempool(txns) = payload { for txn in txns { exclude_txns.push(TransactionSummary { @@ -649,24 +688,24 @@ impl From<&Vec<&Payload>> for PayloadFilter { PayloadFilter::DirectMempool(exclude_txns) } else { let mut exclude_batches = HashSet::new(); - for payload in exclude_payloads { + for (round, payload) in exclude_payloads { match payload { Payload::InQuorumStore(proof_with_status) => { for proof in &proof_with_status.proofs { - exclude_batches.insert(proof.info().clone()); + exclude_batches.insert((*round, proof.info().clone())); } }, Payload::InQuorumStoreWithLimit(proof_with_status) => { for proof in &proof_with_status.proof_with_data.proofs { - exclude_batches.insert(proof.info().clone()); + exclude_batches.insert((*round, proof.info().clone())); } }, - Payload::QuorumStoreInlineHybrid(inline_batches, proof_with_data, _) => { + Payload::QuorumStoreInlineHybrid(inline_batches, proof_with_data, _, _) => { for proof in &proof_with_data.proofs { - exclude_batches.insert(proof.info().clone()); + exclude_batches.insert((*round, proof.info().clone())); } for (batch_info, _) in inline_batches { - exclude_batches.insert(batch_info.clone()); + exclude_batches.insert((*round, batch_info.clone())); } }, Payload::DirectMempool(_) => { @@ -674,13 +713,13 @@ impl From<&Vec<&Payload>> for PayloadFilter { }, Payload::OptQuorumStore(opt_qs_payload) => { for batch in opt_qs_payload.inline_batches().iter() { - exclude_batches.insert(batch.info().clone()); + exclude_batches.insert((*round, batch.info().clone())); } for batch_info in &opt_qs_payload.opt_batches().batch_summary { - exclude_batches.insert(batch_info.clone()); + exclude_batches.insert((*round, batch_info.clone())); } for proof in &opt_qs_payload.proof_with_data().batch_summary { - exclude_batches.insert(proof.info().clone()); + exclude_batches.insert((*round, proof.info().clone())); } }, } @@ -702,8 +741,8 @@ impl fmt::Display for PayloadFilter { }, PayloadFilter::InQuorumStore(excluded_proofs) => { let mut proofs_str = "".to_string(); - for proof in excluded_proofs.iter() { - write!(proofs_str, "{} ", proof.digest())?; + for (round, proof) in excluded_proofs.iter() { + write!(proofs_str, "({}, {}) ", *round, proof.digest())?; } write!(f, "{}", proofs_str) }, diff --git a/consensus/consensus-types/src/payload.rs b/consensus/consensus-types/src/payload.rs index e3ff6f4a771ca..fe2b6542c3eb4 100644 --- 
a/consensus/consensus-types/src/payload.rs +++ b/consensus/consensus-types/src/payload.rs @@ -143,26 +143,40 @@ impl IntoIterator for BatchPointer { #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] pub enum PayloadExecutionLimit { None, - MaxTransactionsToExecute(u64), + Limits(Option, Option), } impl PayloadExecutionLimit { + pub(crate) fn new(max_txns_to_execute: Option, block_gas_limit: Option) -> Self { + if max_txns_to_execute.is_none() && block_gas_limit.is_none() { + Self::None + } else { + Self::Limits(max_txns_to_execute, block_gas_limit) + } + } + + fn extend_options(o1: Option, o2: Option) -> Option { + match (o1, o2) { + (Some(v1), Some(v2)) => Some(v1 + v2), + (Some(v), None) => Some(v), + (None, Some(v)) => Some(v), + _ => None, + } + } + pub(crate) fn extend(&mut self, other: PayloadExecutionLimit) { *self = match (&self, &other) { (PayloadExecutionLimit::None, _) => other, (_, PayloadExecutionLimit::None) => return, ( - PayloadExecutionLimit::MaxTransactionsToExecute(limit1), - PayloadExecutionLimit::MaxTransactionsToExecute(limit2), - ) => PayloadExecutionLimit::MaxTransactionsToExecute(*limit1 + *limit2), + PayloadExecutionLimit::Limits(max_txns_to_execute1, block_gas_limit1), + PayloadExecutionLimit::Limits(max_txns_to_execute2, block_gas_limit2), + ) => PayloadExecutionLimit::Limits( + Self::extend_options(*max_txns_to_execute1, *max_txns_to_execute2), + Self::extend_options(*block_gas_limit1, *block_gas_limit2), + ), }; } - - pub(crate) fn max_txns_to_execute(limit: Option) -> Self { - limit.map_or(PayloadExecutionLimit::None, |val| { - PayloadExecutionLimit::MaxTransactionsToExecute(val) - }) - } } #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] @@ -284,7 +298,14 @@ impl OptQuorumStorePayloadV1 { pub fn max_txns_to_execute(&self) -> Option { match self.execution_limits { PayloadExecutionLimit::None => None, - PayloadExecutionLimit::MaxTransactionsToExecute(max) => Some(max), + PayloadExecutionLimit::Limits(max_txns_to_execute, _) => max_txns_to_execute, + } + } + + pub fn block_gas_limit(&self) -> Option { + match self.execution_limits { + PayloadExecutionLimit::None => None, + PayloadExecutionLimit::Limits(_, block_gas_limit) => block_gas_limit, } } } diff --git a/consensus/consensus-types/src/payload_pull_params.rs b/consensus/consensus-types/src/payload_pull_params.rs index 682f9b2185194..71ac487e0c012 100644 --- a/consensus/consensus-types/src/payload_pull_params.rs +++ b/consensus/consensus-types/src/payload_pull_params.rs @@ -2,7 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 use crate::{ - common::{Author, PayloadFilter}, + common::{Author, PayloadFilter, Round}, utils::PayloadTxnsSize, }; use std::{collections::HashSet, time::Duration}; @@ -24,6 +24,7 @@ pub struct PayloadPullParameters { pub pending_uncommitted_blocks: usize, pub recent_max_fill_fraction: f32, pub block_timestamp: Duration, + pub block_round: Round, pub maybe_optqs_payload_pull_params: Option, } @@ -50,6 +51,7 @@ impl PayloadPullParameters { pending_uncommitted_blocks: usize, recent_max_fill_fraction: f32, block_timestamp: Duration, + block_round: Round, ) -> Self { Self { max_poll_time, @@ -62,6 +64,7 @@ impl PayloadPullParameters { pending_uncommitted_blocks, recent_max_fill_fraction, block_timestamp, + block_round, maybe_optqs_payload_pull_params: None, } } diff --git a/consensus/consensus-types/src/pipelined_block.rs b/consensus/consensus-types/src/pipelined_block.rs index 9d50e3b2e03ae..1f2fd36f874d4 100644 --- 
diff --git a/consensus/consensus-types/src/pipelined_block.rs b/consensus/consensus-types/src/pipelined_block.rs
index 9d50e3b2e03ae..1f2fd36f874d4 100644
--- a/consensus/consensus-types/src/pipelined_block.rs
+++ b/consensus/consensus-types/src/pipelined_block.rs
@@ -33,27 +33,45 @@ use std::{
     time::{Duration, Instant},
 };
 
-#[derive(Clone, Eq, PartialEq, Serialize, Deserialize)]
+#[derive(Clone)]
 pub struct OrderedBlockWindow {
-    blocks: Vec<Arc<PipelinedBlock>>,
+    blocks: Arc<Mutex<Option<Vec<Arc<PipelinedBlock>>>>>,
 }
 
 impl OrderedBlockWindow {
     pub fn new(blocks: Vec<Arc<PipelinedBlock>>) -> Self {
-        Self { blocks }
+        Self {
+            blocks: Arc::new(Mutex::new(Some(blocks))),
+        }
     }
 
     pub fn empty() -> Self {
-        Self { blocks: vec![] }
+        Self {
+            blocks: Arc::new(Mutex::new(Some(vec![]))),
+        }
+    }
+
+    pub fn clear(&self) {
+        *self.blocks.lock() = None;
+    }
 
     // TODO: clone required?
     pub fn blocks(&self) -> Vec<Block> {
-        self.blocks.iter().map(|b| b.block().clone()).collect()
+        self.blocks
+            .lock()
+            .as_ref()
+            .expect("window already cleared")
+            .iter()
+            .map(|b| b.block().clone())
+            .collect()
     }
 
-    pub fn pipelined_blocks(&self) -> &Vec<Arc<PipelinedBlock>> {
-        &self.blocks
+    pub fn pipelined_blocks(&self) -> Vec<Arc<PipelinedBlock>> {
+        self.blocks
+            .lock()
+            .as_ref()
+            .expect("window already cleared")
+            .clone()
     }
 }
 
@@ -66,9 +84,8 @@ pub struct PipelinedBlock {
     /// Block data that cannot be regenerated.
     block: Block,
     /// A window of blocks that are needed for execution with the execution pool, excluding the current block
+    #[derivative(PartialEq = "ignore")]
     block_window: OrderedBlockWindow,
-    /// Input transactions in the order of execution
-    input_transactions: Vec<SignedTransaction>,
     /// The state_compute_result is calculated for all the pending blocks prior to insertion to
     /// the tree. The execution results are not persisted: they're recalculated again for the
     /// pending blocks upon restart.
@@ -92,15 +109,14 @@ impl Serialize for PipelinedBlock {
         #[serde(rename = "PipelineBlock")]
         struct SerializedBlock<'a> {
             block: &'a Block,
-            block_window: &'a OrderedBlockWindow,
-            input_transactions: &'a Vec<SignedTransaction>,
+            // Removed, keeping for backwards compatibility
+            _input_transactions: &'a Vec<SignedTransaction>,
             randomness: Option<&'a Randomness>,
         }
 
         let serialized = SerializedBlock {
             block: &self.block,
-            block_window: &self.block_window,
-            input_transactions: &self.input_transactions,
+            _input_transactions: &vec![],
            randomness: self.randomness.get(),
         };
         serialized.serialize(serializer)
@@ -116,15 +132,14 @@ impl<'de> Deserialize<'de> for PipelinedBlock {
         #[serde(rename = "PipelineBlock")]
         struct SerializedBlock {
             block: Block,
-            block_window: OrderedBlockWindow,
-            input_transactions: Vec<SignedTransaction>,
+            // Removed, keeping for backwards compatibility
+            _input_transactions: Vec<SignedTransaction>,
             randomness: Option<Randomness>,
         }
 
         let SerializedBlock {
             block,
-            block_window,
-            input_transactions,
+            _input_transactions: _,
             randomness,
         } = SerializedBlock::deserialize(deserializer)?;
 
         let block = PipelinedBlock {
             block,
-            block_window,
-            input_transactions,
+            block_window: OrderedBlockWindow::empty(),
             state_compute_result: StateComputeResult::new_dummy(),
             randomness: OnceCell::new(),
             pipeline_insertion_time: OnceCell::new(),
@@ -158,14 +172,13 @@ impl PipelinedBlock {
         pipeline_execution_result: PipelineExecutionResult,
     ) -> Self {
         let PipelineExecutionResult {
-            input_txns,
+            input_txns: _,
             result,
             execution_time,
             pre_commit_fut,
         } = pipeline_execution_result;
 
         self.state_compute_result = result;
-        self.input_transactions = input_txns;
         self.pre_commit_fut = Arc::new(Mutex::new(Some(pre_commit_fut)));
 
         let mut to_commit = 0;
@@ -179,6 +192,11 @@
         }
 
         let execution_summary = ExecutionSummary {
+            gas_used: self
+                .state_compute_result
+                .execution_output
+                .block_end_info()
+                .map(|info| info.block_effective_gas_units()),
             payload_len: self
                 .block
                 .payload()
@@ -247,11 +265,7 @@ impl Display for PipelinedBlock {
 }
 
 impl PipelinedBlock {
-    pub fn new(
-        block: Block,
-        input_transactions: Vec<SignedTransaction>,
-        state_compute_result: StateComputeResult,
-    ) -> Self {
+    pub fn new(block: Block, state_compute_result: StateComputeResult) -> Self {
         info!(
             "New PipelinedBlock with block_id: {}, parent_id: {}, round: {}, epoch: {}, txns: {}",
             block.id(),
@@ -264,7 +278,6 @@ impl PipelinedBlock {
         Self {
             block,
             block_window: OrderedBlockWindow::new(vec![]),
-            input_transactions,
             state_compute_result,
             randomness: OnceCell::new(),
             pipeline_insertion_time: OnceCell::new(),
@@ -286,7 +299,6 @@ impl PipelinedBlock {
         Self {
             block,
             block_window: window,
-            input_transactions: vec![],
             state_compute_result: StateComputeResult::new_dummy(),
             randomness: OnceCell::new(),
             pipeline_insertion_time: OnceCell::new(),
@@ -309,7 +321,6 @@ impl PipelinedBlock {
         Self {
             block,
             block_window: OrderedBlockWindow::empty(),
-            input_transactions: vec![],
             state_compute_result: StateComputeResult::new_dummy(),
             randomness: OnceCell::new(),
             pipeline_insertion_time: OnceCell::new(),
@@ -334,10 +345,6 @@ impl PipelinedBlock {
         self.block().id()
     }
 
-    pub fn input_transactions(&self) -> &Vec<SignedTransaction> {
-        &self.input_transactions
-    }
-
     pub fn epoch(&self) -> u64 {
         self.block.epoch()
     }
@@ -546,6 +553,8 @@ impl PipelinedBlock {
 #[derive(Debug, Clone, Eq, PartialEq)]
 pub struct ExecutionSummary {
+    // TODO: would it ever be None?
+    pub gas_used: Option<u64>,
     pub payload_len: u64,
     pub to_commit: u64,
     pub to_retry: u64,
diff --git a/consensus/consensus-types/src/request_response.rs b/consensus/consensus-types/src/request_response.rs
index f10e35285e532..f422ed6464fb1 100644
--- a/consensus/consensus-types/src/request_response.rs
+++ b/consensus/consensus-types/src/request_response.rs
@@ -2,7 +2,7 @@
 // SPDX-License-Identifier: Apache-2.0
 
 use crate::{
-    common::{Payload, PayloadFilter},
+    common::{Payload, PayloadFilter, Round},
     payload_pull_params::OptQSPayloadPullParams,
     utils::PayloadTxnsSize,
 };
@@ -29,6 +29,8 @@ pub struct GetPayloadRequest {
     pub callback: oneshot::Sender<anyhow::Result<GetPayloadResponse>>,
     // block timestamp
     pub block_timestamp: Duration,
+    // block round
+    pub block_round: Round,
 }
 
 pub enum GetPayloadCommand {
diff --git a/consensus/src/block_preparer.rs b/consensus/src/block_preparer.rs
index a22ce1c969e15..debab565d8df2 100644
--- a/consensus/src/block_preparer.rs
+++ b/consensus/src/block_preparer.rs
@@ -15,13 +15,17 @@ use aptos_logger::info;
 use aptos_types::transaction::SignedTransaction;
 use fail::fail_point;
 use futures::{stream::FuturesOrdered, StreamExt};
+use rayon::iter::{IntoParallelIterator, ParallelIterator};
 use std::{cmp::Reverse, collections::HashSet, sync::Arc, time::Instant};
 
+const SECS_TO_MICROSECS: u64 = 1_000_000;
+
 pub struct BlockPreparer {
     payload_manager: Arc<dyn TPayloadManager>,
     txn_filter: Arc<TransactionFilter>,
     txn_deduper: Arc<dyn TransactionDeduper>,
     txn_shuffler: Arc<dyn TransactionShuffler>,
+    max_block_txns: u64,
 }
 
 impl BlockPreparer {
@@ -30,12 +34,14 @@ impl BlockPreparer {
         txn_filter: Arc<TransactionFilter>,
         txn_deduper: Arc<dyn TransactionDeduper>,
         txn_shuffler: Arc<dyn TransactionShuffler>,
+        max_block_txns: u64,
     ) -> Self {
         Self {
             payload_manager,
             txn_filter,
             txn_deduper,
             txn_shuffler,
+            max_block_txns,
         }
     }
 
@@ -43,7 +49,12 @@
         &self,
         block: &Block,
         block_window: &OrderedBlockWindow,
-    ) -> ExecutorResult<(Vec<(Arc<Vec<SignedTransaction>>, u64)>, Option<u64>)> {
+    ) -> ExecutorResult<(
+        Vec<(Arc<Vec<SignedTransaction>>, u64)>,
+        Option<u64>,
+        Option<u64>,
+    )> {
+        let window_blocks = block_window.pipelined_blocks();
         let mut txns = vec![];
         let mut futures = FuturesOrdered::new();
         info!(
@@ -51,8 +62,7 @@
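The switch from a plain `Vec` to `Arc<Mutex<Option<...>>>` in `OrderedBlockWindow` above is what makes `clear()` possible on a shared handle: when the block tree prunes past a block (see the block_tree.rs hunk further down), it can drop the retained window blocks even though clones of the window are still held elsewhere, and any later read panics loudly rather than returning stale data. A simplified model of that behavior, where std's `Mutex` (with `unwrap`) stands in for the crate's infallible mutex and plain round numbers stand in for `Arc<PipelinedBlock>`:

use std::sync::{Arc, Mutex};

// Simplified OrderedBlockWindow: a shared, clearable block list.
#[derive(Clone)]
struct Window {
    blocks: Arc<Mutex<Option<Vec<u64>>>>,
}

impl Window {
    fn new(blocks: Vec<u64>) -> Self {
        Self {
            blocks: Arc::new(Mutex::new(Some(blocks))),
        }
    }

    // Called when the block tree prunes past this block: frees the retained
    // blocks even though other clones of the handle are still alive.
    fn clear(&self) {
        *self.blocks.lock().unwrap() = None;
    }

    fn blocks(&self) -> Vec<u64> {
        self.blocks
            .lock()
            .unwrap()
            .as_ref()
            .expect("window already cleared")
            .clone()
    }
}

fn main() {
    let window = Window::new(vec![7, 8, 9]);
    let handle = window.clone(); // e.g. held by the execution pipeline
    assert_eq!(handle.blocks(), vec![7, 8, 9]);
    window.clear(); // pruning drops the window's blocks
    // Any further handle.blocks() call now panics: "window already cleared".
}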
impl BlockPreparer { block.epoch(), block.round() ); - for block in block_window - .pipelined_blocks() + for block in window_blocks .iter() .map(|b| b.block()) .chain(std::iter::once(block)) @@ -63,13 +73,15 @@ impl BlockPreparer { let mut idx = 0; let mut max_txns_from_block_to_execute = None; + let mut block_gas_limit_override = None; loop { info!("get_transactions waiting for next: {}", idx); match futures.next().await { - // TODO: we are turning off the max txns from block to execute feature for now - Some(Ok((block_txns, _max_txns))) => { + Some(Ok((block_txns, max_txns, gas_limit))) => { txns.extend(block_txns); - max_txns_from_block_to_execute = None; + // We only care about max_txns from the current block, which is the last future + max_txns_from_block_to_execute = max_txns; + block_gas_limit_override = gas_limit; }, Some(Err(e)) => { return Err(e); @@ -83,14 +95,18 @@ impl BlockPreparer { block.epoch(), block.round() ); - Ok((txns, max_txns_from_block_to_execute)) + Ok(( + txns, + max_txns_from_block_to_execute, + block_gas_limit_override, + )) } pub async fn prepare_block( &self, block: &Block, block_window: &OrderedBlockWindow, - ) -> ExecutorResult> { + ) -> ExecutorResult<(Vec, Option, Option)> { fail_point!("consensus::prepare_block", |_| { use aptos_executor_types::ExecutorError; use std::{thread, time::Duration}; @@ -115,16 +131,18 @@ impl BlockPreparer { let mut committed_transactions = HashSet::new(); // TODO: don't materialize these? - let (mut batched_txns, max_txns_from_block_to_execute) = monitor!("get_transactions", { - self.get_transactions(block, block_window).await? - }); + let (mut batched_txns, max_txns_from_block_to_execute, block_gas_limit_override) = + monitor!("get_transactions", { + self.get_transactions(block, block_window).await? + }); // TODO: lots of repeated code here monitor!("wait_for_committed_transactions", { + let num_blocks_in_window = block_window.pipelined_blocks().len(); for b in block_window .pipelined_blocks() .iter() - .filter(|window_block| window_block.round() < block.round() - 1) + .take(num_blocks_in_window.saturating_sub(1)) { info!( "BlockPreparer: Waiting for committed transactions at block {} for block {}", @@ -151,18 +169,25 @@ impl BlockPreparer { let txn_deduper = self.txn_deduper.clone(); let block_id = block.id(); let block_timestamp_usecs = block.timestamp_usecs(); + let block_timestamp_secs = block.timestamp_usecs() / SECS_TO_MICROSECS; + // Always use max_block_txns * 2 regardless of max_txns_from_block_to_execute for better shuffling + let max_prepared_block_txns = self.max_block_txns * 2; // Transaction filtering, deduplication and shuffling are CPU intensive tasks, so we run them in a blocking task. let result = tokio::task::spawn_blocking(move || { // stable sort to ensure batches with same gas are in the same order batched_txns.sort_by_key(|(_, gas)| Reverse(*gas)); let batched_txns: Vec> = monitor!( - "filter_committed_transactions", + "filter_committed_and_expired_transactions", batched_txns - .into_iter() + .into_par_iter() .map(|(txns, _)| { txns.iter() - .filter(|txn| !committed_transactions.contains(&txn.committed_hash())) + .filter(|txn| { + !committed_transactions.contains(&txn.committed_hash()) + && block_timestamp_secs < txn.expiration_timestamp_secs() + }) + // TODO: avoid clone by using references? 
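The hunk above moves committed/expired filtering onto rayon, one parallel task per batch, and adds the expiration check against the block timestamp (microseconds converted to seconds via `SECS_TO_MICROSECS`). A minimal sketch of the same shape, with plain `u64` hashes and expiry seconds standing in for `SignedTransaction`:

use rayon::iter::{IntoParallelIterator, ParallelIterator};
use std::collections::HashSet;

const SECS_TO_MICROSECS: u64 = 1_000_000;

// Each batch is filtered on its own rayon task; a txn survives only if it is
// not already committed and has not expired at the block timestamp.
fn filter_batches(
    batched_txns: Vec<(Vec<(u64, u64)>, u64)>, // ((hash, expiry_secs) txns, gas)
    committed: &HashSet<u64>,
    block_timestamp_usecs: u64,
) -> Vec<Vec<(u64, u64)>> {
    let block_timestamp_secs = block_timestamp_usecs / SECS_TO_MICROSECS;
    batched_txns
        .into_par_iter()
        .map(|(txns, _gas)| {
            txns.into_iter()
                .filter(|(hash, expiry_secs)| {
                    !committed.contains(hash) && block_timestamp_secs < *expiry_secs
                })
                .collect()
        })
        .collect()
}

fn main() {
    let committed: HashSet<u64> = [1].into_iter().collect();
    let batches = vec![(vec![(1, 100), (2, 5), (3, 100)], 0)];
    // At t = 10s: txn 1 is committed, txn 2 expired at 5s, txn 3 survives.
    assert_eq!(
        filter_batches(batches, &committed, 10 * SECS_TO_MICROSECS),
        vec![vec![(3, 100)]]
    );
}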
.cloned() .collect() }) @@ -170,17 +195,30 @@ impl BlockPreparer { ); let txns: Vec<_> = monitor!( "flatten_transactions", - batched_txns.into_iter().flatten().collect() + batched_txns + .into_iter() + .flatten() + .take(max_prepared_block_txns as usize) + .collect() ); let filtered_txns = monitor!("filter_transactions", { txn_filter.filter(block_id, block_timestamp_usecs, txns) }); - let mut deduped_txns = monitor!("dedup_transactions", txn_deduper.dedup(filtered_txns)); + let deduped_txns = monitor!("dedup_transactions", txn_deduper.dedup(filtered_txns)); + // TODO: cannot truncate here, need to pass it to execution + let mut num_txns_to_execute = deduped_txns.len() as u64; if let Some(max_txns_from_block_to_execute) = max_txns_from_block_to_execute { - deduped_txns.truncate(max_txns_from_block_to_execute as usize); + num_txns_to_execute = num_txns_to_execute.min(max_txns_from_block_to_execute); + } + MAX_TXNS_FROM_BLOCK_TO_EXECUTE.observe(num_txns_to_execute as f64); + if let Some(block_gas_limit_override) = block_gas_limit_override { + counters::BLOCK_GAS_LIMIT_OVERRIDE.observe(block_gas_limit_override as f64); } - MAX_TXNS_FROM_BLOCK_TO_EXECUTE.observe(deduped_txns.len() as f64); - Ok(deduped_txns) + Ok(( + deduped_txns, + max_txns_from_block_to_execute, + block_gas_limit_override, + )) }) .await .expect("Failed to spawn blocking task for transaction generation"); diff --git a/consensus/src/block_storage/block_store.rs b/consensus/src/block_storage/block_store.rs index 7255c7b0ff94d..40b41dfb3bc80 100644 --- a/consensus/src/block_storage/block_store.rs +++ b/consensus/src/block_storage/block_store.rs @@ -196,7 +196,6 @@ impl BlockStore { let pipelined_root_block = PipelinedBlock::new( *window_block, - vec![], // Create a dummy state_compute_result with necessary fields filled in. result, ); diff --git a/consensus/src/block_storage/block_tree.rs b/consensus/src/block_storage/block_tree.rs index 48ee89150b3b9..88ef2a083949e 100644 --- a/consensus/src/block_storage/block_tree.rs +++ b/consensus/src/block_storage/block_tree.rs @@ -442,6 +442,7 @@ impl BlockTree { info!("blocks_to_be_pruned: {:?}", blocks_to_be_pruned); while let Some(block_to_remove) = blocks_to_be_pruned.pop() { info!("block_to_remove: {:?}", block_to_remove); + block_to_remove.executed_block.block_window().clear(); // Add the children to the blocks to be pruned (if any), but stop when it reaches the // new root for child_id in block_to_remove.children() { diff --git a/consensus/src/consensus_observer/network/observer_message.rs b/consensus/src/consensus_observer/network/observer_message.rs index b30dd8b9173c0..eb3acac64883d 100644 --- a/consensus/src/consensus_observer/network/observer_message.rs +++ b/consensus/src/consensus_observer/network/observer_message.rs @@ -158,7 +158,7 @@ impl Display for ConsensusObserverDirectSend { "BlockPayload: {}. 
Number of transactions: {}, limit: {:?}, proofs: {:?}", block_payload.block, block_payload.transaction_payload.transactions().len(), - block_payload.transaction_payload.limit(), + block_payload.transaction_payload.transaction_limit(), block_payload.transaction_payload.payload_proofs(), ) }, @@ -343,13 +343,19 @@ impl PayloadWithProof { pub struct PayloadWithProofAndLimit { payload_with_proof: PayloadWithProof, transaction_limit: Option, + block_gas_limit: Option, } impl PayloadWithProofAndLimit { - pub fn new(payload_with_proof: PayloadWithProof, limit: Option) -> Self { + pub fn new( + payload_with_proof: PayloadWithProof, + transaction_limit: Option, + block_gas_limit: Option, + ) -> Self { Self { payload_with_proof, - transaction_limit: limit, + transaction_limit, + block_gas_limit, } } @@ -359,6 +365,7 @@ impl PayloadWithProofAndLimit { Self { payload_with_proof: PayloadWithProof::empty(), transaction_limit: None, + block_gas_limit: None, } } } @@ -386,10 +393,12 @@ impl BlockTransactionPayload { pub fn new_in_quorum_store_with_limit( transactions: Vec<(Arc>, u64)>, proofs: Vec, - limit: Option, + max_txns_to_execute: Option, + block_gas_limit: Option, ) -> Self { let payload_with_proof = PayloadWithProof::new(transactions, proofs); - let proof_with_limit = PayloadWithProofAndLimit::new(payload_with_proof, limit); + let proof_with_limit = + PayloadWithProofAndLimit::new(payload_with_proof, max_txns_to_execute, block_gas_limit); Self::InQuorumStoreWithLimit(proof_with_limit) } @@ -397,22 +406,26 @@ impl BlockTransactionPayload { pub fn new_quorum_store_inline_hybrid( transactions: Vec<(Arc>, u64)>, proofs: Vec, - limit: Option, + max_txns_to_execute: Option, + block_gas_limit: Option, inline_batches: Vec, ) -> Self { let payload_with_proof = PayloadWithProof::new(transactions, proofs); - let proof_with_limit = PayloadWithProofAndLimit::new(payload_with_proof, limit); + let proof_with_limit = + PayloadWithProofAndLimit::new(payload_with_proof, max_txns_to_execute, block_gas_limit); Self::QuorumStoreInlineHybrid(proof_with_limit, inline_batches) } pub fn new_opt_quorum_store( transactions: Vec<(Arc>, u64)>, proofs: Vec, - limit: Option, + max_txns_to_execute: Option, + block_gas_limit: Option, batch_infos: Vec, ) -> Self { let payload_with_proof = PayloadWithProof::new(transactions, proofs); - let proof_with_limit = PayloadWithProofAndLimit::new(payload_with_proof, limit); + let proof_with_limit = + PayloadWithProofAndLimit::new(payload_with_proof, max_txns_to_execute, block_gas_limit); Self::OptQuorumStore(proof_with_limit, batch_infos) } @@ -433,7 +446,7 @@ impl BlockTransactionPayload { } /// Returns the limit of the transaction payload - pub fn limit(&self) -> Option { + pub fn transaction_limit(&self) -> Option { match self { BlockTransactionPayload::InQuorumStore(_) => None, BlockTransactionPayload::InQuorumStoreWithLimit(payload) => payload.transaction_limit, @@ -444,6 +457,15 @@ impl BlockTransactionPayload { } } + pub fn block_gas_limit(&self) -> Option { + match self { + BlockTransactionPayload::InQuorumStore(_) => None, + BlockTransactionPayload::InQuorumStoreWithLimit(payload) => payload.block_gas_limit, + BlockTransactionPayload::QuorumStoreInlineHybrid(payload, _) => payload.block_gas_limit, + BlockTransactionPayload::OptQuorumStore(payload, _) => payload.block_gas_limit, + } + } + /// Returns the proofs of the transaction payload pub fn payload_proofs(&self) -> Vec { match self { @@ -496,12 +518,13 @@ impl BlockTransactionPayload { 
self.verify_batches(&proof_with_data.proof_with_data.proofs)?; // Verify the transaction limit - self.verify_transaction_limit(proof_with_data.max_txns_to_execute)?; + self.verify_transaction_limit(proof_with_data.block_gas_limit)?; }, Payload::QuorumStoreInlineHybrid( inline_batches, proof_with_data, max_txns_to_execute, + _, ) => { // Verify the batches in the requested block self.verify_batches(&proof_with_data.proofs)?; @@ -845,7 +868,7 @@ mod test { use aptos_consensus_types::{ block::Block, block_data::{BlockData, BlockType}, - common::{Author, ProofWithData, ProofWithDataWithTxnLimit}, + common::{Author, ProofWithData, ProofWithDataWithLimits}, pipelined_block::OrderedBlockWindow, proof_of_store::BatchId, quorum_cert::QuorumCert, @@ -918,16 +941,18 @@ mod test { vec![], proofs.clone(), transaction_limit, + None, ); // Create a quorum store payload with a single proof let batch_info = create_batch_info(); - let proof_with_data = ProofWithDataWithTxnLimit::new( + let proof_with_data = ProofWithDataWithLimits::new( ProofWithData::new(vec![ProofOfStore::new( batch_info, AggregateSignature::empty(), )]), transaction_limit, + None, ); let ordered_payload = Payload::InQuorumStoreWithLimit(proof_with_data); @@ -939,7 +964,7 @@ mod test { // Create a quorum store payload with no proofs and no transaction limit let proof_with_data = - ProofWithDataWithTxnLimit::new(ProofWithData::new(proofs.clone()), None); + ProofWithDataWithLimits::new(ProofWithData::new(proofs.clone()), None, None); let ordered_payload = Payload::InQuorumStoreWithLimit(proof_with_data); // Verify the transaction payload and ensure it fails (the transaction limit doesn't match) @@ -950,7 +975,7 @@ mod test { // Create a quorum store payload with no proofs and the correct limit let proof_with_data = - ProofWithDataWithTxnLimit::new(ProofWithData::new(proofs), transaction_limit); + ProofWithDataWithLimits::new(ProofWithData::new(proofs), transaction_limit, None); let ordered_payload = Payload::InQuorumStoreWithLimit(proof_with_data); // Verify the transaction payload and ensure it passes @@ -969,6 +994,7 @@ mod test { vec![], proofs.clone(), transaction_limit, + None, inline_batches.clone(), ); @@ -983,6 +1009,7 @@ mod test { inline_batches.clone(), proof_with_data, transaction_limit, + None, ); // Verify the transaction payload and ensure it fails (the batch infos don't match) @@ -994,7 +1021,7 @@ mod test { // Create a quorum store payload with no transaction limit let proof_with_data = ProofWithData::new(vec![]); let ordered_payload = - Payload::QuorumStoreInlineHybrid(inline_batches.clone(), proof_with_data, None); + Payload::QuorumStoreInlineHybrid(inline_batches.clone(), proof_with_data, None, None); // Verify the transaction payload and ensure it fails (the transaction limit doesn't match) let error = transaction_payload @@ -1008,6 +1035,7 @@ mod test { vec![(create_batch_info(), Arc::new(vec![]))], proof_with_data, transaction_limit, + None, ); // Verify the transaction payload and ensure it fails (the inline batches don't match) @@ -1019,7 +1047,7 @@ mod test { // Create an empty quorum store payload let proof_with_data = ProofWithData::new(vec![]); let ordered_payload = - Payload::QuorumStoreInlineHybrid(vec![], proof_with_data, transaction_limit); + Payload::QuorumStoreInlineHybrid(vec![], proof_with_data, transaction_limit, None); // Verify the transaction payload and ensure it passes transaction_payload @@ -1373,6 +1401,7 @@ mod test { vec![], proofs.clone(), None, + None, vec![], ); @@ -1447,6 +1476,7 @@ mod 
test { vec![(Arc::new(signed_transactions.to_vec()), 0)], proofs.to_vec(), None, + None, inline_batches.to_vec(), ); diff --git a/consensus/src/consensus_observer/observer/payload_store.rs b/consensus/src/consensus_observer/observer/payload_store.rs index 9853148f5f1bf..30ea79e9b3c9e 100644 --- a/consensus/src/consensus_observer/observer/payload_store.rs +++ b/consensus/src/consensus_observer/observer/payload_store.rs @@ -1102,6 +1102,7 @@ mod test { vec![], proofs_of_store.clone(), None, + None, vec![], ); diff --git a/consensus/src/consensus_observer/publisher/consensus_publisher.rs b/consensus/src/consensus_observer/publisher/consensus_publisher.rs index 899901593f7ed..088a6ea4f98fc 100644 --- a/consensus/src/consensus_observer/publisher/consensus_publisher.rs +++ b/consensus/src/consensus_observer/publisher/consensus_publisher.rs @@ -543,6 +543,7 @@ mod test { vec![], vec![], Some(10), + None, vec![], ); let block_payload_message = ConsensusObserverMessage::new_block_payload_message( diff --git a/consensus/src/consensus_provider.rs b/consensus/src/consensus_provider.rs index 8667c9dec1226..7106265e32294 100644 --- a/consensus/src/consensus_provider.rs +++ b/consensus/src/consensus_provider.rs @@ -71,6 +71,7 @@ pub fn start_consensus( runtime.handle(), TransactionFilter::new(node_config.execution.transaction_filter.clone()), node_config.consensus.enable_pre_commit, + node_config.consensus.max_sending_block_txns_after_filtering, ); let time_service = Arc::new(ClockTimeService::new(runtime.handle().clone())); @@ -164,6 +165,7 @@ pub fn start_consensus_observer( consensus_observer_runtime.handle(), TransactionFilter::new(node_config.execution.transaction_filter.clone()), node_config.consensus.enable_pre_commit, + node_config.consensus.max_sending_block_txns_after_filtering, ); // Create the execution proxy client diff --git a/consensus/src/counters.rs b/consensus/src/counters.rs index 4086c4352adb0..e8d87cfe1231e 100644 --- a/consensus/src/counters.rs +++ b/consensus/src/counters.rs @@ -1218,6 +1218,16 @@ pub static MAX_TXNS_FROM_BLOCK_TO_EXECUTE: Lazy = Lazy::new(|| { .unwrap() }); +/// Histogram for the override block gas limit. +pub static BLOCK_GAS_LIMIT_OVERRIDE: Lazy = Lazy::new(|| { + register_histogram!( + "block_gas_limit_override", + "Histogram for the override block gas limit.", + exponential_buckets(/*start=*/ 1.5, /*factor=*/ 1.5, /*count=*/ 25).unwrap(), + ) + .unwrap() +}); + /// Count of the number of `DKG` validator transactions received while the feature is disabled. 
pub static UNEXPECTED_DKG_VTXN_COUNT: Lazy = Lazy::new(|| { register_int_counter!( diff --git a/consensus/src/dag/adapter.rs b/consensus/src/dag/adapter.rs index 35650eab6ee72..ba711c1b93d8c 100644 --- a/consensus/src/dag/adapter.rs +++ b/consensus/src/dag/adapter.rs @@ -195,7 +195,6 @@ impl OrderedNotifier for OrderedNotifierAdapter { parents_bitvec, node_digests, ), - vec![], StateComputeResult::new_dummy(), ); let block_info = block.block_info(); diff --git a/consensus/src/dag/dag_driver.rs b/consensus/src/dag/dag_driver.rs index fa0caee1faa8a..38becc67672ba 100644 --- a/consensus/src/dag/dag_driver.rs +++ b/consensus/src/dag/dag_driver.rs @@ -238,8 +238,12 @@ impl DagDriver { .map(|node_status| node_status.as_node()) .collect::>(); - let payload_filter = - PayloadFilter::from(&nodes.iter().map(|node| node.payload()).collect()); + let payload_filter = PayloadFilter::from( + &nodes + .iter() + .map(|node| (new_round, node.payload())) + .collect(), + ); let validator_txn_hashes = nodes .iter() .flat_map(|node| node.validator_txns()) @@ -273,6 +277,7 @@ impl DagDriver { pending_uncommitted_blocks: 0, recent_max_fill_fraction: 0.0, block_timestamp: self.time_service.now_unix_time(), + block_round: new_round, }, sys_payload_filter, Box::pin(async {}), diff --git a/consensus/src/dag/tests/helpers.rs b/consensus/src/dag/tests/helpers.rs index 7a9f430cdccec..40afe5d931203 100644 --- a/consensus/src/dag/tests/helpers.rs +++ b/consensus/src/dag/tests/helpers.rs @@ -25,10 +25,12 @@ pub(super) struct MockPayloadManager {} #[async_trait] impl TPayloadManager for MockPayloadManager { - fn prefetch_payload_data(&self, _payload: &Payload, _timestamp: u64) {} - fn notify_commit(&self, _block_timestamp: u64, _block: Option) {} + fn notify_ordered(&self, _block: PipelinedBlock) {} + + fn prefetch_payload_data(&self, _payload: &Payload, _timestamp: u64) {} + fn check_payload_availability(&self, _block: &Block) -> Result<(), BitVec> { unimplemented!() } @@ -36,8 +38,12 @@ impl TPayloadManager for MockPayloadManager { async fn get_transactions( &self, _block: &Block, - ) -> ExecutorResult<(Vec<(Arc>, u64)>, Option)> { - Ok((Vec::new(), None)) + ) -> ExecutorResult<( + Vec<(Arc>, u64)>, + Option, + Option, + )> { + Ok((Vec::new(), None, None)) } } diff --git a/consensus/src/epoch_manager.rs b/consensus/src/epoch_manager.rs index 1cc310dff2821..622f5a6803f72 100644 --- a/consensus/src/epoch_manager.rs +++ b/consensus/src/epoch_manager.rs @@ -871,6 +871,10 @@ impl EpochManager
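For the `BLOCK_GAS_LIMIT_OVERRIDE` histogram added in counters.rs above, `exponential_buckets(1.5, 1.5, 25)` yields upper bounds of 1.5 * 1.5^i, so the top finite bucket sits near 1.5^25, roughly 25,000 gas units. A quick sketch of the equivalent computation (hand-rolled here; the real helper comes from the metrics crate):

// Hand-rolled equivalent of the metrics crate's exponential_buckets helper:
// bucket i has upper bound start * factor^i.
fn exponential_buckets(start: f64, factor: f64, count: usize) -> Vec<f64> {
    (0..count).map(|i| start * factor.powi(i as i32)).collect()
}

fn main() {
    let buckets = exponential_buckets(1.5, 1.5, 25);
    assert!((buckets[0] - 1.5).abs() < 1e-9);
    // Top finite bucket is 1.5^25, about 25,251 gas units; anything larger
    // lands in the implicit +Inf overflow bucket.
    assert!(buckets[24] > 2.5e4 && buckets[24] < 2.6e4);
}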

{ onchain_consensus_config.max_failed_authors_to_store(), self.config .min_max_txns_in_block_after_filtering_from_backpressure, + onchain_execution_config + .block_executor_onchain_config() + .block_gas_limit_type + .block_gas_limit(), pipeline_backpressure_config, chain_health_backoff_config, self.quorum_store_enabled, diff --git a/consensus/src/execution_pipeline.rs b/consensus/src/execution_pipeline.rs index cf64f9d0cb917..f8e19f47e8ddf 100644 --- a/consensus/src/execution_pipeline.rs +++ b/consensus/src/execution_pipeline.rs @@ -104,6 +104,7 @@ impl ExecutionPipeline { &self, block: PipelinedBlock, block_window: OrderedBlockWindow, + max_block_txns: u64, metadata: BlockMetadataExt, parent_block_id: HashValue, txn_generator: BlockPreparer, @@ -118,6 +119,7 @@ impl ExecutionPipeline { .send(PrepareBlockCommand { block, block_window, + max_block_txns, metadata, block_executor_onchain_config, parent_block_id, @@ -153,8 +155,9 @@ impl ExecutionPipeline { let PrepareBlockCommand { block, block_window, + max_block_txns, metadata, - block_executor_onchain_config, + mut block_executor_onchain_config, parent_block_id, block_preparer, pre_commit_hook, @@ -164,17 +167,19 @@ impl ExecutionPipeline { } = command; counters::PREPARE_BLOCK_WAIT_TIME.observe_duration(command_creation_time.elapsed()); debug!("prepare_block received block {}.", block.id()); - let input_txns = block_preparer + let prepare_result = block_preparer .prepare_block(block.block(), &block_window) .await; - if let Err(e) = input_txns { + if let Err(e) = prepare_result { result_tx .send(Err(e)) .unwrap_or_else(log_failed_to_send_result("prepare_block", block.id())); return; } let validator_txns = block.validator_txns().cloned().unwrap_or_default(); - let input_txns = input_txns.expect("input_txns must be Some."); + let (input_txns, max_txns_to_execute, block_gas_limit) = + prepare_result.expect("input_txns must be Some."); + block_executor_onchain_config.block_gas_limit_override = block_gas_limit; tokio::task::spawn_blocking(move || { let txns_to_execute = Block::combine_to_input_transactions(validator_txns, input_txns.clone(), metadata); @@ -194,6 +199,8 @@ impl ExecutionPipeline { execute_block_tx .send(ExecuteBlockCommand { input_txns, + max_block_txns, + max_txns_to_execute, pipelined_block: block, block: (block_id, sig_verified_txns).into(), block_window, @@ -230,6 +237,8 @@ impl ExecutionPipeline { ) { 'outer: while let Some(ExecuteBlockCommand { input_txns: _, + max_block_txns, + max_txns_to_execute, pipelined_block, block, block_window, @@ -251,10 +260,11 @@ impl ExecutionPipeline { // TODO: lots of repeated code here monitor!("execute_wait_for_committed_transactions", { + let num_blocks_in_window = block_window.pipelined_blocks().len(); for b in block_window .pipelined_blocks() .iter() - .filter(|window_block| window_block.round() == pipelined_block.round() - 1) + .skip(num_blocks_in_window.saturating_sub(1)) { info!( "Execution: Waiting for committed transactions at block {} for block {}", @@ -293,52 +303,59 @@ impl ExecutionPipeline { } }); - let (mut txns, blocking_txns_provider) = - monitor!("execute_filter_block_committed_transactions", { - // TODO: Find a better way to do this. 
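Two plumbing points in these hunks are worth noting together: the proposal side reads the on-chain `block_gas_limit_type.block_gas_limit()` (an `Option<u64>`), while the execution pipeline writes a per-block `block_gas_limit_override` into the onchain config it hands to the executor. A hedged sketch of how such an override would typically resolve; the field and method names below are illustrative, not the exact executor-config API:

// Illustrative stand-in for the executor's onchain config resolution.
struct OnchainConfig {
    onchain_block_gas_limit: Option<u64>,  // from block_gas_limit_type.block_gas_limit()
    block_gas_limit_override: Option<u64>, // set per block by the execution pipeline
}

impl OnchainConfig {
    // Assumption: a per-block override, when present, wins over the static
    // on-chain limit; otherwise fall back to the on-chain value.
    fn effective_block_gas_limit(&self) -> Option<u64> {
        self.block_gas_limit_override.or(self.onchain_block_gas_limit)
    }
}

fn main() {
    let cfg = OnchainConfig {
        onchain_block_gas_limit: Some(30_000),
        block_gas_limit_override: Some(12_500),
    };
    assert_eq!(cfg.effective_block_gas_limit(), Some(12_500));
}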
- match block.transactions { - ExecutableTransactions::Unsharded(txns) => { - let transactions: Vec<_> = txns - .into_iter() - .filter(|txn| { - if let Valid(UserTransaction(user_txn)) = txn { - !committed_transactions.contains(&user_txn.committed_hash()) - } else { - true - } - }) - .collect(); - let transactions_len = transactions.len(); - ( - transactions, - Arc::new(BlockingTxnsProvider::new(transactions_len)), - ) - }, - ExecutableTransactions::UnshardedBlocking(_) => { - unimplemented!("Not expecting this yet.") - }, - ExecutableTransactions::Sharded(_) => { - unimplemented!("Sharded transactions are not supported yet.") - }, - } - }); - - let blocking_txns_writer = blocking_txns_provider.clone(); - let join_shuffle = tokio::task::spawn_blocking(move || { - // TODO: keep this previously split so we don't have to re-split it here - if let Some((first_user_txn_idx, _)) = txns.iter().find_position(|txn| { + let mut txns = monitor!("execute_filter_block_committed_transactions", { + // TODO: Find a better way to do this. + match block.transactions { + ExecutableTransactions::Unsharded(txns) => { + let transactions: Vec<_> = txns + .into_iter() + .filter(|txn| { + if let Valid(UserTransaction(user_txn)) = txn { + !committed_transactions.contains(&user_txn.committed_hash()) + } else { + true + } + }) + .collect(); + transactions + }, + ExecutableTransactions::UnshardedBlocking(_) => { + unimplemented!("Not expecting this yet.") + }, + ExecutableTransactions::Sharded(_) => { + unimplemented!("Sharded transactions are not supported yet.") + }, + } + }); + let num_validator_txns = if let Some((first_user_txn_idx, _)) = + txns.iter().find_position(|txn| { let txn = match txn { Valid(txn) => txn, Invalid(txn) => txn, }; matches!(txn, UserTransaction(_)) }) { + first_user_txn_idx + } else { + txns.len() + }; + let mut num_txns_to_execute = txns.len().min(max_block_txns as usize); + if let Some(max_user_txns_to_execute) = max_txns_to_execute { + num_txns_to_execute = num_txns_to_execute + .min(num_validator_txns.saturating_add(max_user_txns_to_execute as usize)); + } + let blocking_txns_provider = Arc::new(BlockingTxnsProvider::new(num_txns_to_execute)); + let blocking_txns_writer = blocking_txns_provider.clone(); + let join_shuffle = tokio::task::spawn_blocking(move || { + // TODO: keep this previously split so we don't have to re-split it here + if num_txns_to_execute > num_validator_txns { let timer = Instant::now(); - let validator_txns: Vec<_> = txns.drain(0..first_user_txn_idx).collect(); + let validator_txns: Vec<_> = txns.drain(0..num_validator_txns).collect(); info!( "Execution: Split validator txns from user txns in {} micros", timer.elapsed().as_micros() ); + // TODO: we could probably constrain this too with max_txns_to_execute let shuffle_iterator = crate::transaction_shuffler::use_case_aware::iterator::ShuffledTransactionIterator::new(crate::transaction_shuffler::use_case_aware::Config { sender_spread_factor: 32, platform_use_case_spread_factor: 0, @@ -347,13 +364,13 @@ impl ExecutionPipeline { for (idx, txn) in validator_txns .into_iter() .chain(shuffle_iterator) + .take(num_txns_to_execute) .enumerate() { blocking_txns_writer.set_txn(idx as TxnIndex, txn); } } else { - // No user transactions in the block. 
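The execute-side clamp above combines two caps: a hard `max_block_txns` over everything, and an optional backpressure cap that applies only to user transactions, since validator transactions at the front of the block always run. A worked example with hypothetical numbers:

// Execute-side clamp from the hunk above, with hypothetical numbers:
// validator txns always run; the user-txn cap tightens on top of the hard cap.
fn num_txns_to_execute(
    total_txns: usize,
    num_validator_txns: usize,
    max_block_txns: u64,
    max_user_txns_to_execute: Option<u64>,
) -> usize {
    let mut n = total_txns.min(max_block_txns as usize);
    if let Some(max_user) = max_user_txns_to_execute {
        n = n.min(num_validator_txns.saturating_add(max_user as usize));
    }
    n
}

fn main() {
    // 3 validator txns + 600 user txns after filtering and dedup.
    assert_eq!(num_txns_to_execute(603, 3, 500, None), 500);
    // A backpressure cap of 400 user txns tightens the result to 3 + 400.
    assert_eq!(num_txns_to_execute(603, 3, 500, Some(400)), 403);
}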
- for (idx, txn) in txns.into_iter().enumerate() { + for (idx, txn) in txns.into_iter().take(num_txns_to_execute).enumerate() { blocking_txns_writer.set_txn(idx as TxnIndex, txn); } } @@ -573,6 +590,7 @@ impl ExecutionPipeline { struct PrepareBlockCommand { block: PipelinedBlock, block_window: OrderedBlockWindow, + max_block_txns: u64, metadata: BlockMetadataExt, block_executor_onchain_config: BlockExecutorConfigFromOnchain, // The parent block id. @@ -586,6 +604,8 @@ struct PrepareBlockCommand { struct ExecuteBlockCommand { input_txns: Vec, + max_block_txns: u64, + max_txns_to_execute: Option, pipelined_block: PipelinedBlock, block: ExecutableBlock, block_window: OrderedBlockWindow, diff --git a/consensus/src/liveness/proposal_generator.rs b/consensus/src/liveness/proposal_generator.rs index 4c019abf51351..ca66e2354f7a1 100644 --- a/consensus/src/liveness/proposal_generator.rs +++ b/consensus/src/liveness/proposal_generator.rs @@ -156,65 +156,61 @@ impl PipelineBackpressureConfig { }) } - pub fn get_execution_block_size_backoff( + pub fn get_execution_block_gas_limit_backoff( &self, block_execution_times: &[ExecutionSummary], - max_block_txns: u64, + max_block_gas_limit: u64, ) -> Option { self.execution.as_ref().and_then(|config| { - let sizes = block_execution_times + let computed_target_block_gas_limits = block_execution_times .iter() .flat_map(|summary| { - // for each block, compute target (re-calibrated) block size + // for each block, compute target (re-calibrated) block gas limit let execution_time_ms = summary.execution_time.as_millis(); // Only block above the time threshold are considered giving enough signal to support calibration // so we filter out shorter locks - if execution_time_ms > config.min_block_time_ms_to_activate as u128 - && summary.payload_len > 0 - { - // TODO: After cost of "retries" is reduced with execution pool, we - // should be computing block gas limit here, simply as: - // `config.target_block_time_ms / execution_time_ms * gas_consumed_by_block`` - // - // Until then, we need to compute wanted block size to create. - // Unfortunatelly, there is multiple layers where transactions are filtered. - // After deduping/reordering logic is applied, max_txns_to_execute limits the transactions - // passed to executor (`summary.payload_len` here), and then some are discarded for various - // reasons, which we approximate are cheaply ignored. - // For the rest, only `summary.to_commit` fraction of `summary.to_commit + summary.to_retry` - // was executed. And so assuming same discard rate, we scale `summary.payload_len` with it. 
- Some( - ((config.target_block_time_ms as f64 / execution_time_ms as f64 - * (summary.to_commit as f64 - / (summary.to_commit + summary.to_retry) as f64) - * summary.payload_len as f64) - .floor() as u64) - .max(1), - ) + if execution_time_ms > config.min_block_time_ms_to_activate as u128 { + if let Some(gas_used) = summary.gas_used { + Some( + ((config.target_block_time_ms as f64 / execution_time_ms as f64 + * gas_used as f64) + .floor() as u64) + .max(1), + ) + } else { + warn!("Block execution summary missing gas used, skipping"); + None + } } else { None } }) .sorted() .collect::>(); - if sizes.len() >= config.min_blocks_to_activate { - let calibrated_block_size = (*sizes - .get(((config.percentile * sizes.len() as f64) as usize).min(sizes.len() - 1)) + if computed_target_block_gas_limits.len() >= config.min_blocks_to_activate { + let computed_target_block_gas_limit = (*computed_target_block_gas_limits + .get( + ((config.percentile * computed_target_block_gas_limits.len() as f64) + as usize) + .min(computed_target_block_gas_limits.len() - 1), + ) .expect("guaranteed to be within vector size")) - .max(config.min_calibrated_txns_per_block); - PROPOSER_ESTIMATED_CALIBRATED_BLOCK_TXNS.observe(calibrated_block_size as f64); - // Check if calibrated block size is reduction in size, to turn on backpressure. - if max_block_txns > calibrated_block_size { + .max(config.min_calibrated_block_gas_limit); + PROPOSER_ESTIMATED_CALIBRATED_BLOCK_TXNS + .observe(computed_target_block_gas_limit as f64); + // Check if calibrated block gas limit is a reduction, to turn on backpressure. + if max_block_gas_limit > computed_target_block_gas_limit { warn!( block_execution_times = format!("{:?}", block_execution_times), - estimated_calibrated_block_sizes = format!("{:?}", sizes), - calibrated_block_size = calibrated_block_size, + computed_target_block_gas_limits = + format!("{:?}", computed_target_block_gas_limits), + computed_target_block_gas_limit = computed_target_block_gas_limit, "Execution backpressure recalibration: proposing reducing from {} to {}", - max_block_txns, - calibrated_block_size, + max_block_gas_limit, + computed_target_block_gas_limit, ); - Some(calibrated_block_size) + Some(computed_target_block_gas_limit) } else { None } @@ -255,11 +251,11 @@ pub struct ProposalGenerator { max_inline_txns: PayloadTxnsSize, // Max number of failed authors to be added to a proposed block. max_failed_authors_to_store: usize, - /// If backpressure target block size is below it, update `max_txns_to_execute` instead. /// Applied to execution, pipeline and chain health backpressure. /// Needed as we cannot subsplit QS batches. 
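The recalibration above replaces the old block-size heuristic with a direct gas computation: each block whose execution time clears the activation threshold contributes a target of `target_block_time_ms / execution_time_ms * gas_used`, the per-block targets are sorted, a configured percentile is taken, and backpressure engages only when that target undercuts the max block gas limit. A worked example of the per-block step:

// Per-block recalibration step from the hunk above: scale the gas actually
// consumed by the ratio of target block time to observed execution time.
fn target_gas_limit(target_block_time_ms: u64, execution_time_ms: u64, gas_used: u64) -> u64 {
    ((target_block_time_ms as f64 / execution_time_ms as f64) * gas_used as f64)
        .floor()
        .max(1.0) as u64
}

fn main() {
    // A block that burned 20,000 gas in 400ms against a 250ms target suggests
    // a ~12,500 gas limit to pull latency back in line.
    assert_eq!(target_gas_limit(250, 400, 20_000), 12_500);
    // Backpressure engages only if the chosen percentile of these per-block
    // targets falls below the configured max block gas limit.
}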
min_max_txns_in_block_after_filtering_from_backpressure: u64, + max_block_gas_limit: Option, pipeline_backpressure_config: PipelineBackpressureConfig, chain_health_backoff_config: ChainHealthBackoffConfig, @@ -286,6 +282,7 @@ impl ProposalGenerator { max_inline_txns: PayloadTxnsSize, max_failed_authors_to_store: usize, min_max_txns_in_block_after_filtering_from_backpressure: u64, + max_block_gas_limit: Option, pipeline_backpressure_config: PipelineBackpressureConfig, chain_health_backoff_config: ChainHealthBackoffConfig, quorum_store_enabled: bool, @@ -304,6 +301,7 @@ impl ProposalGenerator { min_max_txns_in_block_after_filtering_from_backpressure, max_inline_txns, max_failed_authors_to_store, + max_block_gas_limit, pipeline_backpressure_config, chain_health_backoff_config, last_round_generated: Mutex::new(0), @@ -389,7 +387,7 @@ impl ProposalGenerator { // parent (including) up to the root (including). let exclude_payload: Vec<_> = pending_blocks .iter() - .flat_map(|block| block.payload()) + .flat_map(|block| block.payload().map(|p| (block.round(), p))) .collect(); let payload_filter = PayloadFilter::from(&exclude_payload); @@ -411,6 +409,7 @@ impl ProposalGenerator { max_block_txns, max_block_txns_after_filtering, max_txns_from_block_to_execute, + block_gas_limit, proposal_delay, ) = self .calculate_max_block_sizes(voting_power_ratio, timestamp, round) @@ -460,7 +459,7 @@ impl ProposalGenerator { max_poll_time: self.quorum_store_poll_time.saturating_sub(proposal_delay), max_txns: max_block_txns, max_txns_after_filtering: max_block_txns_after_filtering, - soft_max_txns_after_filtering: max_txns_from_block_to_execute + soft_max_txns_after_filtering: block_gas_limit .unwrap_or(max_block_txns_after_filtering), max_inline_txns: self.max_inline_txns, maybe_optqs_payload_pull_params, @@ -469,6 +468,7 @@ impl ProposalGenerator { pending_uncommitted_blocks: pending_blocks.len(), recent_max_fill_fraction: max_fill_fraction, block_timestamp: timestamp, + block_round: round, }, validator_txn_filter, wait_callback, @@ -476,11 +476,14 @@ impl ProposalGenerator { .await .context("Fail to retrieve payload")?; + // TODO: does this condition make sense? if !payload.is_direct() - && max_txns_from_block_to_execute.is_some() - && max_txns_from_block_to_execute.map_or(false, |v| payload.len() as u64 > v) + && (max_txns_from_block_to_execute.is_some() + && max_txns_from_block_to_execute.map_or(false, |v| payload.len() as u64 > v)) + || (block_gas_limit.is_some()) { - payload = payload.transform_to_quorum_store_v2(max_txns_from_block_to_execute); + payload = payload + .transform_to_quorum_store_v2(max_txns_from_block_to_execute, block_gas_limit); } (validator_txns, payload, timestamp.as_micros() as u64) }; @@ -522,10 +525,13 @@ impl ProposalGenerator { voting_power_ratio: f64, timestamp: Duration, round: Round, - ) -> (PayloadTxnsSize, u64, Option, Duration) { + ) -> (PayloadTxnsSize, u64, Option, Option, Duration) { + // TODO: need to use the block sizes from other backpressures, and treat block gas limit from execution backpressure separately. + // TODO: maybe ideally the other backpressures also use block gas limit, but it's not entirely clear how they would translate txns to gas limit. (% of max?) 
let mut values_max_block_txns_after_filtering = vec![self.max_block_txns_after_filtering]; let mut values_max_block = vec![self.max_block_txns]; let mut values_proposal_delay = vec![Duration::ZERO]; + let mut block_gas_limit = None; let chain_health_backoff = self .chain_health_backoff_config @@ -561,18 +567,20 @@ impl ProposalGenerator { }; let mut execution_backpressure_applied = false; - if let Some(config) = &self.pipeline_backpressure_config.execution { - let execution_backpressure = self - .pipeline_backpressure_config - .get_execution_block_size_backoff( - &self - .block_store - .get_recent_block_execution_times(config.num_blocks_to_look_at), - self.max_block_txns_after_filtering, - ); - if let Some(execution_backpressure_block_size) = execution_backpressure { - values_max_block_txns_after_filtering.push(execution_backpressure_block_size); - execution_backpressure_applied = true; + if let Some(max_block_gas_limit) = self.max_block_gas_limit { + if let Some(config) = &self.pipeline_backpressure_config.execution { + let execution_backpressure = self + .pipeline_backpressure_config + .get_execution_block_gas_limit_backoff( + &self + .block_store + .get_recent_block_execution_times(config.num_blocks_to_look_at), + max_block_gas_limit, + ); + if let Some(execution_backpressure_block_gas_limit) = execution_backpressure { + block_gas_limit = Some(execution_backpressure_block_gas_limit); + execution_backpressure_applied = true; + } } } EXECUTION_BACKPRESSURE_ON_PROPOSAL_TRIGGERED.observe( @@ -616,6 +624,7 @@ impl ProposalGenerator { max_txns_from_block_to_execute = max_txns_from_block_to_execute.unwrap_or(max_block_txns_after_filtering), max_block_size = max_block_size, + block_gas_limit = block_gas_limit.unwrap_or(self.max_block_gas_limit.unwrap_or(0)), is_pipeline_backpressure = pipeline_backpressure.is_some(), is_execution_backpressure = execution_backpressure_applied, is_chain_health_backoff = chain_health_backoff.is_some(), @@ -627,6 +636,7 @@ impl ProposalGenerator { max_block_size, max_block_txns_after_filtering, max_txns_from_block_to_execute, + block_gas_limit, proposal_delay, ) } diff --git a/consensus/src/liveness/proposal_generator_test.rs b/consensus/src/liveness/proposal_generator_test.rs index 5aa907d7fe672..a26a18fbe849c 100644 --- a/consensus/src/liveness/proposal_generator_test.rs +++ b/consensus/src/liveness/proposal_generator_test.rs @@ -25,6 +25,8 @@ use aptos_types::{on_chain_config::ValidatorTxnConfig, validator_signer::Validat use futures::{future::BoxFuture, FutureExt}; use std::{sync::Arc, time::Duration}; +const MAX_BLOCK_GAS_LIMIT: u64 = 30_000; + fn empty_callback() -> BoxFuture<'static, ()> { async move {}.boxed() } @@ -52,6 +54,7 @@ async fn test_proposal_generation_empty_tree() { PayloadTxnsSize::new(1, 10), 10, 1, + Some(MAX_BLOCK_GAS_LIMIT), PipelineBackpressureConfig::new_no_backoff(), ChainHealthBackoffConfig::new_no_backoff(), false, @@ -98,6 +101,7 @@ async fn test_proposal_generation_parent() { PayloadTxnsSize::new(1, 500), 10, 1, + Some(MAX_BLOCK_GAS_LIMIT), PipelineBackpressureConfig::new_no_backoff(), ChainHealthBackoffConfig::new_no_backoff(), false, @@ -174,6 +178,7 @@ async fn test_old_proposal_generation() { PayloadTxnsSize::new(1, 500), 10, 1, + Some(MAX_BLOCK_GAS_LIMIT), PipelineBackpressureConfig::new_no_backoff(), ChainHealthBackoffConfig::new_no_backoff(), false, @@ -215,6 +220,7 @@ async fn test_correct_failed_authors() { PayloadTxnsSize::new(1, 500), 10, 1, + Some(MAX_BLOCK_GAS_LIMIT), PipelineBackpressureConfig::new_no_backoff(), 
ChainHealthBackoffConfig::new_no_backoff(), false, diff --git a/consensus/src/payload_client/mixed.rs b/consensus/src/payload_client/mixed.rs index afc981ab4a3b8..34b58b1c5b896 100644 --- a/consensus/src/payload_client/mixed.rs +++ b/consensus/src/payload_client/mixed.rs @@ -157,6 +157,7 @@ mod tests { 0, 0., aptos_infallible::duration_since_epoch(), + 1, ), vtxn_pool::TransactionFilter::PendingTxnHashSet(HashSet::new()), Box::pin(async {}), @@ -185,6 +186,7 @@ mod tests { 0, 0., aptos_infallible::duration_since_epoch(), + 1, ), vtxn_pool::TransactionFilter::PendingTxnHashSet(HashSet::new()), Box::pin(async {}), @@ -213,6 +215,7 @@ mod tests { 0, 0., aptos_infallible::duration_since_epoch(), + 1, ), vtxn_pool::TransactionFilter::PendingTxnHashSet(HashSet::new()), Box::pin(async {}), @@ -241,6 +244,7 @@ mod tests { 0, 0., aptos_infallible::duration_since_epoch(), + 1, ), vtxn_pool::TransactionFilter::PendingTxnHashSet(HashSet::new()), Box::pin(async {}), @@ -287,6 +291,7 @@ mod tests { 0, 0., aptos_infallible::duration_since_epoch(), + 1, ), vtxn_pool::TransactionFilter::PendingTxnHashSet(HashSet::new()), Box::pin(async {}), diff --git a/consensus/src/payload_client/user/quorum_store_client.rs b/consensus/src/payload_client/user/quorum_store_client.rs index cd71f9261688e..9063c607cf892 100644 --- a/consensus/src/payload_client/user/quorum_store_client.rs +++ b/consensus/src/payload_client/user/quorum_store_client.rs @@ -6,7 +6,7 @@ use crate::{ payload_client::user::UserPayloadClient, }; use aptos_consensus_types::{ - common::{Payload, PayloadFilter}, + common::{Payload, PayloadFilter, Round}, payload_pull_params::{OptQSPayloadPullParams, PayloadPullParameters}, request_response::{GetPayloadCommand, GetPayloadRequest, GetPayloadResponse}, utils::PayloadTxnsSize, @@ -55,6 +55,7 @@ impl QuorumStoreClient { return_non_full: bool, exclude_payloads: PayloadFilter, block_timestamp: Duration, + block_round: Round, ) -> anyhow::Result { let (callback, callback_rcv) = oneshot::channel(); let req = GetPayloadCommand::GetPayloadRequest(GetPayloadRequest { @@ -67,6 +68,7 @@ impl QuorumStoreClient { return_non_full, callback, block_timestamp, + block_round, }); // send to shared mempool self.consensus_to_quorum_store_sender @@ -122,6 +124,7 @@ impl UserPayloadClient for QuorumStoreClient { return_non_full || return_empty || done, params.user_txn_filter.clone(), params.block_timestamp, + params.block_round, ) .await?; if payload.is_empty() && !return_empty && !done { diff --git a/consensus/src/payload_manager.rs b/consensus/src/payload_manager.rs index b5593cc99f668..309fe55ef128f 100644 --- a/consensus/src/payload_manager.rs +++ b/consensus/src/payload_manager.rs @@ -44,6 +44,8 @@ pub trait TPayloadManager: Send + Sync { /// transactions in the block's payload are no longer required for consensus. fn notify_commit(&self, block_timestamp: u64, block: Option); + fn notify_ordered(&self, block: PipelinedBlock); + /// Prefetch the data for a payload. This is used to ensure that the data for a payload is /// available when block is executed. fn prefetch_payload_data(&self, payload: &Payload, timestamp: u64); @@ -57,7 +59,11 @@ pub trait TPayloadManager: Send + Sync { async fn get_transactions( &self, block: &Block, - ) -> ExecutorResult<(Vec<(Arc>, u64)>, Option)>; + ) -> ExecutorResult<( + Vec<(Arc>, u64)>, + Option, + Option, + )>; } /// A payload manager that directly returns the transactions in a block's payload. 
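`pull_payload` now threads the proposal round into `GetPayloadRequest` alongside the oneshot callback, so the quorum store can tag its exclusions with the round they came from. A minimal sketch of that request/response pattern, using tokio's oneshot and stand-in payload types (assuming a tokio runtime with default features):

use tokio::sync::oneshot;

type Round = u64;

// Stand-in for GetPayloadRequest: the proposal round rides along with the
// oneshot callback the quorum store answers on.
struct Request {
    block_round: Round,
    callback: oneshot::Sender<Vec<u8>>, // payload stand-in
}

#[tokio::main]
async fn main() {
    let (callback, callback_rcv) = oneshot::channel();
    let request = Request { block_round: 7, callback };

    // Quorum store side: the response can depend on the round it was handed.
    tokio::spawn(async move {
        let payload = vec![request.block_round as u8];
        let _ = request.callback.send(payload);
    });

    let payload = callback_rcv.await.expect("sender dropped");
    assert_eq!(payload, vec![7]);
}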
@@ -73,6 +79,8 @@ impl DirectMempoolPayloadManager { impl TPayloadManager for DirectMempoolPayloadManager { fn notify_commit(&self, _block_timestamp: u64, _block: Option) {} + fn notify_ordered(&self, _block: PipelinedBlock) {} + fn prefetch_payload_data(&self, _payload: &Payload, _timestamp: u64) {} fn check_payload_availability(&self, _block: &Block) -> Result<(), BitVec> { @@ -82,13 +90,17 @@ impl TPayloadManager for DirectMempoolPayloadManager { async fn get_transactions( &self, block: &Block, - ) -> ExecutorResult<(Vec<(Arc>, u64)>, Option)> { + ) -> ExecutorResult<( + Vec<(Arc>, u64)>, + Option, + Option, + )> { let Some(payload) = block.payload() else { - return Ok((Vec::new(), None)); + return Ok((Vec::new(), None, None)); }; match payload { - Payload::DirectMempool(txns) => Ok((vec![(Arc::new(txns.clone()), 0)], None)), + Payload::DirectMempool(txns) => Ok((vec![(Arc::new(txns.clone()), 0)], None, None)), _ => unreachable!( "DirectMempoolPayloadManager: Unacceptable payload type {}. Epoch: {}, Round: {}, Block: {}", payload, @@ -176,7 +188,7 @@ impl QuorumStorePayloadManager { batches.push(proof.info().clone()); } }, - Payload::QuorumStoreInlineHybrid(inline_batches, proof_with_data, _) => { + Payload::QuorumStoreInlineHybrid(inline_batches, proof_with_data, _, _) => { for (batch_info, _) in inline_batches.iter() { batches.push(batch_info.clone()); } @@ -253,6 +265,16 @@ impl TPayloadManager for QuorumStorePayloadManager { } } + fn notify_ordered(&self, block: PipelinedBlock) { + let mut tx = self.coordinator_tx.clone(); + if let Err(e) = tx.try_send(CoordinatorCommand::OrderedNotification(block)) { + warn!( + "BlockOrdered notification failed. Is the epoch shutting down? error: {}", + e + ); + } + } + fn prefetch_payload_data(&self, payload: &Payload, timestamp: u64) { // This is deprecated. 
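`notify_ordered` above is deliberately best-effort: it clones the coordinator sender and uses `try_send`, logging and dropping the notification if the channel is full or the epoch is shutting down, rather than stalling ordering. The same pattern with std's bounded channel as a stand-in for the aptos channel type:

use std::sync::mpsc::{sync_channel, SyncSender, TrySendError};

enum CoordinatorCommand {
    OrderedNotification(u64), // block round stands in for PipelinedBlock
}

// Best-effort notification: never block consensus ordering on a busy or
// shutting-down quorum store; log and drop instead.
fn notify_ordered(tx: &SyncSender<CoordinatorCommand>, round: u64) {
    if let Err(e) = tx.try_send(CoordinatorCommand::OrderedNotification(round)) {
        match e {
            TrySendError::Full(_) => eprintln!("coordinator busy, dropping notification"),
            TrySendError::Disconnected(_) => eprintln!("epoch shutting down"),
        }
    }
}

fn main() {
    let (tx, _rx) = sync_channel(1);
    notify_ordered(&tx, 42);
}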
// TODO(ibalajiarun): Remove this after migrating to OptQuorumStore type @@ -318,7 +340,7 @@ impl TPayloadManager for QuorumStorePayloadManager { self.batch_reader.clone(), ); }, - Payload::QuorumStoreInlineHybrid(_, proof_with_data, _) => { + Payload::QuorumStoreInlineHybrid(_, proof_with_data, _, _) => { request_txns_and_update_status(proof_with_data, self.batch_reader.clone()); }, Payload::DirectMempool(_) => { @@ -352,7 +374,7 @@ impl TPayloadManager for QuorumStorePayloadManager { }, Payload::InQuorumStore(_) => Ok(()), Payload::InQuorumStoreWithLimit(_) => Ok(()), - Payload::QuorumStoreInlineHybrid(inline_batches, proofs, _) => { + Payload::QuorumStoreInlineHybrid(inline_batches, proofs, _, _) => { fn update_availability_metrics<'a>( batch_reader: &Arc, is_proof_label: &str, @@ -423,7 +445,11 @@ impl TPayloadManager for QuorumStorePayloadManager { async fn get_transactions( &self, block: &Block, - ) -> ExecutorResult<(Vec<(Arc>, u64)>, Option)> { + ) -> ExecutorResult<( + Vec<(Arc>, u64)>, + Option, + Option, + )> { info!( "get_transactions for block ({}, {}) started.", block.epoch(), @@ -435,7 +461,7 @@ impl TPayloadManager for QuorumStorePayloadManager { block.epoch(), block.round() ); - return Ok((Vec::new(), None)); + return Ok((Vec::new(), None, None)); }; let transaction_payload = match payload { @@ -464,12 +490,14 @@ impl TPayloadManager for QuorumStorePayloadManager { transactions, proof_with_data.proof_with_data.proofs.clone(), proof_with_data.max_txns_to_execute, + proof_with_data.block_gas_limit, ) }, Payload::QuorumStoreInlineHybrid( inline_batches, proof_with_data, max_txns_to_execute, + block_gas_limit, ) => { let all_transactions = { let mut all_txns = process_payload( @@ -496,6 +524,7 @@ impl TPayloadManager for QuorumStorePayloadManager { all_transactions, proof_with_data.proofs.clone(), *max_txns_to_execute, + *block_gas_limit, inline_batches, ) }, @@ -521,6 +550,7 @@ impl TPayloadManager for QuorumStorePayloadManager { all_txns, opt_qs_payload.proof_with_data().deref().clone(), opt_qs_payload.max_txns_to_execute(), + opt_qs_payload.block_gas_limit(), [ opt_qs_payload.opt_batches().deref().clone(), opt_qs_payload.inline_batches().batch_infos(), @@ -553,7 +583,8 @@ impl TPayloadManager for QuorumStorePayloadManager { Ok(( transaction_payload.transactions(), - transaction_payload.limit(), + transaction_payload.transaction_limit(), + transaction_payload.block_gas_limit(), )) } } @@ -563,7 +594,11 @@ async fn get_transactions_for_observer( block: &Block, block_payloads: &Arc>>, consensus_publisher: &Option>, -) -> ExecutorResult<(Vec<(Arc>, u64)>, Option)> { +) -> ExecutorResult<( + Vec<(Arc>, u64)>, + Option, + Option, +)> { // The data should already be available (as consensus observer will only ever // forward a block to the executor once the data has been received and verified). 
let block_payload = match block_payloads.lock().entry((block.epoch(), block.round())) { @@ -603,7 +638,8 @@ async fn get_transactions_for_observer( // Return the transactions and the transaction limit Ok(( transaction_payload.transactions(), - transaction_payload.limit(), + transaction_payload.transaction_limit(), + transaction_payload.block_gas_limit(), )) } @@ -810,6 +846,10 @@ impl TPayloadManager for ConsensusObserverPayloadManager { // noop } + fn notify_ordered(&self, _block: PipelinedBlock) { + // noop + } + fn prefetch_payload_data(&self, _payload: &Payload, _timestamp: u64) { // noop } @@ -821,7 +861,11 @@ impl TPayloadManager for ConsensusObserverPayloadManager { async fn get_transactions( &self, block: &Block, - ) -> ExecutorResult<(Vec<(Arc>, u64)>, Option)> { + ) -> ExecutorResult<( + Vec<(Arc>, u64)>, + Option, + Option, + )> { return get_transactions_for_observer(block, &self.txns_pool, &self.consensus_publisher) .await; } diff --git a/consensus/src/pipeline/buffer_item.rs b/consensus/src/pipeline/buffer_item.rs index 67fcca255875e..d42dc9c891f79 100644 --- a/consensus/src/pipeline/buffer_item.rs +++ b/consensus/src/pipeline/buffer_item.rs @@ -517,7 +517,6 @@ mod test { BlockData::dummy_with_validator_txns(vec![]), None, ), - vec![], StateComputeResult::new_dummy(), ) } diff --git a/consensus/src/pipeline/tests/execution_phase_tests.rs b/consensus/src/pipeline/tests/execution_phase_tests.rs index a0d7eb6a76c43..acf95a1740233 100644 --- a/consensus/src/pipeline/tests/execution_phase_tests.rs +++ b/consensus/src/pipeline/tests/execution_phase_tests.rs @@ -90,11 +90,7 @@ fn add_execution_phase_test_cases( // happy path phase_tester.add_test_case( ExecutionRequest { - ordered_blocks: vec![PipelinedBlock::new( - block, - vec![], - StateComputeResult::new_dummy(), - )], + ordered_blocks: vec![PipelinedBlock::new(block, StateComputeResult::new_dummy())], lifetime_guard: dummy_guard(), }, Box::new(move |resp| { @@ -132,7 +128,6 @@ fn add_execution_phase_test_cases( ExecutionRequest { ordered_blocks: vec![PipelinedBlock::new( bad_block, - vec![], StateComputeResult::new_dummy(), )], lifetime_guard: dummy_guard(), diff --git a/consensus/src/pipeline/tests/test_utils.rs b/consensus/src/pipeline/tests/test_utils.rs index b0c2243ed659e..8ca54e489cdda 100644 --- a/consensus/src/pipeline/tests/test_utils.rs +++ b/consensus/src/pipeline/tests/test_utils.rs @@ -111,9 +111,7 @@ pub fn prepare_executed_blocks_with_ledger_info( let executed_blocks: Vec = proposals .iter() - .map(|proposal| { - PipelinedBlock::new(proposal.block().clone(), vec![], compute_result.clone()) - }) + .map(|proposal| PipelinedBlock::new(proposal.block().clone(), compute_result.clone())) .collect(); (executed_blocks, li_sig, proposals) diff --git a/consensus/src/quorum_store/batch_proof_queue.rs b/consensus/src/quorum_store/batch_proof_queue.rs index 8a91f6059f860..53f2e6c12cc75 100644 --- a/consensus/src/quorum_store/batch_proof_queue.rs +++ b/consensus/src/quorum_store/batch_proof_queue.rs @@ -7,8 +7,9 @@ use super::{ }; use crate::quorum_store::counters; use aptos_consensus_types::{ - common::{Author, TxnSummaryWithExpiration}, + common::{Author, PayloadFilter, Round, TxnSummaryWithExpiration}, payload::TDataInfo, + pipelined_block::PipelinedBlock, proof_of_store::{BatchInfo, ProofOfStore}, utils::PayloadTxnsSize, }; @@ -16,7 +17,6 @@ use aptos_logger::{info, sample, sample::SampleRate, warn}; use aptos_metrics_core::TimerHelper; use aptos_short_hex_str::AsShortHexStr; use 
aptos_types::{transaction::SignedTransaction, PeerId}; -use dashmap::DashSet; use rand::{prelude::SliceRandom, thread_rng}; use std::{ cmp::Reverse, @@ -55,6 +55,22 @@ impl QueueItem { } } +struct TxnDuplicatesCache { + pub cache: HashMap, + pub expiration_rounds: TimeExpirations, + pub latest_block_round: Round, +} + +impl TxnDuplicatesCache { + fn new() -> Self { + Self { + cache: HashMap::new(), + expiration_rounds: TimeExpirations::new(), + latest_block_round: 0, + } + } +} + pub struct BatchProofQueue { my_peer_id: PeerId, // Queue per peer to ensure fairness between peers and priority within peer @@ -67,6 +83,9 @@ pub struct BatchProofQueue { // Expiration index expirations: TimeExpirations, batch_store: Arc, + // For duplicate accounting + txn_duplicates_cache: TxnDuplicatesCache, + txn_duplicates_window: u64, latest_block_timestamp: u64, remaining_txns_with_duplicates: u64, @@ -90,6 +109,9 @@ impl BatchProofQueue { txn_summary_num_occurrences: HashMap::new(), expirations: TimeExpirations::new(), batch_store, + txn_duplicates_cache: TxnDuplicatesCache::new(), + // TODO: take the block window + txn_duplicates_window: 20, latest_block_timestamp: 0, remaining_txns_with_duplicates: 0, remaining_proofs: 0, @@ -342,14 +364,14 @@ impl BatchProofQueue { fn log_remaining_data_after_pull( &self, - excluded_batches: &HashSet, + excluded_batches: &HashSet<(Round, BatchInfo)>, pulled_proofs: &[ProofOfStore], ) { let mut num_proofs_remaining_after_pull = 0; let mut num_txns_remaining_after_pull = 0; let excluded_batch_keys = excluded_batches .iter() - .map(BatchKey::from_info) + .map(|(_, batch)| BatchKey::from_info(batch)) .collect::>(); let remaining_batches = self @@ -401,7 +423,7 @@ impl BatchProofQueue { // whether the proof queue is fully utilized. pub(crate) fn pull_proofs( &mut self, - excluded_batches: &HashSet, + excluded_batches: &HashSet<(Round, BatchInfo)>, max_txns: PayloadTxnsSize, max_txns_after_filtering: u64, soft_max_txns_after_filtering: u64, @@ -457,7 +479,7 @@ impl BatchProofQueue { pub fn pull_batches( &mut self, - excluded_batches: &HashSet, + excluded_batches: &HashSet<(Round, BatchInfo)>, exclude_authors: &HashSet, max_txns: PayloadTxnsSize, max_txns_after_filtering: u64, @@ -490,7 +512,7 @@ impl BatchProofQueue { pub fn pull_batches_internal( &mut self, - excluded_batches: &HashSet, + excluded_batches: &HashSet<(Round, BatchInfo)>, exclude_authors: &HashSet, max_txns: PayloadTxnsSize, max_txns_after_filtering: u64, @@ -516,7 +538,7 @@ impl BatchProofQueue { pub fn pull_batches_with_transactions( &mut self, - excluded_batches: &HashSet, + excluded_batches: &HashSet<(Round, BatchInfo)>, max_txns: PayloadTxnsSize, max_txns_after_filtering: u64, soft_max_txns_after_filtering: u64, @@ -563,7 +585,7 @@ impl BatchProofQueue { fn pull_internal( &mut self, batches_without_proofs: bool, - excluded_batches: &HashSet, + excluded_batches: &HashSet<(Round, BatchInfo)>, exclude_authors: &HashSet, max_txns: PayloadTxnsSize, max_txns_after_filtering: u64, @@ -580,24 +602,26 @@ impl BatchProofQueue { let mut excluded_txns = 0; let mut full = false; - let filtered_txns: DashSet = DashSet::new(); - // let num_all_txns = excluded_batches - // .iter() - // .map(|batch| batch.num_txns() as usize) - // .sum(); - // let filtered_txns = DashSet::with_capacity(num_all_txns); - // excluded_batches.par_iter().for_each(|batch_info| { - // let batch_key = BatchKey::from_info(batch_info); - // if let Some(txn_summaries) = self - // .items - // .get(&batch_key) - // .and_then(|item| 
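The new `TxnDuplicatesCache` above is a reference-counted membership set over the recent block window: `handle_ordered` (in a later hunk) bumps a count for every transaction summary in an ordered block and schedules it to expire `txn_duplicates_window` rounds later, decrementing and removing on expiry, so `pull_internal`'s duplicate check becomes a cheap lookup. A self-contained model of that bookkeeping, where a plain `Vec` of pending expirations stands in for the crate's `TimeExpirations` priority queue and `u64` stands in for `TxnSummaryWithExpiration`:

use std::collections::HashMap;

// Simplified model of TxnDuplicatesCache: a refcounted membership set over
// the last `window` rounds.
struct DuplicatesCache {
    counts: HashMap<u64, u64>, // txn summary -> occurrences still in window
    added_at: Vec<(u64, u64)>, // (round added, txn) pending expiration
    window: u64,
}

impl DuplicatesCache {
    fn handle_ordered(&mut self, round: u64, txns: &[u64]) {
        // GC: drop contributions from blocks that fell out of the window.
        let cutoff = round.saturating_sub(self.window);
        let (expired, kept): (Vec<_>, Vec<_>) =
            self.added_at.drain(..).partition(|(r, _)| *r <= cutoff);
        self.added_at = kept;
        for (_, txn) in expired {
            if let Some(c) = self.counts.get_mut(&txn) {
                *c -= 1;
                if *c == 0 {
                    self.counts.remove(&txn);
                }
            }
        }
        // Account for the newly ordered block's transactions.
        for &txn in txns {
            *self.counts.entry(txn).or_insert(0) += 1;
            self.added_at.push((round, txn));
        }
    }

    // pull_internal's duplicate check reduces to a membership test here.
    fn is_duplicate(&self, txn: u64) -> bool {
        self.counts.contains_key(&txn)
    }
}

fn main() {
    let mut cache = DuplicatesCache {
        counts: HashMap::new(),
        added_at: Vec::new(),
        window: 2,
    };
    cache.handle_ordered(10, &[7]);
    assert!(cache.is_duplicate(7));
    cache.handle_ordered(13, &[]); // 10 <= 13 - 2, so txn 7 expires
    assert!(!cache.is_duplicate(7));
}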
-        //         .and_then(|item| item.txn_summaries.as_ref())
-        //     {
-        //         for txn_summary in txn_summaries {
-        //             filtered_txns.insert(*txn_summary);
-        //         }
-        //     }
-        // });
+        let mut recent_filtered_txns: HashSet<_> = HashSet::new();
+        excluded_batches
+            .iter()
+            .filter(|(round, _)| *round > self.txn_duplicates_cache.latest_block_round)
+            .for_each(|(_, batch_info)| {
+                let batch_key = BatchKey::from_info(batch_info);
+                if let Some(txn_summaries) = self
+                    .items
+                    .get(&batch_key)
+                    .and_then(|item| item.txn_summaries.as_ref())
+                {
+                    for txn_summary in txn_summaries {
+                        recent_filtered_txns.insert(*txn_summary);
+                    }
+                }
+            });
+        let all_excluded_batches: HashSet<_> = excluded_batches
+            .iter()
+            .map(|(_, batch_info)| batch_info)
+            .collect();
         info!(
             "Pull payloads from QuorumStore: building filtered_txns took {} ms",
             start_time.elapsed().as_millis()
@@ -644,7 +668,7 @@ impl BatchProofQueue {
             }
 
             if let Some((batch, item)) = iter.next() {
-                if excluded_batches.contains(batch) {
+                if all_excluded_batches.contains(batch) {
                     excluded_txns += batch.num_txns();
                 } else {
                     // Calculate the number of unique transactions if this batch is included in the result
@@ -653,7 +677,11 @@ impl BatchProofQueue {
                         + txn_summaries
                             .iter()
                             .filter(|txn_summary| {
-                                !filtered_txns.contains(txn_summary)
+                                !(recent_filtered_txns.contains(txn_summary)
+                                    || self
+                                        .txn_duplicates_cache
+                                        .cache
+                                        .contains_key(txn_summary))
                                     && block_timestamp.as_secs()
                                         < txn_summary.expiration_timestamp_secs
                             })
@@ -678,7 +706,7 @@ impl BatchProofQueue {
                     summaries
                         .iter()
                         .filter(|summary| {
-                            filtered_txns.insert(**summary)
+                            recent_filtered_txns.insert(**summary)
                                 && block_timestamp.as_secs() < summary.expiration_timestamp_secs
                         })
@@ -856,6 +884,55 @@ impl BatchProofQueue {
         (remaining_txns_without_duplicates, self.remaining_proofs)
     }
 
+    pub(crate) fn handle_ordered(&mut self, block: PipelinedBlock) {
+        let start_time = Instant::now();
+
+        // gc the transactions outside the window
+        for expired_txn in self
+            .txn_duplicates_cache
+            .expiration_rounds
+            .expire(block.round().saturating_sub(self.txn_duplicates_window))
+        {
+            if let Entry::Occupied(mut entry) = self.txn_duplicates_cache.cache.entry(expired_txn) {
+                let count = entry.get_mut();
+                *count -= 1;
+                if *count == 0 {
+                    entry.remove();
+                }
+            }
+        }
+
+        if let Some(payload) = block.block().payload() {
+            let payload_vec = &vec![(block.round(), payload)];
+            let payload_filter: PayloadFilter = payload_vec.into();
+            if let PayloadFilter::InQuorumStore(batches) = payload_filter {
+                // Look up the txns in the block and update the duplicates cache
+                for (_, batch) in batches {
+                    if let Some(item) = self.items.get(&BatchKey::from_info(&batch)) {
+                        if let Some(txn_summaries) = item.txn_summaries.as_ref() {
+                            for txn_summary in txn_summaries {
+                                let entry = self
+                                    .txn_duplicates_cache
+                                    .cache
+                                    .entry(*txn_summary)
+                                    .or_insert(0);
+                                *entry += 1;
+                                self.txn_duplicates_cache
+                                    .expiration_rounds
+                                    .add_item(*txn_summary, block.round());
+                            }
+                        }
+                    }
+                }
+            }
+        }
+        self.txn_duplicates_cache.latest_block_round = block.round();
+        info!(
+            "handle_ordered from QuorumStore: updating txn_duplicates_cache took {} ms",
+            start_time.elapsed().as_millis()
+        );
+    }
+
     // Mark in the hashmap committed PoS, but keep them until they expire
     pub(crate) fn mark_committed(&mut self, batches: Vec<BatchInfo>) {
         let start = Instant::now();
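Taken together, the BatchProofQueue changes above swap the old per-pull DashSet rebuild for a rolling duplicates cache: each ordered block deposits its transaction summaries keyed by the block round, and entries are evicted once ordering advances more than txn_duplicates_window rounds past them. The sketch below illustrates just that bookkeeping under simplified assumptions; TxnKey, WindowedDuplicateCache, and the plain u64 counts are illustrative stand-ins, not the PR's TxnSummaryWithExpiration/TimeExpirations types.

```rust
use std::collections::{BTreeMap, HashMap};

// Hypothetical stand-ins for the real transaction summary and round types.
type TxnKey = u64;
type Round = u64;

/// Minimal sketch of a round-windowed duplicate counter: ordered blocks add
/// their txn keys at the block round, and keys fall out once the latest round
/// moves more than `window` rounds past the round that added them.
struct WindowedDuplicateCache {
    counts: HashMap<TxnKey, u64>,
    by_round: BTreeMap<Round, Vec<TxnKey>>,
    window: u64,
}

impl WindowedDuplicateCache {
    fn new(window: u64) -> Self {
        Self {
            counts: HashMap::new(),
            by_round: BTreeMap::new(),
            window,
        }
    }

    /// Called when a block is ordered: expire rounds outside the window,
    /// then record this block's transactions.
    fn handle_ordered(&mut self, round: Round, txns: &[TxnKey]) {
        let expire_before = round.saturating_sub(self.window);
        let expired: Vec<Round> = self.by_round.range(..expire_before).map(|(r, _)| *r).collect();
        for r in expired {
            for key in self.by_round.remove(&r).unwrap_or_default() {
                if let Some(count) = self.counts.get_mut(&key) {
                    *count -= 1;
                    if *count == 0 {
                        self.counts.remove(&key);
                    }
                }
            }
        }
        for key in txns {
            *self.counts.entry(*key).or_insert(0) += 1;
            self.by_round.entry(round).or_default().push(*key);
        }
    }

    /// A txn counts as a duplicate if any block still inside the window included it.
    fn is_duplicate(&self, key: &TxnKey) -> bool {
        self.counts.contains_key(key)
    }
}

fn main() {
    let mut cache = WindowedDuplicateCache::new(20);
    cache.handle_ordered(1, &[100, 101]);
    assert!(cache.is_duplicate(&100));
    cache.handle_ordered(30, &[102]); // round 1 now falls outside the window
    assert!(!cache.is_duplicate(&100));
    assert!(cache.is_duplicate(&102));
}
```

The BTreeMap plays the role TimeExpirations plays in the real code: eviction cost is proportional to the number of expired rounds rather than to the size of the cache.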
diff --git a/consensus/src/quorum_store/proof_manager.rs b/consensus/src/quorum_store/proof_manager.rs
index 35484248e2073..86fdd6501ba37 100644
--- a/consensus/src/quorum_store/proof_manager.rs
+++ b/consensus/src/quorum_store/proof_manager.rs
@@ -9,6 +9,7 @@ use crate::{
 use aptos_consensus_types::{
     common::{Payload, PayloadFilter, ProofWithData, TxnSummaryWithExpiration},
     payload::{OptQuorumStorePayload, PayloadExecutionLimit},
+    pipelined_block::PipelinedBlock,
     proof_of_store::{BatchInfo, ProofOfStore, ProofOfStoreMsg},
     request_response::{GetPayloadCommand, GetPayloadResponse},
     utils::PayloadTxnsSize,
@@ -23,6 +24,7 @@ use std::{cmp::min, collections::HashSet, sync::Arc, time::Duration};
 pub enum ProofManagerCommand {
     ReceiveProofs(ProofOfStoreMsg),
     ReceiveBatches(Vec<(BatchInfo, Vec<TxnSummaryWithExpiration>)>),
+    OrderedNotification(PipelinedBlock),
     CommitNotification(u64, Vec<BatchInfo>),
     Shutdown(tokio::sync::oneshot::Sender<()>),
 }
@@ -82,6 +84,16 @@ impl ProofManager {
         self.update_remaining_txns_and_proofs();
     }
 
+    pub(crate) fn handle_ordered_notification(&mut self, block: PipelinedBlock) {
+        info!(
+            "handle_ordered_notification ({}, {}) {}",
+            block.epoch(),
+            block.round(),
+            block.id()
+        );
+        self.batch_proof_queue.handle_ordered(block);
+    }
+
     pub(crate) fn handle_commit_notification(
         &mut self,
         block_timestamp: u64,
@@ -133,7 +145,7 @@ impl ProofManager {
                     &excluded_batches
                         .iter()
                         .cloned()
-                        .chain(proof_block.iter().map(|proof| proof.info().clone()))
+                        .chain(proof_block.iter().map(|proof| (request.block_round, proof.info().clone())))
                         .collect(),
                     &params.exclude_authors,
                     max_opt_batch_txns_size,
@@ -166,8 +178,17 @@ impl ProofManager {
                     &excluded_batches
                         .iter()
                         .cloned()
-                        .chain(proof_block.iter().map(|proof| proof.info().clone()))
-                        .chain(opt_batches.clone())
+                        .chain(
+                            proof_block
+                                .iter()
+                                .map(|proof| (request.block_round, proof.info().clone())),
+                        )
+                        // TODO: better to just return round with the batch info?
+                        .chain(
+                            opt_batches
+                                .iter()
+                                .map(|batch| (request.block_round, batch.clone())),
+                        )
                         .collect(),
                     max_inline_txns_to_pull,
                     request.max_txns_after_filtering,
@@ -199,7 +220,12 @@ impl ProofManager {
                         proof_block.len(),
                         inline_block.len()
                     );
-                    Payload::QuorumStoreInlineHybrid(inline_block, ProofWithData::new(proof_block), None)
+                    Payload::QuorumStoreInlineHybrid(
+                        inline_block,
+                        ProofWithData::new(proof_block),
+                        None,
+                        None,
+                    )
                 };
 
                 let res = GetPayloadResponse::GetPayloadResponse(response);
@@ -275,7 +301,11 @@ impl ProofManager {
                 ProofManagerCommand::ReceiveBatches(batches) => {
                     counters::QUORUM_STORE_MSG_COUNT.with_label_values(&["ProofManager::receive_batches"]).inc();
                     self.receive_batches(batches);
-                }
+                },
+                ProofManagerCommand::OrderedNotification(block) => {
+                    counters::QUORUM_STORE_MSG_COUNT.with_label_values(&["ProofManager::ordered_notification"]).inc();
+                    self.handle_ordered_notification(block);
+                },
                 ProofManagerCommand::CommitNotification(block_timestamp, batches) => {
                     counters::QUORUM_STORE_MSG_COUNT.with_label_values(&["ProofManager::commit_notification"]).inc();
                     self.handle_commit_notification(
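The quorum store coordinator diff that follows wires the same signal through one more channel hop: consensus sends CoordinatorCommand::OrderedNotification, and the coordinator re-tags it as a ProofManagerCommand before forwarding it over the proof manager's channel. Below is a minimal sketch of that forwarding pattern, with a bare u64 round standing in for the real PipelinedBlock and only the one command variant modeled:

```rust
use tokio::sync::mpsc;

// Hypothetical stand-ins for the real command enums.
#[derive(Debug)]
enum CoordinatorCommand {
    OrderedNotification(u64 /* block round */),
}

#[derive(Debug)]
enum ProofManagerCommand {
    OrderedNotification(u64),
}

// The coordinator owns the sender side of the proof manager's channel and
// simply re-tags the command, mirroring the match arm added in the diff below.
async fn run_coordinator(
    mut rx: mpsc::Receiver<CoordinatorCommand>,
    proof_manager_tx: mpsc::Sender<ProofManagerCommand>,
) {
    while let Some(cmd) = rx.recv().await {
        match cmd {
            CoordinatorCommand::OrderedNotification(round) => {
                proof_manager_tx
                    .send(ProofManagerCommand::OrderedNotification(round))
                    .await
                    .expect("Failed to send to ProofManager");
            },
        }
    }
}

#[tokio::main]
async fn main() {
    let (coord_tx, coord_rx) = mpsc::channel(8);
    let (pm_tx, mut pm_rx) = mpsc::channel(8);
    tokio::spawn(run_coordinator(coord_rx, pm_tx));

    coord_tx
        .send(CoordinatorCommand::OrderedNotification(42))
        .await
        .unwrap();
    let forwarded = pm_rx.recv().await.unwrap();
    println!("proof manager received: {:?}", forwarded);
}
```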
diff --git a/consensus/src/quorum_store/quorum_store_coordinator.rs b/consensus/src/quorum_store/quorum_store_coordinator.rs
index baadc5e51d4a9..b70ea19cf5b80 100644
--- a/consensus/src/quorum_store/quorum_store_coordinator.rs
+++ b/consensus/src/quorum_store/quorum_store_coordinator.rs
@@ -10,7 +10,9 @@ use crate::{
     round_manager::VerifiedEvent,
 };
 use aptos_channels::aptos_channel;
-use aptos_consensus_types::{common::Author, proof_of_store::BatchInfo};
+use aptos_consensus_types::{
+    common::Author, pipelined_block::PipelinedBlock, proof_of_store::BatchInfo,
+};
 use aptos_logger::prelude::*;
 use aptos_types::{account_address::AccountAddress, PeerId};
 use futures::StreamExt;
@@ -18,6 +20,7 @@ use tokio::sync::{mpsc, oneshot};
 
 pub enum CoordinatorCommand {
     CommitNotification(u64, Vec<BatchInfo>),
+    OrderedNotification(PipelinedBlock),
     Shutdown(futures_channel::oneshot::Sender<()>),
 }
 
@@ -79,6 +82,12 @@ impl QuorumStoreCoordinator {
                         .await
                         .expect("Failed to send to BatchGenerator");
                 },
+                CoordinatorCommand::OrderedNotification(block) => {
+                    self.proof_manager_cmd_tx
+                        .send(ProofManagerCommand::OrderedNotification(block))
+                        .await
+                        .expect("Failed to send to ProofManager");
+                },
                 CoordinatorCommand::Shutdown(ack_tx) => {
                     counters::QUORUM_STORE_MSG_COUNT
                         .with_label_values(&["QSCoordinator::shutdown"])
diff --git a/consensus/src/quorum_store/tests/batch_proof_queue_test.rs b/consensus/src/quorum_store/tests/batch_proof_queue_test.rs
index 96ab5414ab120..fe2c04c2150c8 100644
--- a/consensus/src/quorum_store/tests/batch_proof_queue_test.rs
+++ b/consensus/src/quorum_store/tests/batch_proof_queue_test.rs
@@ -525,7 +525,7 @@ fn test_proof_pull_proofs_with_duplicates() {
     assert_eq!(pulled_txns.len(), 4);
 
     let result = proof_queue.pull_proofs(
-        &hashset![info_0.clone()],
+        &hashset![(1, info_0.clone())],
         PayloadTxnsSize::new(8, 400),
         4,
         4,
@@ -601,7 +601,7 @@ fn test_proof_pull_proofs_with_duplicates() {
     assert_eq!(result.2, 2);
 
     let result = proof_queue.pull_proofs(
-        &hashset![info_7],
+        &hashset![(1, info_7)],
         PayloadTxnsSize::new(8, 400),
         4,
         4,
diff --git a/consensus/src/quorum_store/tests/direct_mempool_quorum_store_test.rs b/consensus/src/quorum_store/tests/direct_mempool_quorum_store_test.rs
index aa90aa5f03546..9d8c3d3a5e209 100644
--- a/consensus/src/quorum_store/tests/direct_mempool_quorum_store_test.rs
+++ b/consensus/src/quorum_store/tests/direct_mempool_quorum_store_test.rs
@@ -39,6 +39,7 @@ async fn test_block_request_no_txns() {
         filter: PayloadFilter::DirectMempool(vec![]),
         callback: consensus_callback,
         block_timestamp: aptos_infallible::duration_since_epoch(),
+        block_round: 1,
         maybe_optqs_payload_pull_params: None,
     }))
     .unwrap();
diff --git a/consensus/src/quorum_store/tests/proof_manager_test.rs b/consensus/src/quorum_store/tests/proof_manager_test.rs
index 3eebe4c667937..5207dc379d1da 100644
--- a/consensus/src/quorum_store/tests/proof_manager_test.rs
+++ b/consensus/src/quorum_store/tests/proof_manager_test.rs
@@ -5,7 +5,7 @@ use crate::quorum_store::{
     proof_manager::ProofManager, tests::batch_store_test::batch_store_for_test,
 };
 use aptos_consensus_types::{
-    common::{Payload, PayloadFilter},
+    common::{Payload, PayloadFilter, Round},
     proof_of_store::{BatchId, BatchInfo, ProofOfStore},
     request_response::{GetPayloadCommand, GetPayloadRequest, GetPayloadResponse},
     utils::PayloadTxnsSize,
@@ -50,7 +50,7 @@ fn create_proof_with_gas(
 async fn get_proposal(
     proof_manager: &mut ProofManager,
     max_txns: u64,
-    filter: &[BatchInfo],
+    filter: &[(Round, BatchInfo)],
 ) -> Payload {
     let (callback_tx, callback_rx) = oneshot::channel();
     let filter_set = HashSet::from_iter(filter.iter().cloned());
@@ -59,10 +59,11 @@ async fn get_proposal(
         max_txns_after_filtering: max_txns,
         soft_max_txns_after_filtering: max_txns,
         max_inline_txns: PayloadTxnsSize::new(max(max_txns / 2, 1), 100000),
+        return_non_full: true,
         filter: PayloadFilter::InQuorumStore(filter_set),
         callback: callback_tx,
         block_timestamp: aptos_infallible::duration_since_epoch(),
-        return_non_full: true,
+        block_round: 1,
         maybe_optqs_payload_pull_params: None,
     });
     proof_manager.handle_proposal_request(req);
@@ -89,7 +90,8 @@
         }
         assert_eq!(proofs.max_txns_to_execute,
max_txns_from_block_to_execute); }, - Payload::QuorumStoreInlineHybrid(_inline_batches, proofs, max_txns_to_execute) => { + // TODO: add block_gas_limit check? + Payload::QuorumStoreInlineHybrid(_inline_batches, proofs, max_txns_to_execute, _) => { assert_eq!(proofs.proofs.len(), expected.len()); for proof in proofs.proofs { assert!(expected.contains(&proof)); @@ -104,7 +106,7 @@ fn assert_payload_response( async fn get_proposal_and_assert( proof_manager: &mut ProofManager, max_txns: u64, - filter: &[BatchInfo], + filter: &[(Round, BatchInfo)], expected: &[ProofOfStore], ) { assert_payload_response( @@ -135,7 +137,7 @@ async fn test_max_txns_from_block_to_execute() { // convert payload to v2 format and assert let max_txns_from_block_to_execute = 10; assert_payload_response( - payload.transform_to_quorum_store_v2(Some(max_txns_from_block_to_execute)), + payload.transform_to_quorum_store_v2(Some(max_txns_from_block_to_execute), None), &vec![proof], Some(max_txns_from_block_to_execute), ); @@ -192,7 +194,7 @@ async fn test_proposal_priority() { get_proposal_and_assert( &mut proof_manager, 1, - &[peer0_proof0.info().clone()], + &[(1, peer0_proof0.info().clone())], &expected, ) .await; @@ -228,13 +230,13 @@ async fn test_proposal_fairness() { // The next two proofs are taken from the remaining peer let filter = vec![peer0_proofs[0].clone(), peer1_proof_0.clone()]; - let filter: Vec<_> = filter.iter().map(ProofOfStore::info).cloned().collect(); + let filter: Vec<_> = filter.iter().map(|p| (1, p.info().clone())).collect(); get_proposal_and_assert(&mut proof_manager, 2, &filter, &peer0_proofs[1..3]).await; // The last proof is also taken from the remaining peer let mut filter = peer0_proofs[0..3].to_vec(); filter.push(peer1_proof_0.clone()); - let filter: Vec<_> = filter.iter().map(ProofOfStore::info).cloned().collect(); + let filter: Vec<_> = filter.iter().map(|p| (1, p.info().clone())).collect(); get_proposal_and_assert(&mut proof_manager, 2, &filter, &peer0_proofs[3..4]).await; } diff --git a/consensus/src/rand/rand_gen/test_utils.rs b/consensus/src/rand/rand_gen/test_utils.rs index f24c310c697a9..edfe183ab3468 100644 --- a/consensus/src/rand/rand_gen/test_utils.rs +++ b/consensus/src/rand/rand_gen/test_utils.rs @@ -36,7 +36,6 @@ pub fn create_ordered_blocks(rounds: Vec) -> OrderedBlocks { ), None, ), - vec![], StateComputeResult::new_dummy(), ) }) diff --git a/consensus/src/round_manager_fuzzing.rs b/consensus/src/round_manager_fuzzing.rs index ea9492bcdfa45..1ccb2ee9b7bff 100644 --- a/consensus/src/round_manager_fuzzing.rs +++ b/consensus/src/round_manager_fuzzing.rs @@ -178,6 +178,7 @@ fn create_node_for_fuzzing() -> RoundManager { PayloadTxnsSize::new(1, 1024), 10, 1, + Some(30_000), PipelineBackpressureConfig::new_no_backoff(), ChainHealthBackoffConfig::new_no_backoff(), false, diff --git a/consensus/src/round_manager_test.rs b/consensus/src/round_manager_test.rs index 88909cbcfb34f..ea47512f2879a 100644 --- a/consensus/src/round_manager_test.rs +++ b/consensus/src/round_manager_test.rs @@ -301,6 +301,7 @@ impl NodeSetup { PayloadTxnsSize::new(5, 500), 10, 1, + Some(30_000), PipelineBackpressureConfig::new_no_backoff(), ChainHealthBackoffConfig::new_no_backoff(), false, diff --git a/consensus/src/state_computer.rs b/consensus/src/state_computer.rs index 13cb4b3767883..888ceaa4c0fc5 100644 --- a/consensus/src/state_computer.rs +++ b/consensus/src/state_computer.rs @@ -82,6 +82,7 @@ pub struct ExecutionProxy { transaction_filter: Arc, execution_pipeline: ExecutionPipeline, state: RwLock>, + 
max_block_txns: u64, } impl ExecutionProxy { @@ -92,6 +93,7 @@ impl ExecutionProxy { handle: &tokio::runtime::Handle, txn_filter: TransactionFilter, enable_pre_commit: bool, + max_block_txns: u64, ) -> Self { let pre_commit_notifier = Self::spawn_future_runner( handle, @@ -113,6 +115,7 @@ impl ExecutionProxy { transaction_filter: Arc::new(txn_filter), execution_pipeline, state: RwLock::new(None), + max_block_txns, } } @@ -181,6 +184,9 @@ impl StateComputer for ExecutionProxy { lifetime_guard: CountedRequest<()>, ) -> StateComputeResultFut { block.init_committed_transactions(); + if let Some(s) = self.state.write().as_ref() { + s.payload_manager.notify_ordered(block.clone()); + } let block_id = block.id(); debug!( @@ -208,6 +214,7 @@ impl StateComputer for ExecutionProxy { self.transaction_filter.clone(), transaction_deduper.clone(), transaction_shuffler.clone(), + self.max_block_txns, ); let block_executor_onchain_config = block_executor_onchain_config.clone(); @@ -227,6 +234,7 @@ impl StateComputer for ExecutionProxy { .queue( block.clone(), block_window.clone(), + self.max_block_txns, metadata.clone(), parent_block_id, transaction_generator, @@ -615,6 +623,7 @@ async fn test_commit_sync_race() { &tokio::runtime::Handle::current(), TransactionFilter::new(Filter::empty()), true, + 3000, ); executor.new_epoch( diff --git a/consensus/src/state_computer_tests.rs b/consensus/src/state_computer_tests.rs index 2ead4e9cfbbb7..6191418742204 100644 --- a/consensus/src/state_computer_tests.rs +++ b/consensus/src/state_computer_tests.rs @@ -176,6 +176,7 @@ async fn should_see_and_notify_validator_txns() { &Handle::current(), TransactionFilter::new(Filter::empty()), true, + 3000, ); let validator_txn_0 = ValidatorTransaction::dummy(vec![0xFF; 99]); diff --git a/consensus/src/test_utils/mock_execution_client.rs b/consensus/src/test_utils/mock_execution_client.rs index a1434e58e11a4..92b9abbf1f7fb 100644 --- a/consensus/src/test_utils/mock_execution_client.rs +++ b/consensus/src/test_utils/mock_execution_client.rs @@ -74,7 +74,7 @@ impl MockExecutionClient { .lock() .remove(&block.id()) .ok_or_else(|| format_err!("Cannot find block"))?; - let (mut payload_txns, _max_txns_from_block_to_execute) = + let (mut payload_txns, _max_txns_from_block_to_execute, _block_gas_limit) = self.payload_manager.get_transactions(block.block()).await?; txns.append(&mut payload_txns); } diff --git a/consensus/src/util/db_tool.rs b/consensus/src/util/db_tool.rs index 0d86d9ac60768..b26372b44eac0 100644 --- a/consensus/src/util/db_tool.rs +++ b/consensus/src/util/db_tool.rs @@ -101,7 +101,7 @@ pub fn extract_txns_from_block<'a>( .map(|proof| *proof.digest()), all_batches, ), - Payload::QuorumStoreInlineHybrid(inline_batches, proof_with_data, _) => { + Payload::QuorumStoreInlineHybrid(inline_batches, proof_with_data, _, _) => { let mut all_txns = extract_txns_from_quorum_store( proof_with_data.proofs.iter().map(|proof| *proof.digest()), all_batches, diff --git a/execution/executor-types/src/execution_output.rs b/execution/executor-types/src/execution_output.rs index b9e2ac4806144..6ea7222d73977 100644 --- a/execution/executor-types/src/execution_output.rs +++ b/execution/executor-types/src/execution_output.rs @@ -130,6 +130,10 @@ impl ExecutionOutput { pub fn expect_last_version(&self) -> Version { self.first_version + self.num_transactions_to_commit() as Version - 1 } + + pub fn block_end_info(&self) -> Option<&BlockEndInfo> { + self.block_end_info.as_ref() + } } #[derive(Debug)] diff --git 
a/testsuite/fixtures/testFormatJunitXml.fixture b/testsuite/fixtures/testFormatJunitXml.fixture new file mode 100644 index 0000000000000..8548bf70a0ca7 --- /dev/null +++ b/testsuite/fixtures/testFormatJunitXml.fixture @@ -0,0 +1,3 @@ + +blah + \ No newline at end of file diff --git a/testsuite/fixtures/testMain.fixture b/testsuite/fixtures/testMain.fixture index c0107a095369b..aeb54372076e1 100644 --- a/testsuite/fixtures/testMain.fixture +++ b/testsuite/fixtures/testMain.fixture @@ -8,6 +8,7 @@ Using the following image tags: Checking if image exists in GCP: aptos/validator-testing:banana Checking if image exists in GCP: aptos/validator-testing:banana Checking if image exists in GCP: aptos/forge:banana +forge_args: ['forge', '--suite', 'banana-test', '--duration-secs', '300', 'test', 'k8s-swarm', '--image-tag', 'banana', '--upgrade-image-tag', 'banana', '--namespace', 'forge-perry-1659078000'] === Start temp-pre-comment === ### Forge is running suite `banana-test` on `banana` * [Grafana dashboard (auto-refresh)](https://aptoslabs.grafana.net/d/overview/overview?orgId=1&refresh=10s&var-Datasource=VictoriaMetrics%20Global%20%28Non-mainnet%29&var-BigQuery=Google%20BigQuery&var-namespace=forge-perry-1659078000&var-metrics_source=All&var-chain_name=forge-big-1&refresh=10s&from=now-15m&to=now) @@ -19,6 +20,7 @@ Checking if image exists in GCP: aptos/forge:banana * Test run is land-blocking === End temp-pre-comment === Deleting forge pod for namespace forge-perry-1659078000 +rendered_forge_test_runner: Deleting forge pod for namespace forge-perry-1659078000 === Start temp-report === Forge test runner terminated: diff --git a/testsuite/forge-cli/Cargo.toml b/testsuite/forge-cli/Cargo.toml index 4006dbc32c6f7..e6b3000ee9b30 100644 --- a/testsuite/forge-cli/Cargo.toml +++ b/testsuite/forge-cli/Cargo.toml @@ -32,6 +32,7 @@ random_word = { workspace = true } reqwest = { workspace = true } serde_json = { workspace = true } serde_yaml = { workspace = true } +sugars = { workspace = true } tokio = { workspace = true } url = { workspace = true } diff --git a/testsuite/forge-cli/src/main.rs b/testsuite/forge-cli/src/main.rs index 269247405b162..b90cf2fa40372 100644 --- a/testsuite/forge-cli/src/main.rs +++ b/testsuite/forge-cli/src/main.rs @@ -5,13 +5,14 @@ #![allow(clippy::field_reassign_with_default)] use anyhow::{bail, format_err, Context, Result}; -use aptos_forge::{ForgeConfig, Options, *}; +use aptos_forge::{config::ForgeConfig, Options, *}; use aptos_logger::Level; use clap::{Parser, Subcommand}; use futures::{future, FutureExt}; use rand::{rngs::ThreadRng, seq::SliceRandom, Rng}; use serde_json::{json, Value}; use std::{self, env, num::NonZeroUsize, process, time::Duration}; +use sugars::{boxed, hmap}; use suites::{ dag::get_dag_test, indexer::get_indexer_test, @@ -277,13 +278,13 @@ fn main() -> Result<()> { mempool_backlog: 5000, })); let swarm_dir = local_cfg.swarmdir.clone(); - run_forge( - duration, + let forge = Forge::new( + &args.options, test_suite, + duration, LocalFactory::from_workspace(swarm_dir)?, - &args.options, - args.changelog.clone(), - ) + ); + run_forge_with_changelog(forge, &args.options, args.changelog.clone()) }, TestCommand::K8sSwarm(k8s) => { if let Some(move_modules_dir) = &k8s.move_modules_dir { @@ -308,9 +309,10 @@ fn main() -> Result<()> { }; let forge_runner_mode = ForgeRunnerMode::try_from_env().unwrap_or(ForgeRunnerMode::K8s); - run_forge( - duration, + let forge = Forge::new( + &args.options, test_suite, + duration, K8sFactory::new( namespace, k8s.image_tag.clone(), 
@@ -322,12 +324,9 @@ fn main() -> Result<()> { k8s.enable_haproxy, k8s.enable_indexer, k8s.deployer_profile.clone(), - ) - .unwrap(), - &args.options, - args.changelog, - )?; - Ok(()) + )?, + ); + run_forge_with_changelog(forge, &args.options, args.changelog) }, } }, @@ -413,39 +412,33 @@ fn main() -> Result<()> { } } -pub fn run_forge( - global_duration: Duration, - tests: ForgeConfig, - factory: F, +pub fn run_forge_with_changelog( + forge: Forge, options: &Options, - logs: Option>, + optional_changelog: Option>, ) -> Result<()> { - let forge = Forge::new(options, tests, global_duration, factory); - if options.list { forge.list()?; return Ok(()); } - match forge.run() { - Ok(report) => { - if let Some(mut changelog) = logs { - if changelog.len() != 2 { - println!("Use: changelog "); - process::exit(1); - } - let to_commit = changelog.remove(1); - let from_commit = Some(changelog.remove(0)); - send_changelog_message(&report.to_string(), &from_commit, &to_commit); - } - Ok(()) - }, - Err(e) => { - eprintln!("Failed to run tests:\n{}", e); - Err(e) - }, + let forge_result = forge.run(); + let report = forge_result.map_err(|e| { + eprintln!("Failed to run tests:\n{}", e); + anyhow::anyhow!(e) + })?; + + if let Some(changelog) = optional_changelog { + if changelog.len() != 2 { + println!("Use: changelog "); + process::exit(1); + } + let to_commit = changelog[1].clone(); + let from_commit = Some(changelog[0].clone()); + send_changelog_message(&report.to_string(), &from_commit, &to_commit); } + Ok(()) } pub fn send_changelog_message(perf_msg: &str, from_commit: &Option, to_commit: &str) { @@ -503,39 +496,42 @@ fn get_test_suite( duration: Duration, test_cmd: &TestCommand, ) -> Result { - // Check the test name against the multi-test suites - match test_name { - "local_test_suite" => return Ok(local_test_suite()), - "pre_release" => return Ok(pre_release_suite()), - "run_forever" => return Ok(run_forever()), - // TODO(rustielin): verify each test suite - "k8s_suite" => return Ok(k8s_test_suite()), - "chaos" => return Ok(chaos_test_suite(duration)), - _ => {}, // No multi-test suite matches! + // These are high level suite aliases that express an intent + let suite_aliases = hmap! 
{ + "local_test_suite" => boxed!(local_test_suite) as Box ForgeConfig>, + "pre_release" => boxed!(pre_release_suite), + "run_forever" => boxed!(run_forever), + "k8s_suite" => boxed!(k8s_test_suite), + "chaos" => boxed!(|| chaos_test_suite(duration)), }; + if let Some(test_suite) = suite_aliases.get(test_name) { + return Ok(test_suite()); + } + // Otherwise, check the test name against the grouped test suites - if let Some(test_suite) = get_land_blocking_test(test_name, duration, test_cmd) { - Ok(test_suite) - } else if let Some(test_suite) = get_multi_region_test(test_name) { - return Ok(test_suite); - } else if let Some(test_suite) = get_netbench_test(test_name) { - return Ok(test_suite); - } else if let Some(test_suite) = get_pfn_test(test_name, duration) { - return Ok(test_suite); - } else if let Some(test_suite) = get_realistic_env_test(test_name, duration, test_cmd) { - return Ok(test_suite); - } else if let Some(test_suite) = get_state_sync_test(test_name) { - return Ok(test_suite); - } else if let Some(test_suite) = get_dag_test(test_name, duration, test_cmd) { - return Ok(test_suite); - } else if let Some(test_suite) = get_indexer_test(test_name) { - return Ok(test_suite); - } else if let Some(test_suite) = get_ungrouped_test(test_name) { - return Ok(test_suite); - } else { - bail!(format_err!("Invalid --suite given: {:?}", test_name)) + // This is done in order of priority + // A match higher up in the list will take precedence + let named_test_suites = [ + boxed!(|| get_land_blocking_test(test_name, duration, test_cmd)) + as Box Option>, + boxed!(|| get_multi_region_test(test_name)), + boxed!(|| get_netbench_test(test_name)), + boxed!(|| get_pfn_test(test_name, duration)), + boxed!(|| get_realistic_env_test(test_name, duration, test_cmd)), + boxed!(|| get_state_sync_test(test_name)), + boxed!(|| get_dag_test(test_name, duration, test_cmd)), + boxed!(|| get_indexer_test(test_name)), + boxed!(|| get_ungrouped_test(test_name)), + ]; + + for named_suite in named_test_suites.iter() { + if let Some(suite) = named_suite() { + return Ok(suite); + } } + + bail!(format_err!("Invalid --suite given: {:?}", test_name)) } #[cfg(test)] mod test { diff --git a/testsuite/forge-cli/src/suites/realistic_environment.rs b/testsuite/forge-cli/src/suites/realistic_environment.rs index 2783517edbf72..3a57173d1056a 100644 --- a/testsuite/forge-cli/src/suites/realistic_environment.rs +++ b/testsuite/forge-cli/src/suites/realistic_environment.rs @@ -178,11 +178,7 @@ pub(crate) fn realistic_env_fairness_workload_sweep() -> ForgeConfig { .with_transactions_per_account(1), ]), criteria: Vec::new(), - background_traffic: background_traffic_for_sweep_with_latency(&[ - (3.0, 8.0), - (3.0, 8.0), - (3.0, 4.0), - ]), + background_traffic: None, }) } diff --git a/testsuite/forge-test-runner-template.yaml b/testsuite/forge-test-runner-template.yaml index 60876c498df8c..1d856ca269c04 100644 --- a/testsuite/forge-test-runner-template.yaml +++ b/testsuite/forge-test-runner-template.yaml @@ -38,6 +38,8 @@ spec: value: {FORGE_USERNAME} - name: FORGE_RETAIN_DEBUG_LOGS value: "{FORGE_RETAIN_DEBUG_LOGS}" + - name: FORGE_JUNIT_XML_PATH + value: "{FORGE_JUNIT_XML_PATH}" - name: PROMETHEUS_URL valueFrom: secretKeyRef: diff --git a/testsuite/forge.py b/testsuite/forge.py index 772597c976009..242d7e1233d4c 100644 --- a/testsuite/forge.py +++ b/testsuite/forge.py @@ -266,6 +266,7 @@ class ForgeContext: forge_username: str forge_blocking: bool forge_retain_debug_logs: str + forge_junit_xml_path: Optional[str] github_actions: str 
github_job_url: Optional[str] @@ -688,6 +689,33 @@ def format_comment(context: ForgeContext, result: ForgeResult) -> str: ) +BEGIN_JUNIT = "=== BEGIN JUNIT ===" +END_JUNIT = "=== END JUNIT ===" + + +def format_junit_xml(_context: ForgeContext, result: ForgeResult) -> str: + forge_output = result.output + start_index = forge_output.find(BEGIN_JUNIT) + if start_index == -1: + raise Exception( + "=== BEGIN JUNIT === not found in forge output, unable to write junit xml" + ) + + start_index += len(BEGIN_JUNIT) + if start_index > len(forge_output): + raise Exception( + "=== BEGIN JUNIT === found at end of forge output, unable to write junit xml" + ) + + end_index = forge_output.find(END_JUNIT) + if end_index == -1: + raise Exception( + "=== END JUNIT === not found in forge output, unable to write junit xml" + ) + + return forge_output[start_index:end_index].strip().lstrip() + + class ForgeRunner: def run(self, context: ForgeContext) -> ForgeResult: raise NotImplementedError @@ -840,6 +868,7 @@ def run(self, context: ForgeContext) -> ForgeResult: FORGE_TEST_SUITE=sanitize_k8s_resource_name(context.forge_test_suite), FORGE_USERNAME=sanitize_k8s_resource_name(context.forge_username), FORGE_RETAIN_DEBUG_LOGS=context.forge_retain_debug_logs, + FORGE_JUNIT_XML_PATH=context.forge_junit_xml_path, VALIDATOR_NODE_SELECTOR=validator_node_selector, KUBECONFIG=MULTIREGION_KUBECONFIG_PATH, MULTIREGION_KUBECONFIG_DIR=MULTIREGION_KUBECONFIG_DIR, @@ -1340,10 +1369,11 @@ def seeded_random_choice(namespace: str, cluster_names: Sequence[str]) -> str: @envoption("FORGE_DEPLOYER_PROFILE") @envoption("FORGE_ENABLE_FAILPOINTS") @envoption("FORGE_ENABLE_PERFORMANCE") -@envoption("FORGE_TEST_SUITE") @envoption("FORGE_RUNNER_DURATION_SECS", "300") @envoption("FORGE_IMAGE_TAG") @envoption("FORGE_RETAIN_DEBUG_LOGS", "false") +@envoption("FORGE_JUNIT_XML_PATH") +@envoption("FORGE_TEST_SUITE") @envoption("IMAGE_TAG") @envoption("UPGRADE_IMAGE_TAG") @envoption("FORGE_NAMESPACE") @@ -1389,6 +1419,7 @@ def test( forge_runner_duration_secs: str, forge_image_tag: Optional[str], forge_retain_debug_logs: str, + forge_junit_xml_path: Optional[str], image_tag: Optional[str], upgrade_image_tag: Optional[str], forge_namespace: Optional[str], @@ -1639,6 +1670,7 @@ def test( forge_test_suite=forge_test_suite, forge_username=forge_username, forge_retain_debug_logs=forge_retain_debug_logs, + forge_junit_xml_path=forge_junit_xml_path, forge_blocking=forge_blocking == "true", github_actions=github_actions, github_job_url=( @@ -1683,6 +1715,9 @@ def test( log.info(format_comment(forge_context, result)) if github_step_summary: outputs.append(ForgeFormatter(github_step_summary, format_comment)) + if forge_junit_xml_path: + outputs.append(ForgeFormatter(forge_junit_xml_path, format_junit_xml)) + forge_context.report(result, outputs) log.info(result.format(forge_context)) diff --git a/testsuite/forge/Cargo.toml b/testsuite/forge/Cargo.toml index 9b877474df562..2755feb131130 100644 --- a/testsuite/forge/Cargo.toml +++ b/testsuite/forge/Cargo.toml @@ -50,17 +50,20 @@ kube = { version = "0.65.0", default-features = false, features = ["jsonpatch", num_cpus = { workspace = true } once_cell = { workspace = true } prometheus-http-query = { workspace = true } +quick-junit = { workspace = true } rand = { workspace = true } regex = { workspace = true } reqwest = { workspace = true } serde = { workspace = true } serde_json = { workspace = true } serde_yaml = { workspace = true } +sugars = { workspace = true } tempfile = { workspace = true } termcolor = { 
workspace = true }
 thiserror = { workspace = true }
 tokio = { workspace = true }
 url = { workspace = true }
+uuid = { workspace = true }
 
 [dev-dependencies]
 serde_merge = { workspace = true }
diff --git a/testsuite/forge/src/config.rs b/testsuite/forge/src/config.rs
new file mode 100644
index 0000000000000..940589e7fb3b1
--- /dev/null
+++ b/testsuite/forge/src/config.rs
@@ -0,0 +1,342 @@
+// Copyright © Aptos Foundation
+// Parts of the project are originally copyright © Meta Platforms, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+use crate::{
+    success_criteria::{MetricsThreshold, SuccessCriteria, SystemMetricsThreshold},
+    *,
+};
+use aptos_config::config::{NodeConfig, OverrideNodeConfig};
+use aptos_framework::ReleaseBundle;
+use std::{num::NonZeroUsize, sync::Arc};
+
+pub struct ForgeConfig {
+    suite_name: Option<String>,
+
+    pub aptos_tests: Vec<Box<dyn AptosTest>>,
+    pub admin_tests: Vec<Box<dyn AdminTest>>,
+    pub network_tests: Vec<Box<dyn NetworkTest>>,
+
+    /// The initial number of validators to spawn when the test harness creates a swarm
+    pub initial_validator_count: NonZeroUsize,
+
+    /// The initial number of fullnodes to spawn when the test harness creates a swarm
+    pub initial_fullnode_count: usize,
+
+    /// The initial version to use when the test harness creates a swarm
+    pub initial_version: InitialVersion,
+
+    /// The initial genesis modules to use when starting a network
+    pub genesis_config: Option<GenesisConfig>,
+
+    /// Optional genesis helm values init function
+    pub genesis_helm_config_fn: Option<GenesisConfigFn>,
+
+    /// Optional validator node config override function
+    pub validator_override_node_config_fn: Option<OverrideNodeConfigFn>,
+
+    /// Optional fullnode node config override function
+    pub fullnode_override_node_config_fn: Option<OverrideNodeConfigFn>,
+
+    pub multi_region_config: bool,
+
+    /// Transaction workload to run on the swarm
+    pub emit_job_request: EmitJobRequest,
+
+    /// Success criteria
+    pub success_criteria: SuccessCriteria,
+
+    /// The label of existing DBs to use, if None, will create new db.
+    pub existing_db_tag: Option<String>,
+
+    pub validator_resource_override: NodeResourceOverride,
+
+    pub fullnode_resource_override: NodeResourceOverride,
+
+    /// Retain debug logs and above for all nodes instead of just the first 5 nodes
+    pub retain_debug_logs: bool,
+}
+
+impl ForgeConfig {
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    pub fn add_aptos_test<T: AptosTest + 'static>(mut self, aptos_test: T) -> Self {
+        self.aptos_tests.push(Box::new(aptos_test));
+        self
+    }
+
+    pub fn get_suite_name(&self) -> Option<String> {
+        self.suite_name.clone()
+    }
+
+    pub fn with_suite_name(mut self, suite_name: String) -> Self {
+        self.suite_name = Some(suite_name);
+        self
+    }
+
+    pub fn with_aptos_tests(mut self, aptos_tests: Vec<Box<dyn AptosTest>>) -> Self {
+        self.aptos_tests = aptos_tests;
+        self
+    }
+
+    pub fn add_admin_test<T: AdminTest + 'static>(mut self, admin_test: T) -> Self {
+        self.admin_tests.push(Box::new(admin_test));
+        self
+    }
+
+    pub fn with_admin_tests(mut self, admin_tests: Vec<Box<dyn AdminTest>>) -> Self {
+        self.admin_tests = admin_tests;
+        self
+    }
+
+    pub fn add_network_test<T: NetworkTest + 'static>(mut self, network_test: T) -> Self {
+        self.network_tests.push(Box::new(network_test));
+        self
+    }
+
+    pub fn with_network_tests(mut self, network_tests: Vec<Box<dyn NetworkTest>>) -> Self {
+        self.network_tests = network_tests;
+        self
+    }
+
+    pub fn with_initial_validator_count(mut self, initial_validator_count: NonZeroUsize) -> Self {
+        self.initial_validator_count = initial_validator_count;
+        self
+    }
+
+    pub fn with_initial_fullnode_count(mut self, initial_fullnode_count: usize) -> Self {
+        self.initial_fullnode_count = initial_fullnode_count;
+        self
+    }
+
+    pub fn with_genesis_helm_config_fn(mut self, genesis_helm_config_fn: GenesisConfigFn) -> Self {
+        self.genesis_helm_config_fn = Some(genesis_helm_config_fn);
+        self
+    }
+
+    pub fn with_validator_override_node_config_fn(mut self, f: OverrideNodeConfigFn) -> Self {
+        self.validator_override_node_config_fn = Some(f);
+        self
+    }
+
+    pub fn with_fullnode_override_node_config_fn(mut self, f: OverrideNodeConfigFn) -> Self {
+        self.fullnode_override_node_config_fn = Some(f);
+        self
+    }
+
+    pub fn with_multi_region_config(mut self) -> Self {
+        self.multi_region_config = true;
+        self
+    }
+
+    pub fn with_validator_resource_override(
+        mut self,
+        resource_override: NodeResourceOverride,
+    ) -> Self {
+        self.validator_resource_override = resource_override;
+        self
+    }
+
+    pub fn with_fullnode_resource_override(
+        mut self,
+        resource_override: NodeResourceOverride,
+    ) -> Self {
+        self.fullnode_resource_override = resource_override;
+        self
+    }
+
+    fn override_node_config_from_fn(config_fn: OverrideNodeConfigFn) -> OverrideNodeConfig {
+        let mut override_config = NodeConfig::default();
+        let mut base_config = NodeConfig::default();
+        config_fn(&mut override_config, &mut base_config);
+        OverrideNodeConfig::new(override_config, base_config)
+    }
+
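For context, the builder methods above are meant to be chained when a suite is defined. A hypothetical sketch of such a definition, assuming the aptos_testcases crate's PerformanceBenchmark as the NetworkTest implementation (any other implementor would work the same way; the function name is also illustrative):

```rust
use std::num::NonZeroUsize;

use aptos_forge::config::ForgeConfig;
// Assumed import purely for illustration; any type implementing NetworkTest would do.
use aptos_testcases::performance_test::PerformanceBenchmark;

// A small sketch of how a suite composes the builder API declared above.
fn example_suite() -> ForgeConfig {
    ForgeConfig::default()
        .with_suite_name("example".to_string())
        .with_initial_validator_count(NonZeroUsize::new(4).unwrap())
        .with_initial_fullnode_count(2)
        .add_network_test(PerformanceBenchmark)
}
```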
+    /// Builds a function that can be used to override the default helm values for the validator and fullnode.
+    /// If a configuration is intended to be set for all nodes, set the value in the default helm values file:
+    /// testsuite/forge/src/backend/k8s/helm-values/aptos-node-default-values.yaml
+    pub fn build_node_helm_config_fn(&self, retain_debug_logs: bool) -> Option<NodeConfigFn> {
+        let validator_override_node_config = self
+            .validator_override_node_config_fn
+            .clone()
+            .map(|config_fn| Self::override_node_config_from_fn(config_fn));
+        let fullnode_override_node_config = self
+            .fullnode_override_node_config_fn
+            .clone()
+            .map(|config_fn| Self::override_node_config_from_fn(config_fn));
+        let multi_region_config = self.multi_region_config;
+        let existing_db_tag = self.existing_db_tag.clone();
+        let validator_resource_override = self.validator_resource_override;
+        let fullnode_resource_override = self.fullnode_resource_override;
+
+        // Override specific helm values. See reference: terraform/helm/aptos-node/values.yaml
+        Some(Arc::new(move |helm_values: &mut serde_yaml::Value| {
+            if let Some(override_config) = &validator_override_node_config {
+                helm_values["validator"]["config"] = override_config.get_yaml().unwrap();
+            }
+            if let Some(override_config) = &fullnode_override_node_config {
+                helm_values["fullnode"]["config"] = override_config.get_yaml().unwrap();
+            }
+            if multi_region_config {
+                helm_values["multicluster"]["enabled"] = true.into();
+                // Create headless services for validators and fullnodes.
+                // Note: chaos-mesh will not work with clusterIP services.
+                helm_values["service"]["validator"]["internal"]["type"] = "ClusterIP".into();
+                helm_values["service"]["validator"]["internal"]["headless"] = true.into();
+                helm_values["service"]["fullnode"]["internal"]["type"] = "ClusterIP".into();
+                helm_values["service"]["fullnode"]["internal"]["headless"] = true.into();
+            }
+            if let Some(existing_db_tag) = &existing_db_tag {
+                helm_values["validator"]["storage"]["labels"]["tag"] =
+                    existing_db_tag.clone().into();
+                helm_values["fullnode"]["storage"]["labels"]["tag"] =
+                    existing_db_tag.clone().into();
+            }
+
+            // validator resource overrides
+            if let Some(cpu_cores) = validator_resource_override.cpu_cores {
+                helm_values["validator"]["resources"]["requests"]["cpu"] = cpu_cores.into();
+                helm_values["validator"]["resources"]["limits"]["cpu"] = cpu_cores.into();
+            }
+            if let Some(memory_gib) = validator_resource_override.memory_gib {
+                helm_values["validator"]["resources"]["requests"]["memory"] =
+                    format!("{}Gi", memory_gib).into();
+                helm_values["validator"]["resources"]["limits"]["memory"] =
+                    format!("{}Gi", memory_gib).into();
+            }
+            if let Some(storage_gib) = validator_resource_override.storage_gib {
+                helm_values["validator"]["storage"]["size"] = format!("{}Gi", storage_gib).into();
+            }
+            // fullnode resource overrides
+            if let Some(cpu_cores) = fullnode_resource_override.cpu_cores {
+                helm_values["fullnode"]["resources"]["requests"]["cpu"] = cpu_cores.into();
+                helm_values["fullnode"]["resources"]["limits"]["cpu"] = cpu_cores.into();
+            }
+            if let Some(memory_gib) = fullnode_resource_override.memory_gib {
+                helm_values["fullnode"]["resources"]["requests"]["memory"] =
+                    format!("{}Gi", memory_gib).into();
+                helm_values["fullnode"]["resources"]["limits"]["memory"] =
+                    format!("{}Gi", memory_gib).into();
+            }
+            if let Some(storage_gib) = fullnode_resource_override.storage_gib {
+                helm_values["fullnode"]["storage"]["size"] = format!("{}Gi", storage_gib).into();
+            }
+
+            if retain_debug_logs {
+                helm_values["validator"]["podAnnotations"]["aptos.dev/min-log-level-to-retain"] =
serde_yaml::Value::String("debug".to_owned()); + helm_values["fullnode"]["podAnnotations"]["aptos.dev/min-log-level-to-retain"] = + serde_yaml::Value::String("debug".to_owned()); + helm_values["validator"]["rust_log"] = "debug,hyper=off".into(); + helm_values["fullnode"]["rust_log"] = "debug,hyper=off".into(); + } + helm_values["validator"]["config"]["storage"]["rocksdb_configs"] + ["enable_storage_sharding"] = true.into(); + helm_values["fullnode"]["config"]["storage"]["rocksdb_configs"] + ["enable_storage_sharding"] = true.into(); + helm_values["validator"]["config"]["indexer_db_config"]["enable_event"] = true.into(); + helm_values["fullnode"]["config"]["indexer_db_config"]["enable_event"] = true.into(); + })) + } + + pub fn with_initial_version(mut self, initial_version: InitialVersion) -> Self { + self.initial_version = initial_version; + self + } + + pub fn with_genesis_module_bundle(mut self, bundle: ReleaseBundle) -> Self { + self.genesis_config = Some(GenesisConfig::Bundle(bundle)); + self + } + + pub fn with_genesis_modules_path(mut self, genesis_modules: String) -> Self { + self.genesis_config = Some(GenesisConfig::Path(genesis_modules)); + self + } + + pub fn with_emit_job(mut self, emit_job_request: EmitJobRequest) -> Self { + self.emit_job_request = emit_job_request; + self + } + + pub fn get_emit_job(&self) -> &EmitJobRequest { + &self.emit_job_request + } + + pub fn with_success_criteria(mut self, success_criteria: SuccessCriteria) -> Self { + self.success_criteria = success_criteria; + self + } + + pub fn get_success_criteria_mut(&mut self) -> &mut SuccessCriteria { + &mut self.success_criteria + } + + pub fn with_existing_db(mut self, tag: String) -> Self { + self.existing_db_tag = Some(tag); + self + } + + pub fn number_of_tests(&self) -> usize { + self.admin_tests.len() + self.network_tests.len() + self.aptos_tests.len() + } + + pub fn all_tests(&self) -> Vec>> { + self.admin_tests + .iter() + .map(|t| Box::new(AnyTestRef::Admin(t.as_ref()))) + .chain( + self.network_tests + .iter() + .map(|t| Box::new(AnyTestRef::Network(t.as_ref()))), + ) + .chain( + self.aptos_tests + .iter() + .map(|t| Box::new(AnyTestRef::Aptos(t.as_ref()))), + ) + .collect() + } +} + +impl Default for ForgeConfig { + fn default() -> Self { + let forge_run_mode = ForgeRunnerMode::try_from_env().unwrap_or(ForgeRunnerMode::K8s); + let success_criteria = if forge_run_mode == ForgeRunnerMode::Local { + SuccessCriteria::new(600).add_no_restarts() + } else { + SuccessCriteria::new(3500) + .add_no_restarts() + .add_system_metrics_threshold(SystemMetricsThreshold::new( + // Check that we don't use more than 12 CPU cores for 30% of the time. + MetricsThreshold::new(12.0, 30), + // Check that we don't use more than 10 GB of memory for 30% of the time. 
+ MetricsThreshold::new_gb(10.0, 30), + )) + }; + Self { + suite_name: None, + aptos_tests: vec![], + admin_tests: vec![], + network_tests: vec![], + initial_validator_count: NonZeroUsize::new(1).unwrap(), + initial_fullnode_count: 0, + initial_version: InitialVersion::Oldest, + genesis_config: None, + genesis_helm_config_fn: None, + validator_override_node_config_fn: None, + fullnode_override_node_config_fn: None, + multi_region_config: false, + emit_job_request: EmitJobRequest::default().mode(EmitJobMode::MaxLoad { + mempool_backlog: 40000, + }), + success_criteria, + existing_db_tag: None, + validator_resource_override: NodeResourceOverride::default(), + fullnode_resource_override: NodeResourceOverride::default(), + retain_debug_logs: false, + } + } +} diff --git a/testsuite/forge/src/interface/test.rs b/testsuite/forge/src/interface/test.rs index 72c78e6a64514..d3f6c9244b89c 100644 --- a/testsuite/forge/src/interface/test.rs +++ b/testsuite/forge/src/interface/test.rs @@ -3,6 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 use rand::SeedableRng; +use std::borrow::Cow; /// Whether a test is expected to fail or not #[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] @@ -12,6 +13,22 @@ pub enum ShouldFail { YesWithMessage(&'static str), } +#[derive(Debug, Clone)] +pub struct TestDetails { + pub name: String, + pub reporting_name: String, +} + +impl TestDetails { + pub fn name(&self) -> String { + self.name.clone() + } + + pub fn reporting_name(&self) -> String { + self.reporting_name.clone() + } +} + /// Represents a Test in Forge /// /// This is meant to be a super trait of the other test interfaces. @@ -28,6 +45,18 @@ pub trait Test: Send + Sync { fn should_fail(&self) -> ShouldFail { ShouldFail::No } + + /// Name used specifically for external reporting + fn reporting_name(&self) -> Cow<'static, str> { + Cow::Borrowed(self.name()) + } + + fn details(&self) -> TestDetails { + TestDetails { + name: self.name().to_string(), + reporting_name: self.reporting_name().to_string(), + } + } } impl Test for &T { diff --git a/testsuite/forge/src/lib.rs b/testsuite/forge/src/lib.rs index bdd8ec3cc6eeb..3c8dffb773d1b 100644 --- a/testsuite/forge/src/lib.rs +++ b/testsuite/forge/src/lib.rs @@ -9,6 +9,7 @@ pub use anyhow::Result; mod interface; pub use interface::*; +pub mod observer; mod runner; pub use runner::*; @@ -19,6 +20,7 @@ pub use backend::*; mod report; pub use report::*; +pub mod result; mod github; pub use github::*; @@ -29,3 +31,6 @@ pub use slack::*; pub mod success_criteria; pub mod test_utils; + +pub mod config; +pub use config::ForgeConfig; diff --git a/testsuite/forge/src/observer/junit.rs b/testsuite/forge/src/observer/junit.rs new file mode 100644 index 0000000000000..30ddce90db671 --- /dev/null +++ b/testsuite/forge/src/observer/junit.rs @@ -0,0 +1,79 @@ +// Copyright © Aptos Foundation +// Parts of the project are originally copyright © Meta Platforms, Inc. 
+// SPDX-License-Identifier: Apache-2.0
+
+use crate::{
+    result::{TestObserver, TestResult},
+    TestDetails,
+};
+use anyhow::Result;
+use quick_junit::{NonSuccessKind, Report, TestCase, TestSuite};
+use std::sync::Mutex;
+use uuid::Uuid;
+
+pub struct JunitTestObserver {
+    name: String,
+    path: String,
+    results: Mutex<Vec<(String, TestResult)>>,
+}
+
+impl JunitTestObserver {
+    pub fn new(name: String, path: String) -> Self {
+        Self {
+            name,
+            path,
+            results: Mutex::new(vec![]),
+        }
+    }
+}
+
+impl TestObserver for JunitTestObserver {
+    fn name(&self) -> String {
+        format!("{} junit observer", self.name)
+    }
+
+    fn handle_result(&self, details: &TestDetails, result: &TestResult) -> Result<()> {
+        self.results
+            .lock()
+            .unwrap()
+            .push((details.reporting_name(), result.clone()));
+        Ok(())
+    }
+
+    fn finish(&self) -> Result<()> {
+        let mut report = Report::new("forge");
+        let uuid = Uuid::new_v4();
+        report.set_uuid(uuid);
+
+        let mut suite = TestSuite::new(self.name.clone());
+        for (test_name, result) in self.results.lock().unwrap().iter() {
+            let status = match result {
+                TestResult::Ok => quick_junit::TestCaseStatus::success(),
+                TestResult::FailedWithMsg(msg) => {
+                    // JUnit convention: a "failure" is an assertion that did not hold,
+                    // while an "error" is an unexpected abort. Forge reports all
+                    // non-successes as failures.
+                    let mut status =
+                        quick_junit::TestCaseStatus::non_success(NonSuccessKind::Failure);
+                    status.set_message(msg.clone());
+                    status
+                },
+            };
+
+            let test_case = TestCase::new(test_name.clone(), status);
+            suite.add_test_case(test_case);
+        }
+
+        report.add_test_suite(suite);
+
+        // Write to stdout so github test runner can parse it easily
+        println!("=== BEGIN JUNIT ===");
+        let stdout = std::io::stdout();
+        report.serialize(stdout)?;
+        println!("=== END JUNIT ===");
+
+        // Also write to the file
+        let writer = std::fs::File::create(&self.path)?;
+        report.serialize(writer)?;
+
+        Ok(())
+    }
+}
diff --git a/testsuite/forge/src/observer/mod.rs b/testsuite/forge/src/observer/mod.rs
new file mode 100644
index 0000000000000..e5948202b9e2d
--- /dev/null
+++ b/testsuite/forge/src/observer/mod.rs
@@ -0,0 +1,5 @@
+// Copyright © Aptos Foundation
+// Parts of the project are originally copyright © Meta Platforms, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+pub mod junit;
diff --git a/testsuite/forge/src/result.rs b/testsuite/forge/src/result.rs
new file mode 100644
index 0000000000000..0c96d2d1f1d19
--- /dev/null
+++ b/testsuite/forge/src/result.rs
@@ -0,0 +1,159 @@
+// Copyright © Aptos Foundation
+// Parts of the project are originally copyright © Meta Platforms, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+use crate::TestDetails;
+use anyhow::{bail, Result};
+use std::{
+    fmt::{Display, Formatter},
+    io::{self, Write as _},
+};
+use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
+
+#[derive(Debug, Clone)]
+pub enum TestResult {
+    Ok,
+    FailedWithMsg(String),
+}
+
+impl Display for TestResult {
+    fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
+        match self {
+            TestResult::Ok => write!(f, "Test Ok"),
+            TestResult::FailedWithMsg(msg) => write!(f, "Test Failed: {}", msg),
+        }
+    }
+}
+
+pub trait TestObserver {
+    fn name(&self) -> String;
+    fn handle_result(&self, details: &TestDetails, result: &TestResult) -> Result<()>;
+    fn finish(&self) -> Result<()>;
+}
+
+pub struct TestSummary {
+    stdout: StandardStream,
+    total: usize,
+    filtered_out: usize,
+    passed: usize,
+    failed: Vec<String>,
+    observers: Vec<Box<dyn TestObserver>>,
+}
+
+impl TestSummary {
+    pub fn new(total: usize, filtered_out: usize) -> Self {
+        Self {
+            stdout: StandardStream::stdout(ColorChoice::Auto),
+            total,
+            filtered_out,
+            passed: 0,
+            failed: Vec::new(),
+            observers: Vec::new(),
+        }
+    }
+
+    pub fn add_observer(&mut self, observer: Box<dyn TestObserver>) {
+        self.observers.push(observer);
+    }
+
+    pub fn handle_result(&mut self, details: TestDetails, result: TestResult) -> Result<()> {
+        write!(self.stdout, "test {} ... ", details.name())?;
+        match result.clone() {
+            TestResult::Ok => {
+                self.passed += 1;
+                self.write_ok()?;
+            },
+            TestResult::FailedWithMsg(msg) => {
+                self.failed.push(details.name());
+                self.write_failed()?;
+                writeln!(self.stdout)?;
+
+                write!(self.stdout, "Error: {}", msg)?;
+            },
+        }
+        writeln!(self.stdout)?;
+        let mut errors = vec![];
+        for observer in &self.observers {
+            let result = observer.handle_result(&details, &result);
+            if let Err(e) = result {
+                errors.push(format!("{}: {}", observer.name(), e));
+            }
+        }
+        if !errors.is_empty() {
+            bail!("Failed to handle_result in observers: {:?}", errors);
+        }
+        Ok(())
+    }
+
+    pub fn finish(&self) -> Result<()> {
+        let mut errors = vec![];
+        for observer in &self.observers {
+            let result = observer.finish();
+            if let Err(e) = result {
+                errors.push(format!("{}: {}", observer.name(), e));
+            }
+        }
+        if !errors.is_empty() {
+            bail!("Failed to finish observers: {:?}", errors);
+        }
+        Ok(())
+    }
+
+    fn write_ok(&mut self) -> io::Result<()> {
+        self.stdout
+            .set_color(ColorSpec::new().set_fg(Some(Color::Green)))?;
+        write!(self.stdout, "ok")?;
+        self.stdout.reset()?;
+        Ok(())
+    }
+
+    fn write_failed(&mut self) -> io::Result<()> {
+        self.stdout
+            .set_color(ColorSpec::new().set_fg(Some(Color::Red)))?;
+        write!(self.stdout, "FAILED")?;
+        self.stdout.reset()?;
+        Ok(())
+    }
+
+    pub fn write_starting_msg(&mut self) -> io::Result<()> {
+        writeln!(self.stdout)?;
+        writeln!(
+            self.stdout,
+            "running {} tests",
+            self.total - self.filtered_out
+        )?;
+        Ok(())
+    }
+
+    pub fn write_summary(&mut self) -> io::Result<()> {
+        // Print out the failing tests
+        if !self.failed.is_empty() {
+            writeln!(self.stdout)?;
+            writeln!(self.stdout, "failures:")?;
+            for name in &self.failed {
+                writeln!(self.stdout, "    {}", name)?;
+            }
+        }
+
+        writeln!(self.stdout)?;
+        write!(self.stdout, "test result: ")?;
+        if self.failed.is_empty() {
+            self.write_ok()?;
+        } else {
+            self.write_failed()?;
+        }
+        writeln!(
+            self.stdout,
+            ".
{} passed; {} failed; {} filtered out", + self.passed, + self.failed.len(), + self.filtered_out + )?; + writeln!(self.stdout)?; + Ok(()) + } + + pub fn success(&self) -> bool { + self.failed.is_empty() + } +} diff --git a/testsuite/forge/src/runner.rs b/testsuite/forge/src/runner.rs index 73e0262708f9e..5545f9ef2939b 100644 --- a/testsuite/forge/src/runner.rs +++ b/testsuite/forge/src/runner.rs @@ -4,16 +4,18 @@ // TODO going to remove random seed once cluster deployment supports re-run genesis use crate::{ - success_criteria::{MetricsThreshold, SuccessCriteria, SystemMetricsThreshold}, - *, + config::ForgeConfig, + observer::junit::JunitTestObserver, + result::{TestResult, TestSummary}, + AdminContext, AdminTest, AptosContext, AptosTest, CoreContext, Factory, NetworkContext, + NetworkContextSynchronizer, NetworkTest, ShouldFail, Test, TestReport, Version, + NAMESPACE_CLEANUP_DURATION_BUFFER_SECS, }; use anyhow::{bail, format_err, Error, Result}; -use aptos_config::config::{NodeConfig, OverrideNodeConfig}; -use aptos_framework::ReleaseBundle; +use aptos_config::config::NodeConfig; use clap::{Parser, ValueEnum}; use rand::{rngs::OsRng, Rng, SeedableRng}; use std::{ - fmt::{Display, Formatter}, io::{self, Write}, num::NonZeroUsize, process, @@ -21,7 +23,7 @@ use std::{ sync::Arc, time::Duration, }; -use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor}; +use sugars::boxed; use tokio::runtime::Runtime; const KUBERNETES_SERVICE_HOST: &str = "KUBERNETES_SERVICE_HOST"; @@ -78,6 +80,9 @@ pub struct Options { /// Retain debug logs and above for all nodes instead of just the first 5 nodes #[clap(long, default_value = "false", env = "FORGE_RETAIN_DEBUG_LOGS")] retain_debug_logs: bool, + /// Optional path to write junit xml test report + #[clap(long, env = "FORGE_JUNIT_XML_PATH")] + junit_xml_path: Option, } impl Options { @@ -130,286 +135,6 @@ pub struct NodeResourceOverride { pub storage_gib: Option, } -pub struct ForgeConfig { - aptos_tests: Vec>, - admin_tests: Vec>, - network_tests: Vec>, - - /// The initial number of validators to spawn when the test harness creates a swarm - initial_validator_count: NonZeroUsize, - - /// The initial number of fullnodes to spawn when the test harness creates a swarm - initial_fullnode_count: usize, - - /// The initial version to use when the test harness creates a swarm - initial_version: InitialVersion, - - /// The initial genesis modules to use when starting a network - genesis_config: Option, - - /// Optional genesis helm values init function - genesis_helm_config_fn: Option, - - /// Optional validator node config override function - validator_override_node_config_fn: Option, - - /// Optional fullnode node config override function - fullnode_override_node_config_fn: Option, - - multi_region_config: bool, - - /// Transaction workload to run on the swarm - emit_job_request: EmitJobRequest, - - /// Success criteria - success_criteria: SuccessCriteria, - - /// The label of existing DBs to use, if None, will create new db. 
- existing_db_tag: Option, - - validator_resource_override: NodeResourceOverride, - - fullnode_resource_override: NodeResourceOverride, - - /// Retain debug logs and above for all nodes instead of just the first 5 nodes - retain_debug_logs: bool, -} - -impl ForgeConfig { - pub fn new() -> Self { - Self::default() - } - - pub fn add_aptos_test(mut self, aptos_test: T) -> Self { - self.aptos_tests.push(Box::new(aptos_test)); - self - } - - pub fn with_aptos_tests(mut self, aptos_tests: Vec>) -> Self { - self.aptos_tests = aptos_tests; - self - } - - pub fn add_admin_test(mut self, admin_test: T) -> Self { - self.admin_tests.push(Box::new(admin_test)); - self - } - - pub fn with_admin_tests(mut self, admin_tests: Vec>) -> Self { - self.admin_tests = admin_tests; - self - } - - pub fn add_network_test(mut self, network_test: T) -> Self { - self.network_tests.push(Box::new(network_test)); - self - } - - pub fn with_network_tests(mut self, network_tests: Vec>) -> Self { - self.network_tests = network_tests; - self - } - - pub fn with_initial_validator_count(mut self, initial_validator_count: NonZeroUsize) -> Self { - self.initial_validator_count = initial_validator_count; - self - } - - pub fn with_initial_fullnode_count(mut self, initial_fullnode_count: usize) -> Self { - self.initial_fullnode_count = initial_fullnode_count; - self - } - - pub fn with_genesis_helm_config_fn(mut self, genesis_helm_config_fn: GenesisConfigFn) -> Self { - self.genesis_helm_config_fn = Some(genesis_helm_config_fn); - self - } - - pub fn with_validator_override_node_config_fn(mut self, f: OverrideNodeConfigFn) -> Self { - self.validator_override_node_config_fn = Some(f); - self - } - - pub fn with_fullnode_override_node_config_fn(mut self, f: OverrideNodeConfigFn) -> Self { - self.fullnode_override_node_config_fn = Some(f); - self - } - - pub fn with_multi_region_config(mut self) -> Self { - self.multi_region_config = true; - self - } - - pub fn with_validator_resource_override( - mut self, - resource_override: NodeResourceOverride, - ) -> Self { - self.validator_resource_override = resource_override; - self - } - - pub fn with_fullnode_resource_override( - mut self, - resource_override: NodeResourceOverride, - ) -> Self { - self.fullnode_resource_override = resource_override; - self - } - - fn override_node_config_from_fn(config_fn: OverrideNodeConfigFn) -> OverrideNodeConfig { - let mut override_config = NodeConfig::default(); - let mut base_config = NodeConfig::default(); - config_fn(&mut override_config, &mut base_config); - OverrideNodeConfig::new(override_config, base_config) - } - - /// Builds a function that can be used to override the default helm values for the validator and fullnode. 
- /// If a configuration is intended to be set for all nodes, set the value in the default helm values file: - /// testsuite/forge/src/backend/k8s/helm-values/aptos-node-default-values.yaml - pub fn build_node_helm_config_fn(&self, retain_debug_logs: bool) -> Option { - let validator_override_node_config = self - .validator_override_node_config_fn - .clone() - .map(|config_fn| Self::override_node_config_from_fn(config_fn)); - let fullnode_override_node_config = self - .fullnode_override_node_config_fn - .clone() - .map(|config_fn| Self::override_node_config_from_fn(config_fn)); - let multi_region_config = self.multi_region_config; - let existing_db_tag = self.existing_db_tag.clone(); - let validator_resource_override = self.validator_resource_override; - let fullnode_resource_override = self.fullnode_resource_override; - - // Override specific helm values. See reference: terraform/helm/aptos-node/values.yaml - Some(Arc::new(move |helm_values: &mut serde_yaml::Value| { - if let Some(override_config) = &validator_override_node_config { - helm_values["validator"]["config"] = override_config.get_yaml().unwrap(); - } - if let Some(override_config) = &fullnode_override_node_config { - helm_values["fullnode"]["config"] = override_config.get_yaml().unwrap(); - } - if multi_region_config { - helm_values["multicluster"]["enabled"] = true.into(); - // Create headless services for validators and fullnodes. - // Note: chaos-mesh will not work with clusterIP services. - helm_values["service"]["validator"]["internal"]["type"] = "ClusterIP".into(); - helm_values["service"]["validator"]["internal"]["headless"] = true.into(); - helm_values["service"]["fullnode"]["internal"]["type"] = "ClusterIP".into(); - helm_values["service"]["fullnode"]["internal"]["headless"] = true.into(); - } - if let Some(existing_db_tag) = &existing_db_tag { - helm_values["validator"]["storage"]["labels"]["tag"] = - existing_db_tag.clone().into(); - helm_values["fullnode"]["storage"]["labels"]["tag"] = - existing_db_tag.clone().into(); - } - - // validator resource overrides - if let Some(cpu_cores) = validator_resource_override.cpu_cores { - helm_values["validator"]["resources"]["requests"]["cpu"] = cpu_cores.into(); - helm_values["validator"]["resources"]["limits"]["cpu"] = cpu_cores.into(); - } - if let Some(memory_gib) = validator_resource_override.memory_gib { - helm_values["validator"]["resources"]["requests"]["memory"] = - format!("{}Gi", memory_gib).into(); - helm_values["validator"]["resources"]["limits"]["memory"] = - format!("{}Gi", memory_gib).into(); - } - if let Some(storage_gib) = validator_resource_override.storage_gib { - helm_values["validator"]["storage"]["size"] = format!("{}Gi", storage_gib).into(); - } - // fullnode resource overrides - if let Some(cpu_cores) = fullnode_resource_override.cpu_cores { - helm_values["fullnode"]["resources"]["requests"]["cpu"] = cpu_cores.into(); - helm_values["fullnode"]["resources"]["limits"]["cpu"] = cpu_cores.into(); - } - if let Some(memory_gib) = fullnode_resource_override.memory_gib { - helm_values["fullnode"]["resources"]["requests"]["memory"] = - format!("{}Gi", memory_gib).into(); - helm_values["fullnode"]["resources"]["limits"]["memory"] = - format!("{}Gi", memory_gib).into(); - } - if let Some(storage_gib) = fullnode_resource_override.storage_gib { - helm_values["fullnode"]["storage"]["size"] = format!("{}Gi", storage_gib).into(); - } - - if retain_debug_logs { - helm_values["validator"]["podAnnotations"]["aptos.dev/min-log-level-to-retain"] = - 
@@ -474,45 +199,6 @@ impl ForgeRunnerMode {
     }
 }
 
-impl Default for ForgeConfig {
-    fn default() -> Self {
-        let forge_run_mode = ForgeRunnerMode::try_from_env().unwrap_or(ForgeRunnerMode::K8s);
-        let success_criteria = if forge_run_mode == ForgeRunnerMode::Local {
-            SuccessCriteria::new(600).add_no_restarts()
-        } else {
-            SuccessCriteria::new(3500)
-                .add_no_restarts()
-                .add_system_metrics_threshold(SystemMetricsThreshold::new(
-                    // Check that we don't use more than 12 CPU cores for 30% of the time.
-                    MetricsThreshold::new(12.0, 30),
-                    // Check that we don't use more than 10 GB of memory for 30% of the time.
-                    MetricsThreshold::new_gb(10.0, 30),
-                ))
-        };
-        Self {
-            aptos_tests: vec![],
-            admin_tests: vec![],
-            network_tests: vec![],
-            initial_validator_count: NonZeroUsize::new(1).unwrap(),
-            initial_fullnode_count: 0,
-            initial_version: InitialVersion::Oldest,
-            genesis_config: None,
-            genesis_helm_config_fn: None,
-            validator_override_node_config_fn: None,
-            fullnode_override_node_config_fn: None,
-            multi_region_config: false,
-            emit_job_request: EmitJobRequest::default().mode(EmitJobMode::MaxLoad {
-                mempool_backlog: 40000,
-            }),
-            success_criteria,
-            existing_db_tag: None,
-            validator_resource_override: NodeResourceOverride::default(),
-            fullnode_resource_override: NodeResourceOverride::default(),
-            retain_debug_logs: false,
-        }
-    }
-}
-
 pub struct Forge<'cfg, F> {
     options: &'cfg Options,
     tests: ForgeConfig,
@@ -568,6 +254,15 @@ impl<'cfg, F: Factory> Forge<'cfg, F> {
         let mut report = TestReport::new();
         let mut summary = TestSummary::new(test_count, filtered_out);
+
+        // Optionally write junit xml test report for external processing
+        if let Some(junit_xml_path) = self.options.junit_xml_path.as_ref() {
+            let junit_observer = JunitTestObserver::new(
+                self.tests.get_suite_name().unwrap_or("local".to_string()),
+                junit_xml_path.to_owned(),
+            );
+            summary.add_observer(boxed!(junit_observer));
+        }
         summary.write_starting_msg()?;
 
         if test_count > 0 {
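
This hunk is the heart of the JUnit feature on the Rust side: when the `junit_xml_path` option is set, a `JunitTestObserver` is registered on the summary before any test runs, and the `summary.finish()?` call added further down gives it a flush point. Taken together, those calls imply an observer seam roughly like the sketch below; the trait and method names here are illustrative assumptions, not the crate's exact definitions.

```rust
use std::io;

// Hypothetical observer interface consistent with the `add_observer(...)`
// and `summary.finish()?` calls in this diff (names are assumptions).
trait TestObserver {
    // Invoked once per completed test with its reporting name and outcome.
    fn handle_result(&mut self, name: &str, passed: bool, message: Option<&str>)
        -> io::Result<()>;
    // Invoked once after the whole run; a JUnit observer would buffer
    // results in handle_result and serialize the XML file here.
    fn finish(&mut self) -> io::Result<()>;
}
```
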
@@ -603,9 +298,9 @@ impl<'cfg, F: Factory> Forge<'cfg, F> {
                     swarm.chain_info().into_aptos_public_info(),
                     &mut report,
                 );
-                let result = run_test(|| runtime.block_on(test.run(&mut aptos_ctx)));
+                let result = process_test_result(runtime.block_on(test.run(&mut aptos_ctx)));
                 report.report_text(result.to_string());
-                summary.handle_result(test.name().to_owned(), result)?;
+                summary.handle_result(test.details(), result)?;
             }
 
             // Run AdminTests
@@ -615,9 +310,9 @@ impl<'cfg, F: Factory> Forge<'cfg, F> {
                     swarm.chain_info(),
                     &mut report,
                 );
-                let result = run_test(|| test.run(&mut admin_ctx));
+                let result = process_test_result(test.run(&mut admin_ctx));
                 report.report_text(result.to_string());
-                summary.handle_result(test.name().to_owned(), result)?;
+                summary.handle_result(test.details(), result)?;
             }
 
             let logs_location = swarm.logs_location();
@@ -634,17 +329,18 @@ impl<'cfg, F: Factory> Forge<'cfg, F> {
                 let handle = network_ctx.runtime.handle().clone();
                 let _handle_context = handle.enter();
                 let network_ctx = NetworkContextSynchronizer::new(network_ctx, handle.clone());
-                let result = run_test(|| handle.block_on(test.run(network_ctx.clone())));
+                let result = process_test_result(handle.block_on(test.run(network_ctx.clone())));
                 // explicitly keep network context in scope so that its created tokio Runtime drops after all the stuff has run.
                 let NetworkContextSynchronizer { ctx, handle } = network_ctx;
                 drop(handle);
                 let ctx = Arc::into_inner(ctx).unwrap().into_inner();
                 drop(ctx);
                 report.report_text(result.to_string());
-                summary.handle_result(test.name().to_owned(), result)?;
+                summary.handle_result(test.details(), result)?;
             }
 
             report.print_report();
+            summary.finish()?;
 
             io::stdout().flush()?;
             io::stderr().flush()?;
@@ -692,22 +388,8 @@ impl<'cfg, F: Factory> Forge<'cfg, F> {
     }
 }
 
-enum TestResult {
-    Ok,
-    FailedWithMsg(String),
-}
-
-impl Display for TestResult {
-    fn fmt(&self, f: &mut Formatter) -> std::fmt::Result {
-        match self {
-            TestResult::Ok => write!(f, "Test Ok"),
-            TestResult::FailedWithMsg(msg) => write!(f, "Test Failed: {}", msg),
-        }
-    }
-}
-
-fn run_test<F: FnOnce() -> Result<()>>(f: F) -> TestResult {
-    match f() {
+fn process_test_result(result: Result<()>) -> TestResult {
+    match result {
         Ok(()) => TestResult::Ok,
         Err(e) => {
             let is_triggerd_by_github_actions =
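
The `run_test` → `process_test_result` change is more than a rename: the helper no longer takes a closure, so each call site runs (or `block_on`s) the test itself and only hands over the finished `Result` for classification. A standalone sketch of the new shape; the real helper also decorates the failure message with GitHub Actions context, which is elided here.

```rust
use anyhow::{anyhow, Result};

#[derive(Debug)]
enum TestResult {
    Ok,
    FailedWithMsg(String),
}

// Classify an already-produced Result instead of invoking a closure, so
// sync and async call sites can share one helper.
fn process_test_result(result: Result<()>) -> TestResult {
    match result {
        Ok(()) => TestResult::Ok,
        Err(e) => TestResult::FailedWithMsg(format!("{:?}", e)),
    }
}

fn main() {
    println!("{:?}", process_test_result(Ok(())));
    println!("{:?}", process_test_result(Err(anyhow!("node crashed"))));
}
```
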
@@ -721,103 +403,6 @@ fn run_test<F: FnOnce() -> Result<()>>(f: F) -> TestResult {
     }
 }
 
-struct TestSummary {
-    stdout: StandardStream,
-    total: usize,
-    filtered_out: usize,
-    passed: usize,
-    failed: Vec<String>,
-}
-
-impl TestSummary {
-    fn new(total: usize, filtered_out: usize) -> Self {
-        Self {
-            stdout: StandardStream::stdout(ColorChoice::Auto),
-            total,
-            filtered_out,
-            passed: 0,
-            failed: Vec::new(),
-        }
-    }
-
-    fn handle_result(&mut self, name: String, result: TestResult) -> io::Result<()> {
-        write!(self.stdout, "test {} ... ", name)?;
-        match result {
-            TestResult::Ok => {
-                self.passed += 1;
-                self.write_ok()?;
-            },
-            TestResult::FailedWithMsg(msg) => {
-                self.failed.push(name);
-                self.write_failed()?;
-                writeln!(self.stdout)?;
-
-                write!(self.stdout, "Error: {}", msg)?;
-            },
-        }
-        writeln!(self.stdout)?;
-        Ok(())
-    }
-
-    fn write_ok(&mut self) -> io::Result<()> {
-        self.stdout
-            .set_color(ColorSpec::new().set_fg(Some(Color::Green)))?;
-        write!(self.stdout, "ok")?;
-        self.stdout.reset()?;
-        Ok(())
-    }
-
-    fn write_failed(&mut self) -> io::Result<()> {
-        self.stdout
-            .set_color(ColorSpec::new().set_fg(Some(Color::Red)))?;
-        write!(self.stdout, "FAILED")?;
-        self.stdout.reset()?;
-        Ok(())
-    }
-
-    fn write_starting_msg(&mut self) -> io::Result<()> {
-        writeln!(self.stdout)?;
-        writeln!(
-            self.stdout,
-            "running {} tests",
-            self.total - self.filtered_out
-        )?;
-        Ok(())
-    }
-
-    fn write_summary(&mut self) -> io::Result<()> {
-        // Print out the failing tests
-        if !self.failed.is_empty() {
-            writeln!(self.stdout)?;
-            writeln!(self.stdout, "failures:")?;
-            for name in &self.failed {
-                writeln!(self.stdout, "    {}", name)?;
-            }
-        }
-
-        writeln!(self.stdout)?;
-        write!(self.stdout, "test result: ")?;
-        if self.failed.is_empty() {
-            self.write_ok()?;
-        } else {
-            self.write_failed()?;
-        }
-        writeln!(
-            self.stdout,
-            ". {} passed; {} failed; {} filtered out",
-            self.passed,
-            self.failed.len(),
-            self.filtered_out
-        )?;
-        writeln!(self.stdout)?;
-        Ok(())
-    }
-
-    fn success(&self) -> bool {
-        self.failed.is_empty()
-    }
-}
-
 #[cfg(test)]
 mod test {
     use super::*;
diff --git a/testsuite/forge_test.py b/testsuite/forge_test.py
index 5b6c4b567e674..a337e06fd5109 100644
--- a/testsuite/forge_test.py
+++ b/testsuite/forge_test.py
@@ -1,6 +1,7 @@
 from contextlib import ExitStack
 import json
 import os
+import textwrap
 import unittest
 import tempfile
 from datetime import datetime, timezone, timedelta
@@ -14,6 +15,8 @@
 import forge
 from forge import (
+    BEGIN_JUNIT,
+    END_JUNIT,
     ForgeCluster,
     ForgeConfigBackend,
     ForgeContext,
@@ -29,6 +32,7 @@
     find_recent_images,
     find_recent_images_by_profile_or_features,
     format_comment,
+    format_junit_xml,
     format_pre_comment,
     format_report,
     get_all_forge_jobs,
@@ -167,6 +171,7 @@ def fake_context(
         forge_username="banana-eater",
         forge_blocking=True,
         forge_retain_debug_logs="true",
+        forge_junit_xml_path=None,
         github_actions="false",
         github_job_url="https://banana",
     )
@@ -661,6 +666,25 @@ def testPossibleAuthFailureMessage(self) -> None:
         output = result.format(context)
         self.assertFixture(output, "testPossibleAuthFailureMessage.fixture")
 
+    def testFormatJunitXml(self) -> None:
+        result = ForgeResult.empty()
+        context = fake_context()
+
+        result.set_output(
+            textwrap.dedent(
+                f"""
+                {BEGIN_JUNIT}
+                <testsuites>
+                blah
+                </testsuites>
+                {END_JUNIT}
+                """
+            )
+        )
+
+        output = format_junit_xml(context, result)
+        self.assertFixture(output, "testFormatJunitXml.fixture")
+
 
 class ForgeMainTests(unittest.TestCase, AssertFixtureMixin):
     maxDiff = None
diff --git a/testsuite/testcases/src/lib.rs b/testsuite/testcases/src/lib.rs
index 92320a3136405..3e3bc617c2fd7 100644
--- a/testsuite/testcases/src/lib.rs
+++ b/testsuite/testcases/src/lib.rs
@@ -39,6 +39,7 @@ use async_trait::async_trait;
 use futures::future::join_all;
 use rand::{rngs::StdRng, SeedableRng};
 use std::{
+    borrow::Cow,
     fmt::Write,
     ops::DerefMut,
     sync::Arc,
@@ -644,6 +645,15 @@ impl Test for CompositeNetworkTest {
     fn name(&self) -> &'static str {
         "CompositeNetworkTest"
     }
+
+    fn reporting_name(&self) -> Cow<'static, str> {
+        let mut name_builder = self.test.name().to_owned();
+        for wrapper in self.wrappers.iter() {
+            name_builder = format!("{}({})", wrapper.name(), name_builder);
+        }
+        name_builder = format!("CompositeNetworkTest({})", name_builder);
+        Cow::Owned(name_builder)
+    }
 }
 
 pub(crate) fn generate_onchain_config_blob(data: &[u8]) -> String {
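
`reporting_name` gives wrapped tests a stable, structured identity; this is the string that `summary.handle_result(test.details(), ...)` ultimately surfaces in reports and in the JUnit XML. A standalone demo of the same composition rule (the test names below are made up for illustration):

```rust
// Mirrors the loop in reporting_name(): wrappers wrap the inner test's name
// from innermost to outermost, then the composite labels the whole thing.
fn composite_reporting_name(inner: &str, wrapper_names: &[&str]) -> String {
    let mut name_builder = inner.to_owned();
    for wrapper in wrapper_names {
        name_builder = format!("{}({})", wrapper, name_builder);
    }
    format!("CompositeNetworkTest({})", name_builder)
}

fn main() {
    // Prints: CompositeNetworkTest(LoadVsPerfBenchmark(PerformanceBenchmark))
    println!(
        "{}",
        composite_reporting_name("PerformanceBenchmark", &["LoadVsPerfBenchmark"])
    );
}
```
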
diff --git a/types/src/block_executor/config.rs b/types/src/block_executor/config.rs
index 586b76e8c7942..9878195a3d6cf 100644
--- a/types/src/block_executor/config.rs
+++ b/types/src/block_executor/config.rs
@@ -21,12 +21,21 @@ pub struct BlockExecutorLocalConfig {
 #[derive(Clone, Debug, Deserialize, Serialize)]
 pub struct BlockExecutorConfigFromOnchain {
     pub block_gas_limit_type: BlockGasLimitType,
+    pub block_gas_limit_override: Option<u64>,
 }
 
 impl BlockExecutorConfigFromOnchain {
+    pub fn new(block_gas_limit_type: BlockGasLimitType) -> Self {
+        Self {
+            block_gas_limit_type,
+            block_gas_limit_override: None,
+        }
+    }
+
     pub fn new_no_block_limit() -> Self {
         Self {
             block_gas_limit_type: BlockGasLimitType::NoLimit,
+            block_gas_limit_override: None,
         }
     }
 
@@ -34,6 +43,7 @@
         Self {
             block_gas_limit_type: maybe_block_gas_limit
                 .map_or(BlockGasLimitType::NoLimit, BlockGasLimitType::Limit),
+            block_gas_limit_override: None,
         }
     }
 
@@ -52,6 +62,7 @@
                 add_block_limit_outcome_onchain: false,
                 use_granular_resource_group_conflicts: false,
             },
+            block_gas_limit_override: None,
         }
     }
 }
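
Adding a second public field means any remaining struct-literal construction of `BlockExecutorConfigFromOnchain` outside this file would stop compiling, which is what the new `new(...)` constructor is for (see the `block_executor_onchain_config` change below). A usage sketch; the `aptos_types::` paths are assumptions inferred from the file locations in this diff.

```rust
use aptos_types::{
    block_executor::config::BlockExecutorConfigFromOnchain,
    on_chain_config::BlockGasLimitType,
};

// Hedged sketch: construct via the new constructor, then pin an explicit
// override. Both fields are public; the override defaults to None.
fn config_with_override() -> BlockExecutorConfigFromOnchain {
    let mut config = BlockExecutorConfigFromOnchain::new(BlockGasLimitType::NoLimit);
    config.block_gas_limit_override = Some(1_000_000); // illustrative value
    config
}
```
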
diff --git a/types/src/on_chain_config/execution_config.rs b/types/src/on_chain_config/execution_config.rs
index 90a194092ad5d..0592b0f047aa0 100644
--- a/types/src/on_chain_config/execution_config.rs
+++ b/types/src/on_chain_config/execution_config.rs
@@ -49,9 +49,7 @@ impl OnChainExecutionConfig {
     }
 
     pub fn block_executor_onchain_config(&self) -> BlockExecutorConfigFromOnchain {
-        BlockExecutorConfigFromOnchain {
-            block_gas_limit_type: self.block_gas_limit_type(),
-        }
+        BlockExecutorConfigFromOnchain::new(self.block_gas_limit_type())
     }
 
     /// The type of the transaction deduper being used.
diff --git a/types/src/transaction/block_epilogue.rs b/types/src/transaction/block_epilogue.rs
index b9ac969bee068..4b2db886209c7 100644
--- a/types/src/transaction/block_epilogue.rs
+++ b/types/src/transaction/block_epilogue.rs
@@ -48,4 +48,13 @@ impl BlockEndInfo {
             } => *block_gas_limit_reached || *block_output_limit_reached,
         }
     }
+
+    pub fn block_effective_gas_units(&self) -> u64 {
+        match self {
+            Self::V0 {
+                block_effective_block_gas_units,
+                ..
+            } => *block_effective_block_gas_units,
+        }
+    }
 }
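
The new accessor lets consumers read the effective block gas without matching on the `BlockEndInfo` enum themselves. A consumption sketch, assuming the type is importable from `aptos_types::transaction::block_epilogue` (the exact re-export path is not shown in this diff):

```rust
use aptos_types::transaction::block_epilogue::BlockEndInfo;

// Report how much gas a block effectively consumed before it closed; only
// the accessor added in this diff is used.
fn log_effective_gas(info: &BlockEndInfo) {
    println!(
        "block consumed {} effective gas units",
        info.block_effective_gas_units()
    );
}
```
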