From c71c26adf32ce80dfb8c0e67fb8db36ed7b32885 Mon Sep 17 00:00:00 2001 From: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> Date: Thu, 13 Jul 2023 14:58:39 +0300 Subject: [PATCH] Deprecating and removing tablet throttler CLI flags and tests (#13246) * Table throttler: --throttler-config-via-topo now defaults to 'true' Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * add deprecation message Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * endtoend tests: remove '--enable-lag-throttler' and use 'UpdateThrottlerConfig' everywhere Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * always use vtctldclient Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * use cluster.VtctldClientProcess Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * disable --throttler-config-via-topo in old throttler tests Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * Remove --throttler-config-via-topo where used, since it now defaults 'true' Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * fix vreplication cluster setup, waiting for throttler config to apply Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * changelog Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * extend throttler threshold Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * a bit more verbose Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * fixed CLI test Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * remove old '--enable-lag-throttler' flag, introduce '--heartbeat_on_demand_duration' Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * more log info in throttler.Open() Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * more logging Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * Revert to --heartbeat_enable Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * Protect throttler config change application with initMutex And in e2e test update the throttler config on the keyspace when it's created. Only wait for the new tablets in a shard to have the throttler enabled when adding a Shard. Signed-off-by: Matt Lord * More CI testing Signed-off-by: Matt Lord * CI testing cont Signed-off-by: Matt Lord * Yes... Signed-off-by: Matt Lord * Somebody doesn't like force pushes so msg here Signed-off-by: Matt Lord * Increase on-demand heartbeat duration from 10s to 1m Signed-off-by: Matt Lord * Use only on-demand heartbeats everywhere Signed-off-by: Matt Lord * Use same throttler config everywhere Signed-off-by: Matt Lord * Update all keyspaces and don't fail test on missing JSON keys Signed-off-by: Matt Lord * Use constant heartbeats in vrepl e2e tests Until https://github.com/vitessio/vitess/issues/13175 is fixed. Signed-off-by: Matt Lord * Increase workflow command timeout Signed-off-by: Matt Lord * Don't wait for throttler on non-serving primaries Signed-off-by: Matt Lord * https://github.com/vitessio/vitess/issues/13175 is fixed, therefore re-instating on-deman heartbeats Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * Added ToC Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * Tweak comment and kick CI Signed-off-by: Matt Lord * Treat isOpen as the ready/running signal. Also align all initMutex usage. Signed-off-by: Matt Lord * Re-adjust comment Signed-off-by: Matt Lord * Adjust CheckIsReady() to match OnlineDDL's expectation/usage This was only using IsReady() before, now it's using IsOpen() and IsReady(). Signed-off-by: Matt Lord * Get rid of log messages from SrvKeyspaceWatcher when no node/key Signed-off-by: Matt Lord * More corrections/tweaks Signed-off-by: Matt Lord * Use more convenient/clear new IsRunning function Signed-off-by: Matt Lord * Revert "Use more convenient/clear new IsRunning function" This reverts commit 9aef27655cabd6f54784a99283ecfd028df6143e as this change was not correct. Signed-off-by: Matt Lord * Further fix correct use of IsOpen(), IsRunning(), IsEnabled() Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * throttler.throttledApps cannot be nil Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * Remove --enable_lag_throttler flag Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * Deprecate --throttler_config_via_topo Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * remove throttler mitigation code, as the problem was solved in https://github.com/vitessio/vitess/pull/13195 Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * deperecate throttler config flags Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * Removed tabletmanager_throttler and tabletmanager_throttler_custom_config tests Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * changelog Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * remove EnableThrottler() call Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * restore default value Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * update threshold Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * update flags desc Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * using atomic.Bool Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * Update changelog/18.0/18.0.0/summary.md Co-authored-by: Matt Lord Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * use MarkDeprecated Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * do not expect flags in vttablet --help Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> * remove --throttler-config-via-topo from examples scripts Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> --------- Signed-off-by: Shlomi Noach <2607934+shlomi-noach@users.noreply.github.com> Signed-off-by: Matt Lord Co-authored-by: Matt Lord --- ...uster_endtoend_tabletmanager_throttler.yml | 139 -------- ..._tabletmanager_throttler_custom_config.yml | 139 -------- changelog/18.0/18.0.0/summary.md | 12 + examples/common/scripts/vttablet-up.sh | 5 +- go/flags/endtoend/vttablet.txt | 7 - .../tabletmanager/throttler/throttler_test.go | 319 ------------------ .../throttler_custom_config/throttler_test.go | 264 --------------- go/test/endtoend/throttler/util.go | 2 +- go/vt/vttablet/endtoend/vstreamer_test.go | 2 - go/vt/vttablet/onlineddl/executor.go | 21 -- .../tabletserver/repltracker/repltracker.go | 2 +- .../tabletserver/repltracker/writer.go | 2 +- .../vttablet/tabletserver/tabletenv/config.go | 6 +- go/vt/vttablet/tabletserver/tabletserver.go | 7 - .../tabletserver/throttle/throttler.go | 119 +++---- test/ci_workflow_gen.go | 2 - test/config.json | 22 -- 17 files changed, 67 insertions(+), 1003 deletions(-) delete mode 100644 .github/workflows/cluster_endtoend_tabletmanager_throttler.yml delete mode 100644 .github/workflows/cluster_endtoend_tabletmanager_throttler_custom_config.yml delete mode 100644 go/test/endtoend/tabletmanager/throttler/throttler_test.go delete mode 100644 go/test/endtoend/tabletmanager/throttler_custom_config/throttler_test.go diff --git a/.github/workflows/cluster_endtoend_tabletmanager_throttler.yml b/.github/workflows/cluster_endtoend_tabletmanager_throttler.yml deleted file mode 100644 index d4fdc55b178..00000000000 --- a/.github/workflows/cluster_endtoend_tabletmanager_throttler.yml +++ /dev/null @@ -1,139 +0,0 @@ -# DO NOT MODIFY: THIS FILE IS GENERATED USING "make generate_ci_workflows" - -name: Cluster (tabletmanager_throttler) -on: [push, pull_request] -concurrency: - group: format('{0}-{1}', ${{ github.ref }}, 'Cluster (tabletmanager_throttler)') - cancel-in-progress: true - -permissions: read-all - -env: - LAUNCHABLE_ORGANIZATION: "vitess" - LAUNCHABLE_WORKSPACE: "vitess-app" - GITHUB_PR_HEAD_SHA: "${{ github.event.pull_request.head.sha }}" - -jobs: - build: - name: Run endtoend tests on Cluster (tabletmanager_throttler) - runs-on: ubuntu-22.04 - - steps: - - name: Skip CI - run: | - if [[ "${{contains( github.event.pull_request.labels.*.name, 'Skip CI')}}" == "true" ]]; then - echo "skipping CI due to the 'Skip CI' label" - exit 1 - fi - - - name: Check if workflow needs to be skipped - id: skip-workflow - run: | - skip='false' - if [[ "${{github.event.pull_request}}" == "" ]] && [[ "${{github.ref}}" != "refs/heads/main" ]] && [[ ! "${{github.ref}}" =~ ^refs/heads/release-[0-9]+\.[0-9]$ ]] && [[ ! "${{github.ref}}" =~ "refs/tags/.*" ]]; then - skip='true' - fi - echo Skip ${skip} - echo "skip-workflow=${skip}" >> $GITHUB_OUTPUT - - - name: Check out code - if: steps.skip-workflow.outputs.skip-workflow == 'false' - uses: actions/checkout@v3 - - - name: Check for changes in relevant files - if: steps.skip-workflow.outputs.skip-workflow == 'false' - uses: frouioui/paths-filter@main - id: changes - with: - token: '' - filters: | - end_to_end: - - 'go/**/*.go' - - 'test.go' - - 'Makefile' - - 'build.env' - - 'go.sum' - - 'go.mod' - - 'proto/*.proto' - - 'tools/**' - - 'config/**' - - 'bootstrap.sh' - - '.github/workflows/cluster_endtoend_tabletmanager_throttler.yml' - - - name: Set up Go - if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' - uses: actions/setup-go@v4 - with: - go-version: 1.20.5 - - - name: Set up python - if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' - uses: actions/setup-python@v4 - - - name: Tune the OS - if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' - run: | - # Limit local port range to not use ports that overlap with server side - # ports that we listen on. - sudo sysctl -w net.ipv4.ip_local_port_range="22768 65535" - # Increase the asynchronous non-blocking I/O. More information at https://dev.mysql.com/doc/refman/5.7/en/innodb-parameters.html#sysvar_innodb_use_native_aio - echo "fs.aio-max-nr = 1048576" | sudo tee -a /etc/sysctl.conf - sudo sysctl -p /etc/sysctl.conf - - - name: Get dependencies - if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' - run: | - - # Get key to latest MySQL repo - sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 467B942D3A79BD29 - # Setup MySQL 8.0 - wget -c https://dev.mysql.com/get/mysql-apt-config_0.8.24-1_all.deb - echo mysql-apt-config mysql-apt-config/select-server select mysql-8.0 | sudo debconf-set-selections - sudo DEBIAN_FRONTEND="noninteractive" dpkg -i mysql-apt-config* - sudo apt-get update - # Install everything else we need, and configure - sudo apt-get install -y mysql-server mysql-client make unzip g++ etcd curl git wget eatmydata xz-utils libncurses5 - - sudo service mysql stop - sudo service etcd stop - sudo ln -s /etc/apparmor.d/usr.sbin.mysqld /etc/apparmor.d/disable/ - sudo apparmor_parser -R /etc/apparmor.d/usr.sbin.mysqld - go mod download - - # install JUnit report formatter - go install github.com/vitessio/go-junit-report@HEAD - - - name: Setup launchable dependencies - if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' && github.base_ref == 'main' - run: | - # Get Launchable CLI installed. If you can, make it a part of the builder image to speed things up - pip3 install --user launchable~=1.0 > /dev/null - - # verify that launchable setup is all correct. - launchable verify || true - - # Tell Launchable about the build you are producing and testing - launchable record build --name "$GITHUB_RUN_ID" --no-commit-collection --source . - - - name: Run cluster endtoend test - if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' - timeout-minutes: 45 - run: | - # We set the VTDATAROOT to the /tmp folder to reduce the file path of mysql.sock file - # which musn't be more than 107 characters long. - export VTDATAROOT="/tmp/" - source build.env - - set -x - - # run the tests however you normally do, then produce a JUnit XML file - eatmydata -- go run test.go -docker=false -follow -shard tabletmanager_throttler | tee -a output.txt | go-junit-report -set-exit-code > report.xml - - - name: Print test output and Record test result in launchable - if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' && always() - run: | - # send recorded tests to launchable - launchable record tests --build "$GITHUB_RUN_ID" go-test . || true - - # print test output - cat output.txt diff --git a/.github/workflows/cluster_endtoend_tabletmanager_throttler_custom_config.yml b/.github/workflows/cluster_endtoend_tabletmanager_throttler_custom_config.yml deleted file mode 100644 index 4e3c5777be6..00000000000 --- a/.github/workflows/cluster_endtoend_tabletmanager_throttler_custom_config.yml +++ /dev/null @@ -1,139 +0,0 @@ -# DO NOT MODIFY: THIS FILE IS GENERATED USING "make generate_ci_workflows" - -name: Cluster (tabletmanager_throttler_custom_config) -on: [push, pull_request] -concurrency: - group: format('{0}-{1}', ${{ github.ref }}, 'Cluster (tabletmanager_throttler_custom_config)') - cancel-in-progress: true - -permissions: read-all - -env: - LAUNCHABLE_ORGANIZATION: "vitess" - LAUNCHABLE_WORKSPACE: "vitess-app" - GITHUB_PR_HEAD_SHA: "${{ github.event.pull_request.head.sha }}" - -jobs: - build: - name: Run endtoend tests on Cluster (tabletmanager_throttler_custom_config) - runs-on: ubuntu-22.04 - - steps: - - name: Skip CI - run: | - if [[ "${{contains( github.event.pull_request.labels.*.name, 'Skip CI')}}" == "true" ]]; then - echo "skipping CI due to the 'Skip CI' label" - exit 1 - fi - - - name: Check if workflow needs to be skipped - id: skip-workflow - run: | - skip='false' - if [[ "${{github.event.pull_request}}" == "" ]] && [[ "${{github.ref}}" != "refs/heads/main" ]] && [[ ! "${{github.ref}}" =~ ^refs/heads/release-[0-9]+\.[0-9]$ ]] && [[ ! "${{github.ref}}" =~ "refs/tags/.*" ]]; then - skip='true' - fi - echo Skip ${skip} - echo "skip-workflow=${skip}" >> $GITHUB_OUTPUT - - - name: Check out code - if: steps.skip-workflow.outputs.skip-workflow == 'false' - uses: actions/checkout@v3 - - - name: Check for changes in relevant files - if: steps.skip-workflow.outputs.skip-workflow == 'false' - uses: frouioui/paths-filter@main - id: changes - with: - token: '' - filters: | - end_to_end: - - 'go/**/*.go' - - 'test.go' - - 'Makefile' - - 'build.env' - - 'go.sum' - - 'go.mod' - - 'proto/*.proto' - - 'tools/**' - - 'config/**' - - 'bootstrap.sh' - - '.github/workflows/cluster_endtoend_tabletmanager_throttler_custom_config.yml' - - - name: Set up Go - if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' - uses: actions/setup-go@v4 - with: - go-version: 1.20.5 - - - name: Set up python - if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' - uses: actions/setup-python@v4 - - - name: Tune the OS - if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' - run: | - # Limit local port range to not use ports that overlap with server side - # ports that we listen on. - sudo sysctl -w net.ipv4.ip_local_port_range="22768 65535" - # Increase the asynchronous non-blocking I/O. More information at https://dev.mysql.com/doc/refman/5.7/en/innodb-parameters.html#sysvar_innodb_use_native_aio - echo "fs.aio-max-nr = 1048576" | sudo tee -a /etc/sysctl.conf - sudo sysctl -p /etc/sysctl.conf - - - name: Get dependencies - if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' - run: | - - # Get key to latest MySQL repo - sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 467B942D3A79BD29 - # Setup MySQL 8.0 - wget -c https://dev.mysql.com/get/mysql-apt-config_0.8.24-1_all.deb - echo mysql-apt-config mysql-apt-config/select-server select mysql-8.0 | sudo debconf-set-selections - sudo DEBIAN_FRONTEND="noninteractive" dpkg -i mysql-apt-config* - sudo apt-get update - # Install everything else we need, and configure - sudo apt-get install -y mysql-server mysql-client make unzip g++ etcd curl git wget eatmydata xz-utils libncurses5 - - sudo service mysql stop - sudo service etcd stop - sudo ln -s /etc/apparmor.d/usr.sbin.mysqld /etc/apparmor.d/disable/ - sudo apparmor_parser -R /etc/apparmor.d/usr.sbin.mysqld - go mod download - - # install JUnit report formatter - go install github.com/vitessio/go-junit-report@HEAD - - - name: Setup launchable dependencies - if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' && github.base_ref == 'main' - run: | - # Get Launchable CLI installed. If you can, make it a part of the builder image to speed things up - pip3 install --user launchable~=1.0 > /dev/null - - # verify that launchable setup is all correct. - launchable verify || true - - # Tell Launchable about the build you are producing and testing - launchable record build --name "$GITHUB_RUN_ID" --no-commit-collection --source . - - - name: Run cluster endtoend test - if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' - timeout-minutes: 45 - run: | - # We set the VTDATAROOT to the /tmp folder to reduce the file path of mysql.sock file - # which musn't be more than 107 characters long. - export VTDATAROOT="/tmp/" - source build.env - - set -x - - # run the tests however you normally do, then produce a JUnit XML file - eatmydata -- go run test.go -docker=false -follow -shard tabletmanager_throttler_custom_config | tee -a output.txt | go-junit-report -set-exit-code > report.xml - - - name: Print test output and Record test result in launchable - if: steps.skip-workflow.outputs.skip-workflow == 'false' && steps.changes.outputs.end_to_end == 'true' && always() - run: | - # send recorded tests to launchable - launchable record tests --build "$GITHUB_RUN_ID" go-test . || true - - # print test output - cat output.txt diff --git a/changelog/18.0/18.0.0/summary.md b/changelog/18.0/18.0.0/summary.md index fe670844041..c215e975f6f 100644 --- a/changelog/18.0/18.0.0/summary.md +++ b/changelog/18.0/18.0.0/summary.md @@ -9,6 +9,7 @@ - **[VTAdmin](#vtadmin)** - [Updated to node v18.16.0](#update-node) - **[Deprecations and Deletions](#deprecations-and-deletions)** + - [Deprecated Flags](#deprecated-flags) - [Deleted `k8stopo`](#deleted-k8stopo) - [Deleted `vtgr`](#deleted-vtgr) - **[New stats](#new-stats)** @@ -39,6 +40,17 @@ here https://nodejs.org/en/blog/release/v18.16.0. ### Deprecations and Deletions +#### Deprecated Command Line Flags + +Throttler related `vttablet` flags: + +- `--enable-lag-throttler` is now removed after being deprecated in `v17.0` +- `--throttle_threshold` is deprecated and will be removed in `v19.0` +- `--throttle_metrics_query` is deprecated and will be removed in `v19.0` +- `--throttle_metrics_threshold` is deprecated and will be removed in `v19.0` +- `--throttle_check_as_check_self` is deprecated and will be removed in `v19.0` +- `--throttler-config-via-topo` is deprecated after asummed `true` in `v17.0`. It will be removed in a future version. + #### Deleted `k8stopo` The `k8stopo` has been deprecated in Vitess 17, also see https://github.com/vitessio/vitess/issues/13298. With Vitess 18 diff --git a/examples/common/scripts/vttablet-up.sh b/examples/common/scripts/vttablet-up.sh index d3ce1ae06ba..51baeec45be 100755 --- a/examples/common/scripts/vttablet-up.sh +++ b/examples/common/scripts/vttablet-up.sh @@ -54,7 +54,10 @@ vttablet \ --service_map 'grpc-queryservice,grpc-tabletmanager,grpc-updatestream' \ --pid_file $VTDATAROOT/$tablet_dir/vttablet.pid \ --vtctld_addr http://$hostname:$vtctld_web_port/ \ - --throttler-config-via-topo --heartbeat_enable --heartbeat_interval=250ms --heartbeat_on_demand_duration=5s \ + --disable_active_reparents \ + --heartbeat_enable \ + --heartbeat_interval=250ms \ + --heartbeat_on_demand_duration=5s \ > $VTDATAROOT/$tablet_dir/vttablet.out 2>&1 & # Block waiting for the tablet to be listening diff --git a/go/flags/endtoend/vttablet.txt b/go/flags/endtoend/vttablet.txt index bfd4dc182f3..89fcc00f350 100644 --- a/go/flags/endtoend/vttablet.txt +++ b/go/flags/endtoend/vttablet.txt @@ -99,14 +99,12 @@ Usage of vttablet: --emit_stats If set, emit stats to push-based monitoring and stats backends --enable-consolidator Synonym to -enable_consolidator (default true) --enable-consolidator-replicas Synonym to -enable_consolidator_replicas - --enable-lag-throttler Synonym to -enable_lag_throttler --enable-per-workload-table-metrics If true, query counts and query error metrics include a label that identifies the workload --enable-tx-throttler Synonym to -enable_tx_throttler --enable_consolidator This option enables the query consolidator. (default true) --enable_consolidator_replicas This option enables the query consolidator only on replicas. --enable_hot_row_protection If true, incoming transactions for the same row (range) will be queued and cannot consume all txpool slots. --enable_hot_row_protection_dry_run If true, hot row protection is not enforced but logs if transactions would have been queued. - --enable_lag_throttler If true, vttablet will run a throttler service, and will implicitly enable heartbeats --enable_replication_reporter Use polling to track replication lag. --enable_transaction_limit If true, limit on number of transactions open at the same time will be enforced for all users. User trying to open a new transaction after exhausting their limit will receive an error immediately, regardless of whether there are available slots or not. --enable_transaction_limit_dry_run If true, limit on number of transactions open at the same time will be tracked for all users, but not enforced. @@ -313,12 +311,7 @@ Usage of vttablet: --tablet_manager_grpc_server_name string the server name to use to validate server certificate --tablet_manager_protocol string Protocol to use to make tabletmanager RPCs to vttablets. (default "grpc") --tablet_protocol string Protocol to use to make queryservice RPCs to vttablets. (default "grpc") - --throttle_check_as_check_self Should throttler/check return a throttler/check-self result (changes throttler behavior for writes) - --throttle_metrics_query SELECT Override default heartbeat/lag metric. Use either SELECT (must return single row, single value) or `SHOW GLOBAL ... LIKE ...` queries. Set -throttle_metrics_threshold respectively. - --throttle_metrics_threshold float Override default throttle threshold, respective to --throttle_metrics_query (default 1.7976931348623157e+308) --throttle_tablet_types string Comma separated VTTablet types to be considered by the throttler. default: 'replica'. example: 'replica,rdonly'. 'replica' aways implicitly included (default "replica") - --throttle_threshold duration Replication lag threshold for default lag throttling (default 1s) - --throttler-config-via-topo When 'true', read config from topo service and ignore throttle_threshold, throttle_metrics_threshold, throttle_metrics_query, throttle_check_as_check_self (default true) --topo_consul_lock_delay duration LockDelay for consul session. (default 15s) --topo_consul_lock_session_checks string List of checks for consul session. (default "serfHealth") --topo_consul_lock_session_ttl string TTL for consul session. diff --git a/go/test/endtoend/tabletmanager/throttler/throttler_test.go b/go/test/endtoend/tabletmanager/throttler/throttler_test.go deleted file mode 100644 index 5ca4bc32a87..00000000000 --- a/go/test/endtoend/tabletmanager/throttler/throttler_test.go +++ /dev/null @@ -1,319 +0,0 @@ -/* -Copyright 2020 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -package throttler - -import ( - "context" - "flag" - "fmt" - "io" - "net/http" - "os" - "testing" - "time" - - "vitess.io/vitess/go/vt/vttablet/tabletserver/throttle/base" - - "vitess.io/vitess/go/test/endtoend/cluster" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -var ( - clusterInstance *cluster.LocalProcessCluster - primaryTablet *cluster.Vttablet - replicaTablet *cluster.Vttablet - hostname = "localhost" - keyspaceName = "ks" - cell = "zone1" - sqlSchema = ` - create table t1( - id bigint, - value varchar(16), - primary key(id) - ) Engine=InnoDB; -` - - vSchema = ` - { - "sharded": true, - "vindexes": { - "hash": { - "type": "hash" - } - }, - "tables": { - "t1": { - "column_vindexes": [ - { - "column": "id", - "name": "hash" - } - ] - } - } - }` - - httpClient = base.SetupHTTPClient(time.Second) - throttledAppsAPIPath = "throttler/throttled-apps" - checkAPIPath = "throttler/check" - checkSelfAPIPath = "throttler/check-self" -) - -const ( - throttlerThreshold = 1 * time.Second // standard, tight threshold - onDemandHeartbeatDuration = 5 * time.Second - applyConfigWait = 15 * time.Second // time after which we're sure the throttler has refreshed config and tablets -) - -func TestMain(m *testing.M) { - defer cluster.PanicHandler(nil) - flag.Parse() - - exitCode := func() int { - clusterInstance = cluster.NewCluster(cell, hostname) - defer clusterInstance.Teardown() - - // Start topo server - err := clusterInstance.StartTopo() - if err != nil { - return 1 - } - - // Set extra tablet args for lock timeout - clusterInstance.VtTabletExtraArgs = []string{ - "--throttler-config-via-topo=false", - "--lock_tables_timeout", "5s", - "--watch_replication_stream", - "--enable_replication_reporter", - "--enable-lag-throttler", - "--throttle_threshold", throttlerThreshold.String(), - "--heartbeat_interval", "250ms", - "--heartbeat_on_demand_duration", onDemandHeartbeatDuration.String(), - "--disable_active_reparents", - } - - // Start keyspace - keyspace := &cluster.Keyspace{ - Name: keyspaceName, - SchemaSQL: sqlSchema, - VSchema: vSchema, - } - - if err = clusterInstance.StartUnshardedKeyspace(*keyspace, 1, false); err != nil { - return 1 - } - - // Collect table paths and ports - tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets - for _, tablet := range tablets { - if tablet.Type == "primary" { - primaryTablet = tablet - } else if tablet.Type != "rdonly" { - replicaTablet = tablet - } - } - - return m.Run() - }() - os.Exit(exitCode) -} - -func throttledApps(tablet *cluster.Vttablet) (resp *http.Response, respBody string, err error) { - resp, err = httpClient.Get(fmt.Sprintf("http://localhost:%d/%s", tablet.HTTPPort, throttledAppsAPIPath)) - if err != nil { - return resp, respBody, err - } - b, err := io.ReadAll(resp.Body) - if err != nil { - return resp, respBody, err - } - respBody = string(b) - return resp, respBody, err -} - -func throttleCheck(tablet *cluster.Vttablet, skipRequestHeartbeats bool) (*http.Response, error) { - return httpClient.Get(fmt.Sprintf("http://localhost:%d/%s?s=%t", tablet.HTTPPort, checkAPIPath, skipRequestHeartbeats)) -} - -func throttleCheckSelf(tablet *cluster.Vttablet) (*http.Response, error) { - return httpClient.Head(fmt.Sprintf("http://localhost:%d/%s", tablet.HTTPPort, checkSelfAPIPath)) -} - -func warmUpHeartbeat(t *testing.T) (respStatus int) { - // because we run with -heartbeat_on_demand_duration=5s, the heartbeat is "cold" right now. - // Let's warm it up. - resp, err := throttleCheck(primaryTablet, false) - require.NoError(t, err) - defer resp.Body.Close() - time.Sleep(time.Second) - return resp.StatusCode -} - -// waitForThrottleCheckStatus waits for the tablet to return the provided HTTP code in a throttle check -func waitForThrottleCheckStatus(t *testing.T, tablet *cluster.Vttablet, wantCode int) { - _ = warmUpHeartbeat(t) - ctx, cancel := context.WithTimeout(context.Background(), onDemandHeartbeatDuration+applyConfigWait) - defer cancel() - - for { - resp, err := throttleCheck(tablet, true) - require.NoError(t, err) - - if wantCode == resp.StatusCode { - // Wait for any cached check values to be cleared and the new - // status value to be in effect everywhere before returning. - resp.Body.Close() - return - } - select { - case <-ctx.Done(): - b, err := io.ReadAll(resp.Body) - require.NoError(t, err) - resp.Body.Close() - - assert.Equal(t, wantCode, resp.StatusCode, "body: %v", string(b)) - return - default: - resp.Body.Close() - time.Sleep(time.Second) - } - } -} - -func TestThrottlerAfterMetricsCollected(t *testing.T) { - defer cluster.PanicHandler(t) - - // We run with on-demand heartbeats. Immediately as the tablet manager opens, it sends a one-time - // request for heartbeats, which means the throttler is able to collect initial "good" data. - // After a few seconds, the heartbeat lease terminates. We wait for that. - // {"StatusCode":429,"Value":4.864921,"Threshold":1,"Message":"Threshold exceeded"} - t.Run("expect push back once initial heartbeat lease terminates", func(t *testing.T) { - time.Sleep(onDemandHeartbeatDuration) - waitForThrottleCheckStatus(t, primaryTablet, http.StatusTooManyRequests) - }) - t.Run("requesting heartbeats", func(t *testing.T) { - respStatus := warmUpHeartbeat(t) - assert.NotEqual(t, http.StatusOK, respStatus) - }) - t.Run("expect OK once heartbeats lease renewed", func(t *testing.T) { - time.Sleep(1 * time.Second) - resp, err := throttleCheck(primaryTablet, false) - require.NoError(t, err) - defer resp.Body.Close() - assert.Equal(t, http.StatusOK, resp.StatusCode) - }) - t.Run("expect OK once heartbeats lease renewed, still", func(t *testing.T) { - time.Sleep(1 * time.Second) - resp, err := throttleCheck(primaryTablet, false) - require.NoError(t, err) - defer resp.Body.Close() - assert.Equal(t, http.StatusOK, resp.StatusCode) - }) - t.Run("validate throttled-apps", func(t *testing.T) { - resp, body, err := throttledApps(primaryTablet) - require.NoError(t, err) - defer resp.Body.Close() - assert.Equal(t, http.StatusOK, resp.StatusCode) - assert.Contains(t, body, "always-throttled-app") - }) - t.Run("validate check-self", func(t *testing.T) { - resp, err := throttleCheckSelf(primaryTablet) - require.NoError(t, err) - defer resp.Body.Close() - assert.Equal(t, http.StatusOK, resp.StatusCode) - }) - t.Run("validate check-self, again", func(t *testing.T) { - resp, err := throttleCheckSelf(replicaTablet) - require.NoError(t, err) - defer resp.Body.Close() - assert.Equal(t, http.StatusOK, resp.StatusCode) - }) -} - -func TestLag(t *testing.T) { - defer cluster.PanicHandler(t) - // Stop VTOrc because we want to stop replication to increase lag. - // We don't want VTOrc to fix this. - clusterInstance.DisableVTOrcRecoveries(t) - defer clusterInstance.EnableVTOrcRecoveries(t) - - t.Run("stopping replication", func(t *testing.T) { - err := clusterInstance.VtctlclientProcess.ExecuteCommand("StopReplication", replicaTablet.Alias) - assert.NoError(t, err) - }) - t.Run("accumulating lag, expecting throttler push back", func(t *testing.T) { - time.Sleep(2 * throttlerThreshold) - - resp, err := throttleCheck(primaryTablet, false) - require.NoError(t, err) - defer resp.Body.Close() - assert.Equal(t, http.StatusTooManyRequests, resp.StatusCode) - }) - t.Run("primary self-check should still be fine", func(t *testing.T) { - resp, err := throttleCheckSelf(primaryTablet) - require.NoError(t, err) - defer resp.Body.Close() - // self (on primary) is unaffected by replication lag - assert.Equal(t, http.StatusOK, resp.StatusCode) - }) - t.Run("replica self-check should show error", func(t *testing.T) { - resp, err := throttleCheckSelf(replicaTablet) - require.NoError(t, err) - defer resp.Body.Close() - assert.Equal(t, http.StatusTooManyRequests, resp.StatusCode) - }) - t.Run("starting replication", func(t *testing.T) { - err := clusterInstance.VtctlclientProcess.ExecuteCommand("StartReplication", replicaTablet.Alias) - assert.NoError(t, err) - }) - t.Run("expecting replication to catch up and throttler check to return OK", func(t *testing.T) { - waitForThrottleCheckStatus(t, primaryTablet, http.StatusOK) - }) - t.Run("primary self-check should be fine", func(t *testing.T) { - resp, err := throttleCheckSelf(primaryTablet) - require.NoError(t, err) - defer resp.Body.Close() - // self (on primary) is unaffected by replication lag - assert.Equal(t, http.StatusOK, resp.StatusCode) - }) - t.Run("replica self-check should be fine", func(t *testing.T) { - resp, err := throttleCheckSelf(replicaTablet) - require.NoError(t, err) - defer resp.Body.Close() - assert.Equal(t, http.StatusOK, resp.StatusCode) - }) -} - -func TestNoReplicas(t *testing.T) { - defer cluster.PanicHandler(t) - t.Run("changing replica to RDONLY", func(t *testing.T) { - err := clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", replicaTablet.Alias, "RDONLY") - assert.NoError(t, err) - - // This makes no REPLICA servers available. We expect something like: - // {"StatusCode":200,"Value":0,"Threshold":1,"Message":""} - waitForThrottleCheckStatus(t, primaryTablet, http.StatusOK) - }) - t.Run("restoring to REPLICA", func(t *testing.T) { - - err := clusterInstance.VtctlclientProcess.ExecuteCommand("ChangeTabletType", replicaTablet.Alias, "REPLICA") - assert.NoError(t, err) - - waitForThrottleCheckStatus(t, primaryTablet, http.StatusOK) - }) -} diff --git a/go/test/endtoend/tabletmanager/throttler_custom_config/throttler_test.go b/go/test/endtoend/tabletmanager/throttler_custom_config/throttler_test.go deleted file mode 100644 index e173384eb62..00000000000 --- a/go/test/endtoend/tabletmanager/throttler_custom_config/throttler_test.go +++ /dev/null @@ -1,264 +0,0 @@ -/* -Copyright 2020 The Vitess Authors. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ -package throttler - -import ( - "context" - "flag" - "fmt" - "net/http" - "os" - "sync" - "testing" - "time" - - "vitess.io/vitess/go/mysql" - "vitess.io/vitess/go/sqltypes" - "vitess.io/vitess/go/vt/vttablet/tabletserver/throttle/base" - - "vitess.io/vitess/go/test/endtoend/cluster" - - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -var ( - clusterInstance *cluster.LocalProcessCluster - primaryTablet *cluster.Vttablet - replicaTablet *cluster.Vttablet - hostname = "localhost" - keyspaceName = "ks" - cell = "zone1" - sqlSchema = ` - create table t1( - id bigint, - value varchar(16), - primary key(id) - ) Engine=InnoDB; -` - - vSchema = ` - { - "sharded": true, - "vindexes": { - "hash": { - "type": "hash" - } - }, - "tables": { - "t1": { - "column_vindexes": [ - { - "column": "id", - "name": "hash" - } - ] - } - } - }` - - httpClient = base.SetupHTTPClient(time.Second) - checkAPIPath = "throttler/check" - checkSelfAPIPath = "throttler/check-self" - vtParams mysql.ConnParams -) - -const ( - testThreshold = 5 - applyConfigWait = 15 * time.Second // time after which we're sure the throttler has refreshed config and tablets - statusWaitTimeout = 30 * time.Second -) - -func TestMain(m *testing.M) { - defer cluster.PanicHandler(nil) - flag.Parse() - - exitCode := func() int { - clusterInstance = cluster.NewCluster(cell, hostname) - defer clusterInstance.Teardown() - - // Start topo server - err := clusterInstance.StartTopo() - if err != nil { - return 1 - } - - // Set extra tablet args for lock timeout - clusterInstance.VtTabletExtraArgs = []string{ - "--throttler-config-via-topo=false", - "--lock_tables_timeout", "5s", - "--watch_replication_stream", - "--enable_replication_reporter", - "--enable-lag-throttler", - "--throttle_metrics_query", "show global status like 'threads_running'", - "--throttle_metrics_threshold", fmt.Sprintf("%d", testThreshold), - "--throttle_check_as_check_self", - "--heartbeat_interval", "250ms", - } - - // Start keyspace - keyspace := &cluster.Keyspace{ - Name: keyspaceName, - SchemaSQL: sqlSchema, - VSchema: vSchema, - } - - if err = clusterInstance.StartUnshardedKeyspace(*keyspace, 0, false); err != nil { - return 1 - } - - // Collect table paths and ports - tablets := clusterInstance.Keyspaces[0].Shards[0].Vttablets - for _, tablet := range tablets { - if tablet.Type == "primary" { - primaryTablet = tablet - } else if tablet.Type != "rdonly" { - replicaTablet = tablet - } - } - - vtgateInstance := clusterInstance.NewVtgateInstance() - // Start vtgate - if err := vtgateInstance.Setup(); err != nil { - return 1 - } - // ensure it is torn down during cluster TearDown - clusterInstance.VtgateProcess = *vtgateInstance - vtParams = mysql.ConnParams{ - Host: clusterInstance.Hostname, - Port: clusterInstance.VtgateMySQLPort, - } - - return m.Run() - }() - os.Exit(exitCode) -} - -func throttleCheck(tablet *cluster.Vttablet) (*http.Response, error) { - resp, err := httpClient.Get(fmt.Sprintf("http://localhost:%d/%s", tablet.HTTPPort, checkAPIPath)) - return resp, err -} - -func waitForThrottlerStatus(tablet *cluster.Vttablet, status int) error { - ctx, cancel := context.WithTimeout(context.Background(), statusWaitTimeout) - defer cancel() - tkr := time.NewTicker(100 * time.Millisecond) - defer tkr.Stop() - - for { - resp, _ := throttleCheck(tablet) - seenStatus := resp.StatusCode - resp.Body.Close() - if seenStatus == status { - return nil - } - select { - case <-ctx.Done(): - return fmt.Errorf("timed out waiting for expected throttler status %d after %v; last seen value: %d", - status, statusWaitTimeout, seenStatus) - case <-tkr.C: - } - } -} - -func throttleCheckSelf(tablet *cluster.Vttablet) (*http.Response, error) { - return httpClient.Head(fmt.Sprintf("http://localhost:%d/%s", tablet.HTTPPort, checkSelfAPIPath)) -} - -func TestThrottlerThresholdOK(t *testing.T) { - defer cluster.PanicHandler(t) - - t.Run("immediately", func(t *testing.T) { - // The tablet throttler can still be initializing so we wait for - // the status to be OK. - err := waitForThrottlerStatus(primaryTablet, http.StatusOK) - require.NoError(t, err) - }) - t.Run("after long wait", func(t *testing.T) { - time.Sleep(applyConfigWait) - resp, err := throttleCheck(primaryTablet) - require.NoError(t, err) - defer resp.Body.Close() - assert.Equal(t, http.StatusOK, resp.StatusCode) - }) -} - -func TestThreadsRunning(t *testing.T) { - defer cluster.PanicHandler(t) - - sleepDuration := 10 * time.Second - var wg sync.WaitGroup - for i := 0; i < testThreshold; i++ { - // generate different Sleep() calls, all at minimum sleepDuration - wg.Add(1) - go func(i int) { - defer wg.Done() - vtgateExec(t, fmt.Sprintf("select sleep(%d)", int(sleepDuration.Seconds())+i), "") - }(i) - } - t.Run("exceeds threshold", func(t *testing.T) { - time.Sleep(sleepDuration / 2) - // by this time we will have testThreshold+1 threads_running, and we should hit the threshold - // {"StatusCode":429,"Value":2,"Threshold":2,"Message":"Threshold exceeded"} - { - resp, err := throttleCheck(primaryTablet) - require.NoError(t, err) - defer resp.Body.Close() - assert.Equal(t, http.StatusTooManyRequests, resp.StatusCode) - } - { - resp, err := throttleCheckSelf(primaryTablet) - require.NoError(t, err) - defer resp.Body.Close() - assert.Equal(t, http.StatusTooManyRequests, resp.StatusCode) - } - }) - t.Run("wait for queries to terminate", func(t *testing.T) { - wg.Wait() - }) - t.Run("restored below threshold", func(t *testing.T) { - { - resp, err := throttleCheck(primaryTablet) - require.NoError(t, err) - defer resp.Body.Close() - assert.Equal(t, http.StatusOK, resp.StatusCode) - } - { - resp, err := throttleCheckSelf(primaryTablet) - require.NoError(t, err) - defer resp.Body.Close() - assert.Equal(t, http.StatusOK, resp.StatusCode) - } - }) -} - -func vtgateExec(t *testing.T, query string, expectError string) *sqltypes.Result { - t.Helper() - - ctx := context.Background() - conn, err := mysql.Connect(ctx, &vtParams) - require.NoError(t, err) - defer conn.Close() - - qr, err := conn.ExecuteFetch(query, 1000, true) - if expectError == "" { - require.NoError(t, err) - } else { - require.Error(t, err, "error should not be nil") - assert.Contains(t, err.Error(), expectError, "Unexpected error") - } - return qr -} diff --git a/go/test/endtoend/throttler/util.go b/go/test/endtoend/throttler/util.go index 7da87bd9047..7a2f13cca77 100644 --- a/go/test/endtoend/throttler/util.go +++ b/go/test/endtoend/throttler/util.go @@ -44,7 +44,7 @@ type Config struct { const ( DefaultQuery = "select unix_timestamp(now(6))-max(ts/1000000000) as replication_lag from _vt.heartbeat" - DefaultThreshold = 1 * time.Second + DefaultThreshold = 5 * time.Second ConfigTimeout = 60 * time.Second ) diff --git a/go/vt/vttablet/endtoend/vstreamer_test.go b/go/vt/vttablet/endtoend/vstreamer_test.go index 645a99cfc2b..673f5801e45 100644 --- a/go/vt/vttablet/endtoend/vstreamer_test.go +++ b/go/vt/vttablet/endtoend/vstreamer_test.go @@ -59,8 +59,6 @@ func TestSchemaVersioning(t *testing.T) { tsv.EnableHistorian(false) tsv.SetTracking(false) tsv.EnableHeartbeat(false) - tsv.EnableThrottler(false) - defer tsv.EnableThrottler(true) defer tsv.EnableHeartbeat(true) defer tsv.EnableHistorian(true) defer tsv.SetTracking(true) diff --git a/go/vt/vttablet/onlineddl/executor.go b/go/vt/vttablet/onlineddl/executor.go index 1048734f6e6..37ff614479f 100644 --- a/go/vt/vttablet/onlineddl/executor.go +++ b/go/vt/vttablet/onlineddl/executor.go @@ -3481,7 +3481,6 @@ func (e *Executor) reviewRunningMigrations(ctx context.Context) (countRunnning i } } - var throttlerOnce sync.Once r, err := e.execQuery(ctx, sqlSelectRunningMigrations) if err != nil { return countRunnning, cancellable, err @@ -3591,26 +3590,6 @@ func (e *Executor) reviewRunningMigrations(ctx context.Context) (countRunnning i return countRunnning, cancellable, err } } - go throttlerOnce.Do(func() { - if !e.lagThrottler.IsRunning() { - return - } - // Self healing: in the following scenario: - // - a vitess migration - // - with on demand heartbeats - // - the streamer running on a replica - // - the streamer was throttled for long enough - // - then vplayer and vcopier are locked, waiting for the streamer to do something - // - since they are blocked, they're not running throttler checks - // - since streamer runs on replica, it only checks that replica - // - therefore no one asking for on-demand heartbeats - // - then, if the conditions for the streamer's throttling are done, the streamer then thinks there's replication lag, with nothing to remediate it. - // - it's a deadlock. - // And so, once per reviewRunningMigrations(), and assuming there _are_ running migrations, we ensure to hit a throttler check. This will kick - // on-demand heartbeats, unlocking the deadlock. - e.lagThrottler.CheckByType(ctx, throttlerapp.OnlineDDLName.String(), "", throttleCheckFlags, throttle.ThrottleCheckPrimaryWrite) - }) - } } case schema.DDLStrategyPTOSC: diff --git a/go/vt/vttablet/tabletserver/repltracker/repltracker.go b/go/vt/vttablet/tabletserver/repltracker/repltracker.go index 3d6359ed902..5ab44eb774e 100644 --- a/go/vt/vttablet/tabletserver/repltracker/repltracker.go +++ b/go/vt/vttablet/tabletserver/repltracker/repltracker.go @@ -66,7 +66,7 @@ type ReplTracker struct { func NewReplTracker(env tabletenv.Env, alias *topodatapb.TabletAlias) *ReplTracker { return &ReplTracker{ mode: env.Config().ReplicationTracker.Mode, - forceHeartbeat: env.Config().EnableLagThrottler || env.Config().ReplicationTracker.HeartbeatOnDemandSeconds.Get() > 0, + forceHeartbeat: env.Config().ReplicationTracker.HeartbeatOnDemandSeconds.Get() > 0, hw: newHeartbeatWriter(env, alias), hr: newHeartbeatReader(env), poller: &poller{}, diff --git a/go/vt/vttablet/tabletserver/repltracker/writer.go b/go/vt/vttablet/tabletserver/repltracker/writer.go index 310ee80021a..bbd28aa557e 100644 --- a/go/vt/vttablet/tabletserver/repltracker/writer.go +++ b/go/vt/vttablet/tabletserver/repltracker/writer.go @@ -74,7 +74,7 @@ func newHeartbeatWriter(env tabletenv.Env, alias *topodatapb.TabletAlias) *heart config := env.Config() // config.EnableLagThrottler is a feature flag for the throttler; if throttler runs, then heartbeat must also run - if config.ReplicationTracker.Mode != tabletenv.Heartbeat && !config.EnableLagThrottler && config.ReplicationTracker.HeartbeatOnDemandSeconds.Get() == 0 { + if config.ReplicationTracker.Mode != tabletenv.Heartbeat && config.ReplicationTracker.HeartbeatOnDemandSeconds.Get() == 0 { return &heartbeatWriter{} } heartbeatInterval := config.ReplicationTracker.HeartbeatIntervalSeconds.Get() diff --git a/go/vt/vttablet/tabletserver/tabletenv/config.go b/go/vt/vttablet/tabletserver/tabletenv/config.go index e26c69b7eb4..1be90478be2 100644 --- a/go/vt/vttablet/tabletserver/tabletenv/config.go +++ b/go/vt/vttablet/tabletserver/tabletenv/config.go @@ -203,7 +203,6 @@ func registerTabletEnvFlags(fs *pflag.FlagSet) { fs.BoolVar(&enableHeartbeat, "heartbeat_enable", false, "If true, vttablet records (if master) or checks (if replica) the current time of a replication heartbeat in the sidecar database's heartbeat table. The result is used to inform the serving state of the vttablet via healthchecks.") fs.DurationVar(&heartbeatInterval, "heartbeat_interval", 1*time.Second, "How frequently to read and write replication heartbeat.") fs.DurationVar(&heartbeatOnDemandDuration, "heartbeat_on_demand_duration", 0, "If non-zero, heartbeats are only written upon consumer request, and only run for up to given duration following the request. Frequent requests can keep the heartbeat running consistently; when requests are infrequent heartbeat may completely stop between requests") - flagutil.DualFormatBoolVar(fs, ¤tConfig.EnableLagThrottler, "enable_lag_throttler", defaultConfig.EnableLagThrottler, "If true, vttablet will run a throttler service, and will implicitly enable heartbeats") fs.BoolVar(¤tConfig.EnforceStrictTransTables, "enforce_strict_trans_tables", defaultConfig.EnforceStrictTransTables, "If true, vttablet requires MySQL to run with STRICT_TRANS_TABLES or STRICT_ALL_TABLES on. It is recommended to not turn this flag off. Otherwise MySQL may alter your supplied values before saving them to the database.") flagutil.DualFormatBoolVar(fs, &enableConsolidator, "enable_consolidator", true, "This option enables the query consolidator.") @@ -366,8 +365,7 @@ type TabletConfig struct { TxThrottlerDefaultPriority int `json:"-"` TxThrottlerTabletTypes *topoproto.TabletTypeListFlag `json:"-"` - EnableLagThrottler bool `json:"-"` - EnableTableGC bool `json:"-"` // can be turned off programmatically by tests + EnableTableGC bool `json:"-"` // can be turned off programmatically by tests TransactionLimitConfig `json:"-"` @@ -835,8 +833,6 @@ var defaultConfig = TabletConfig{ TxThrottlerDefaultPriority: sqlparser.MaxPriorityValue, // This leads to all queries being candidates to throttle TxThrottlerTabletTypes: &topoproto.TabletTypeListFlag{topodatapb.TabletType_REPLICA}, - EnableLagThrottler: false, // Feature flag; to switch to 'true' at some stage in the future - TransactionLimitConfig: defaultTransactionLimitConfig(), EnforceStrictTransTables: true, diff --git a/go/vt/vttablet/tabletserver/tabletserver.go b/go/vt/vttablet/tabletserver/tabletserver.go index 9c01ec1ba1e..795546aac5f 100644 --- a/go/vt/vttablet/tabletserver/tabletserver.go +++ b/go/vt/vttablet/tabletserver/tabletserver.go @@ -1890,13 +1890,6 @@ func (tsv *TabletServer) EnableHeartbeat(enabled bool) { tsv.rt.EnableHeartbeat(enabled) } -// EnableThrottler forces throttler to be on or off. -// When throttler is off, it responds to all check requests with HTTP 200 OK -// Only to be used for testing. -func (tsv *TabletServer) EnableThrottler(enabled bool) { - tsv.Config().EnableLagThrottler = enabled -} - // SetTracking forces tracking to be on or off. // Only to be used for testing. func (tsv *TabletServer) SetTracking(enabled bool) { diff --git a/go/vt/vttablet/tabletserver/throttle/throttler.go b/go/vt/vttablet/tabletserver/throttle/throttler.go index 6b65c09f9b8..ce4c265fa78 100644 --- a/go/vt/vttablet/tabletserver/throttle/throttler.go +++ b/go/vt/vttablet/tabletserver/throttle/throttler.go @@ -68,12 +68,8 @@ const ( var ( // flag vars - throttleThreshold = 1 * time.Second - throttleTabletTypes = "replica" - throttleMetricQuery string - throttleMetricThreshold = math.MaxFloat64 - throttlerCheckAsCheckSelf = false - throttlerConfigViaTopo = true + defaultThrottleLagThreshold = 5 * time.Second + throttleTabletTypes = "replica" ) func init() { @@ -84,11 +80,11 @@ func init() { func registerThrottlerFlags(fs *pflag.FlagSet) { fs.StringVar(&throttleTabletTypes, "throttle_tablet_types", throttleTabletTypes, "Comma separated VTTablet types to be considered by the throttler. default: 'replica'. example: 'replica,rdonly'. 'replica' aways implicitly included") - fs.DurationVar(&throttleThreshold, "throttle_threshold", throttleThreshold, "Replication lag threshold for default lag throttling") - fs.StringVar(&throttleMetricQuery, "throttle_metrics_query", throttleMetricQuery, "Override default heartbeat/lag metric. Use either `SELECT` (must return single row, single value) or `SHOW GLOBAL ... LIKE ...` queries. Set -throttle_metrics_threshold respectively.") - fs.Float64Var(&throttleMetricThreshold, "throttle_metrics_threshold", throttleMetricThreshold, "Override default throttle threshold, respective to --throttle_metrics_query") - fs.BoolVar(&throttlerCheckAsCheckSelf, "throttle_check_as_check_self", throttlerCheckAsCheckSelf, "Should throttler/check return a throttler/check-self result (changes throttler behavior for writes)") - fs.BoolVar(&throttlerConfigViaTopo, "throttler-config-via-topo", throttlerConfigViaTopo, "When 'true', read config from topo service and ignore throttle_threshold, throttle_metrics_threshold, throttle_metrics_query, throttle_check_as_check_self") + fs.MarkDeprecated("throttle_threshold", "Replication lag threshold for default lag throttling") + fs.MarkDeprecated("throttle_metrics_query", "Override default heartbeat/lag metric. Use either `SELECT` (must return single row, single value) or `SHOW GLOBAL ... LIKE ...` queries. Set -throttle_metrics_threshold respectively.") + fs.MarkDeprecated("throttle_metrics_threshold", "Override default throttle threshold, respective to --throttle_metrics_query") + fs.MarkDeprecated("throttle_check_as_check_self", "Should throttler/check return a throttler/check-self result (changes throttler behavior for writes)") + fs.MarkDeprecated("throttler-config-via-topo", "Assumed to be 'true'") } var ( @@ -145,6 +141,7 @@ type Throttler struct { metricsQuery atomic.Value MetricsThreshold atomic.Uint64 + checkAsCheckSelf atomic.Bool mysqlClusterThresholds *cache.Cache aggregatedMetrics *cache.Cache @@ -216,10 +213,7 @@ func NewThrottler(env tabletenv.Env, srvTopoServer srvtopo.Server, ts *topo.Serv throttler.initThrottleTabletTypes() throttler.check = NewThrottlerCheck(throttler) - throttler.StoreMetricsThreshold(throttleThreshold.Seconds()) //default - if throttleMetricThreshold != math.MaxFloat64 { - throttler.StoreMetricsThreshold(throttleMetricThreshold) // override - } + throttler.StoreMetricsThreshold(defaultThrottleLagThreshold.Seconds()) //default return throttler } @@ -302,7 +296,7 @@ func (throttler *Throttler) normalizeThrottlerConfig(throttlerConfig *topodatapb if throttlerConfig.CustomQuery == "" { // no custom query; we check replication lag if throttlerConfig.Threshold == 0 { - throttlerConfig.Threshold = throttleThreshold.Seconds() + throttlerConfig.Threshold = defaultThrottleLagThreshold.Seconds() } } return throttlerConfig @@ -337,9 +331,6 @@ func (throttler *Throttler) WatchSrvKeyspaceCallback(srvks *topodatapb.SrvKeyspa // This may cause the throttler to be enabled/disabled, and of course it affects the throttling query/threshold. // Note: you should be holding the initMutex when calling this function. func (throttler *Throttler) applyThrottlerConfig(ctx context.Context, throttlerConfig *topodatapb.ThrottlerConfig) { - if !throttlerConfigViaTopo { - return - } log.Infof("Throttler: applying topo config: %+v", throttlerConfig) if throttlerConfig.CustomQuery == "" { throttler.metricsQuery.Store(sqlparser.BuildParsedQuery(defaultReplicationLagQuery, sidecardb.GetIdentifier()).Query) @@ -347,7 +338,7 @@ func (throttler *Throttler) applyThrottlerConfig(ctx context.Context, throttlerC throttler.metricsQuery.Store(throttlerConfig.CustomQuery) } throttler.StoreMetricsThreshold(throttlerConfig.Threshold) - throttlerCheckAsCheckSelf = throttlerConfig.CheckAsCheckSelf + throttler.checkAsCheckSelf.Store(throttlerConfig.CheckAsCheckSelf) for _, appRule := range throttlerConfig.ThrottledApps { throttler.ThrottleApp(appRule.Name, logutil.ProtoToTime(appRule.ExpiresAt), appRule.Ratio) } @@ -428,10 +419,6 @@ func (throttler *Throttler) Disable(ctx context.Context) bool { // Open opens database pool and initializes the schema func (throttler *Throttler) Open() error { - // TODO: remove `EnableLagThrottler` in v18 - if throttler.env.Config().EnableLagThrottler { - log.Warningf("The flags `--enable_lag_throttler` and `--throttle_threshold` will be removed in v18. Use 'vtctl UpdateThrottlerConfig', see https://vitess.io/docs/17.0/reference/programs/vtctldclient/vtctldclient_updatethrottlerconfig/") - } log.Infof("Throttler: started execution of Open. Acquiring initMutex lock") throttler.initMutex.Lock() defer throttler.initMutex.Unlock() @@ -446,63 +433,51 @@ func (throttler *Throttler) Open() error { // is not known when the TabletServer is created, which in turn creates the // Throttler. throttler.metricsQuery.Store(sqlparser.BuildParsedQuery(defaultReplicationLagQuery, sidecardb.GetIdentifier()).Query) // default - if throttleMetricQuery != "" { - throttler.metricsQuery.Store(throttleMetricQuery) // override - } throttler.initConfig() throttler.pool.Open(throttler.env.Config().DB.AppWithDB(), throttler.env.Config().DB.DbaWithDB(), throttler.env.Config().DB.AppDebugWithDB()) atomic.StoreInt64(&throttler.isOpen, 1) throttler.ThrottleApp("always-throttled-app", time.Now().Add(time.Hour*24*365*10), DefaultThrottleRatio) - if throttlerConfigViaTopo { - log.Infof("Throttler: throttler-config-via-topo detected") - // We want to read throttler config from topo and apply it. - // But also, we're in an Open() function, which blocks state manager's operation, and affects - // opening of all other components. We thus read the throttler config in the background. - // However, we want to handle a situation where the read errors out. - // So we kick a loop that keeps retrying reading the config, for as long as this throttler is open. - retryReadAndApplyThrottlerConfig := func() { - retryInterval := 10 * time.Second - retryTicker := time.NewTicker(retryInterval) - defer retryTicker.Stop() - for { - if !throttler.IsOpen() { - // Throttler is not open so no need to keep retrying. - log.Errorf("Throttler.retryReadAndApplyThrottlerConfig(): throttler no longer seems to be open, exiting") - return - } + log.Infof("Throttler: throttler-config-via-topo detected") + // We want to read throttler config from topo and apply it. + // But also, we're in an Open() function, which blocks state manager's operation, and affects + // opening of all other components. We thus read the throttler config in the background. + // However, we want to handle a situation where the read errors out. + // So we kick a loop that keeps retrying reading the config, for as long as this throttler is open. + retryReadAndApplyThrottlerConfig := func() { + retryInterval := 10 * time.Second + retryTicker := time.NewTicker(retryInterval) + defer retryTicker.Stop() + for { + if !throttler.IsOpen() { + // Throttler is not open so no need to keep retrying. + log.Errorf("Throttler.retryReadAndApplyThrottlerConfig(): throttler no longer seems to be open, exiting") + return + } - throttlerConfig, err := throttler.readThrottlerConfig(ctx) - if err == nil { - log.Errorf("Throttler.retryReadAndApplyThrottlerConfig(): success reading throttler config: %+v", throttlerConfig) - // It's possible that during a retry-sleep, the throttler is closed and opened again, leading - // to two (or more) instances of this goroutine. That's not a big problem; it's fine if all - // attempt to read the throttler config; but we just want to ensure they don't step on each other - // while applying the changes. - throttler.initMutex.Lock() - defer throttler.initMutex.Unlock() - throttler.applyThrottlerConfig(ctx, throttlerConfig) // may issue an Enable - go throttler.watchSrvKeyspaceOnce.Do(func() { - // We start watching SrvKeyspace only after we know it's been created. Now is that time! - throttler.srvTopoServer.WatchSrvKeyspace(context.Background(), throttler.cell, throttler.keyspace, throttler.WatchSrvKeyspaceCallback) - }) - return - } - // It's possible, especially in CI, that this throttler opened before the SrvKeyspace entry is created in topo. - // We thus retry until the entry is found. - log.Errorf("Throttler.retryReadAndApplyThrottlerConfig(): error reading throttler config. Will retry in %v. Err=%+v", retryInterval, err) - <-retryTicker.C + throttlerConfig, err := throttler.readThrottlerConfig(ctx) + if err == nil { + log.Errorf("Throttler.retryReadAndApplyThrottlerConfig(): success reading throttler config: %+v", throttlerConfig) + // It's possible that during a retry-sleep, the throttler is closed and opened again, leading + // to two (or more) instances of this goroutine. That's not a big problem; it's fine if all + // attempt to read the throttler config; but we just want to ensure they don't step on each other + // while applying the changes. + throttler.initMutex.Lock() + defer throttler.initMutex.Unlock() + throttler.applyThrottlerConfig(ctx, throttlerConfig) // may issue an Enable + go throttler.watchSrvKeyspaceOnce.Do(func() { + // We start watching SrvKeyspace only after we know it's been created. Now is that time! + throttler.srvTopoServer.WatchSrvKeyspace(context.Background(), throttler.cell, throttler.keyspace, throttler.WatchSrvKeyspaceCallback) + }) + return } - } - go retryReadAndApplyThrottlerConfig() - } else { - // backwards-cmpatible: check for --enable-lag-throttler flag in vttablet - // this will be removed in a future version - if throttler.env.Config().EnableLagThrottler { - go throttler.Enable(ctx) + log.Errorf("Throttler.retryReadAndApplyThrottlerConfig(): error reading throttler config. Will retry in %v. Err=%+v", retryInterval, err) + <-retryTicker.C } } + go retryReadAndApplyThrottlerConfig() + return nil } @@ -1100,7 +1075,7 @@ func (throttler *Throttler) CheckByType(ctx context.Context, appName string, rem case ThrottleCheckSelf: return throttler.checkSelf(ctx, appName, remoteAddr, flags) case ThrottleCheckPrimaryWrite: - if throttlerCheckAsCheckSelf { + if throttler.checkAsCheckSelf.Load() { return throttler.checkSelf(ctx, appName, remoteAddr, flags) } return throttler.checkShard(ctx, appName, remoteAddr, flags) diff --git a/test/ci_workflow_gen.go b/test/ci_workflow_gen.go index aa5fe3219ca..f3a55ec05c0 100644 --- a/test/ci_workflow_gen.go +++ b/test/ci_workflow_gen.go @@ -92,9 +92,7 @@ var ( "vreplication_migrate_vdiff2_convert_tz", "onlineddl_revert", "onlineddl_scheduler", - "tabletmanager_throttler", "tabletmanager_throttler_topo", - "tabletmanager_throttler_custom_config", "tabletmanager_tablegc", "tabletmanager_consul", "vtgate_concurrentdml", diff --git a/test/config.json b/test/config.json index 7cfc4a2db9e..aa055386003 100644 --- a/test/config.json +++ b/test/config.json @@ -447,17 +447,6 @@ "site_test" ] }, - "tabletmanager_throttler": { - "File": "unused.go", - "Args": ["vitess.io/vitess/go/test/endtoend/tabletmanager/throttler"], - "Command": [], - "Manual": false, - "Shard": "tabletmanager_throttler", - "RetryMax": 1, - "Tags": [ - "site_test" - ] - }, "tabletmanager_throttler_topo": { "File": "unused.go", "Args": ["vitess.io/vitess/go/test/endtoend/tabletmanager/throttler_topo"], @@ -469,17 +458,6 @@ "site_test" ] }, - "tabletmanager_throttler_custom_config": { - "File": "unused.go", - "Args": ["vitess.io/vitess/go/test/endtoend/tabletmanager/throttler_custom_config"], - "Command": [], - "Manual": false, - "Shard": "tabletmanager_throttler_custom_config", - "RetryMax": 1, - "Tags": [ - "site_test" - ] - }, "tabletmanager_tablegc": { "File": "unused.go", "Args": ["vitess.io/vitess/go/test/endtoend/tabletmanager/tablegc"],