From c0b86a3f7bf9e0ca11a42bd57f536247eaec8f8c Mon Sep 17 00:00:00 2001 From: Ti Chi Robot Date: Wed, 18 Dec 2024 08:23:21 +0800 Subject: [PATCH] br: Add retry limit of connection err in fine grained backup (#57713) (#58216) close pingcap/tidb#57449 --- br/pkg/backup/client.go | 51 ++++++++++++++++++-- br/tests/br_fine_grained_disconnect/run.sh | 49 +++++++++++++++++++ br/tests/br_fine_grained_disconnect/workload | 12 +++++ 3 files changed, 108 insertions(+), 4 deletions(-) create mode 100755 br/tests/br_fine_grained_disconnect/run.sh create mode 100644 br/tests/br_fine_grained_disconnect/workload diff --git a/br/pkg/backup/client.go b/br/pkg/backup/client.go index 1d29ea760dc65..d1a41287da605 100644 --- a/br/pkg/backup/client.go +++ b/br/pkg/backup/client.go @@ -77,12 +77,30 @@ type Checksum struct { // ProgressUnit represents the unit of progress. type ProgressUnit string +type StoreBasedErr struct { + storeID uint64 + err error +} + +func (e *StoreBasedErr) Error() string { + return fmt.Sprintf("Store ID '%d': %v", e.storeID, e.err.Error()) +} + +func (e *StoreBasedErr) Unwrap() error { + return e.err +} + +func (e *StoreBasedErr) Cause() error { + return e.err +} + const ( // backupFineGrainedMaxBackoff is 1 hour. // given it begins the fine-grained backup, there must be some problems in the cluster. // We need to be more patient. backupFineGrainedMaxBackoff = 3600000 backupRetryTimes = 5 + disconnectRetryTimeout = 20000 // RangeUnit represents the progress updated counter when a range finished. RangeUnit ProgressUnit = "range" // RegionUnit represents the progress updated counter when a region finished. @@ -1122,6 +1140,7 @@ func (bc *Client) fineGrainedBackup( }) bo := utils.AdaptTiKVBackoffer(ctx, backupFineGrainedMaxBackoff, berrors.ErrUnknown) + maxDisconnect := make(map[uint64]uint) for { // Step1, check whether there is any incomplete range incomplete := pr.Res.GetIncompleteRange(req.StartKey, req.EndKey) @@ -1169,8 +1188,19 @@ func (bc *Client) fineGrainedBackup( for { select { case err := <-errCh: - // TODO: should we handle err here? - return errors.Trace(err) + if !berrors.Is(err, berrors.ErrFailedToConnect) { + return errors.Trace(err) + } + storeErr, ok := err.(*StoreBasedErr) + if !ok { + return errors.Trace(err) + } + + storeID := storeErr.storeID + maxDisconnect[storeID]++ + if maxDisconnect[storeID] > backupRetryTimes { + return errors.Annotatef(err, "Failed to connect to store %d more than %d times", storeID, backupRetryTimes) + } case resp, ok := <-respCh: if !ok { // Finished. @@ -1271,12 +1301,22 @@ func (bc *Client) handleFineGrained( storeID := targetPeer.GetStoreId() lockResolver := bc.mgr.GetLockResolver() client, err := bc.mgr.GetBackupClient(ctx, storeID) + + // inject a disconnect failpoint + failpoint.Inject("disconnect", func(_ failpoint.Value) { + logutil.CL(ctx).Warn("This is a injected disconnection error") + err = berrors.ErrFailedToConnect + }) + if err != nil { if berrors.Is(err, berrors.ErrFailedToConnect) { // When the leader store is died, // 20s for the default max duration before the raft election timer fires. logutil.CL(ctx).Warn("failed to connect to store, skipping", logutil.ShortError(err), zap.Uint64("storeID", storeID)) - return 20000, nil + return disconnectRetryTimeout, &StoreBasedErr{ + storeID: storeID, + err: err, + } } logutil.CL(ctx).Error("fail to connect store", zap.Uint64("StoreID", storeID)) @@ -1315,7 +1355,10 @@ func (bc *Client) handleFineGrained( // When the leader store is died, // 20s for the default max duration before the raft election timer fires. logutil.CL(ctx).Warn("failed to connect to store, skipping", logutil.ShortError(err), zap.Uint64("storeID", storeID)) - return 20000, nil + return disconnectRetryTimeout, &StoreBasedErr{ + storeID: storeID, + err: err, + } } logutil.CL(ctx).Error("failed to send fine-grained backup", zap.Uint64("storeID", storeID), logutil.ShortError(err)) return 0, errors.Annotatef(err, "failed to send fine-grained backup [%s, %s)", diff --git a/br/tests/br_fine_grained_disconnect/run.sh b/br/tests/br_fine_grained_disconnect/run.sh new file mode 100755 index 0000000000000..8a1d5d302f2f8 --- /dev/null +++ b/br/tests/br_fine_grained_disconnect/run.sh @@ -0,0 +1,49 @@ +#!/bin/sh +# +# Copyright 2022 PingCAP, Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -eu +DB="$TEST_NAME" +TABLE="usertable" +DB_COUNT=3 +CUR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) + +function create_db_with_table(){ + for i in $(seq $DB_COUNT); do + run_sql "CREATE DATABASE $DB${i};" + go-ycsb load mysql -P $CUR/workload -p mysql.host=$TIDB_IP -p mysql.port=$TIDB_PORT -p mysql.user=root -p mysql.db=$DB${i} + done +} + +function drop_db(){ + for i in $(seq $DB_COUNT); do + run_sql "DROP DATABASE $DB${i};" + done +} + +# Create dbs with table +create_db_with_table + +export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/backup/noop-backup=100*return(1)" +export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/backup/disconnect=100" + +output=$(run_br --pd $PD_ADDR backup full -s "local://$TEST_DIR/$DB/${CRYPTER_METHOD}_file" 2>&1) + +if ! echo "$output" | grep -q "Failed to connect to store \d+ more than \d+ times"; then + exit 1 +fi + +# Drop dbs finally +drop_db diff --git a/br/tests/br_fine_grained_disconnect/workload b/br/tests/br_fine_grained_disconnect/workload new file mode 100644 index 0000000000000..448ca3c1a477f --- /dev/null +++ b/br/tests/br_fine_grained_disconnect/workload @@ -0,0 +1,12 @@ +recordcount=1000 +operationcount=0 +workload=core + +readallfields=true + +readproportion=0 +updateproportion=0 +scanproportion=0 +insertproportion=0 + +requestdistribution=uniform \ No newline at end of file