Skip to content

Commit

Permalink
br: Add retry limit of connection err in fine grained backup (#57713) (
Browse files Browse the repository at this point in the history
  • Loading branch information
ti-chi-bot authored Dec 18, 2024
1 parent 6351e9e commit c0b86a3
Show file tree
Hide file tree
Showing 3 changed files with 108 additions and 4 deletions.
51 changes: 47 additions & 4 deletions br/pkg/backup/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -77,12 +77,30 @@ type Checksum struct {
// ProgressUnit represents the unit of progress.
type ProgressUnit string

type StoreBasedErr struct {
storeID uint64
err error
}

func (e *StoreBasedErr) Error() string {
return fmt.Sprintf("Store ID '%d': %v", e.storeID, e.err.Error())
}

func (e *StoreBasedErr) Unwrap() error {
return e.err
}

func (e *StoreBasedErr) Cause() error {
return e.err
}

const (
// backupFineGrainedMaxBackoff is 1 hour.
// given it begins the fine-grained backup, there must be some problems in the cluster.
// We need to be more patient.
backupFineGrainedMaxBackoff = 3600000
backupRetryTimes = 5
disconnectRetryTimeout = 20000
// RangeUnit represents the progress updated counter when a range finished.
RangeUnit ProgressUnit = "range"
// RegionUnit represents the progress updated counter when a region finished.
Expand Down Expand Up @@ -1122,6 +1140,7 @@ func (bc *Client) fineGrainedBackup(
})

bo := utils.AdaptTiKVBackoffer(ctx, backupFineGrainedMaxBackoff, berrors.ErrUnknown)
maxDisconnect := make(map[uint64]uint)
for {
// Step1, check whether there is any incomplete range
incomplete := pr.Res.GetIncompleteRange(req.StartKey, req.EndKey)
Expand Down Expand Up @@ -1169,8 +1188,19 @@ func (bc *Client) fineGrainedBackup(
for {
select {
case err := <-errCh:
// TODO: should we handle err here?
return errors.Trace(err)
if !berrors.Is(err, berrors.ErrFailedToConnect) {
return errors.Trace(err)
}
storeErr, ok := err.(*StoreBasedErr)
if !ok {
return errors.Trace(err)
}

storeID := storeErr.storeID
maxDisconnect[storeID]++
if maxDisconnect[storeID] > backupRetryTimes {
return errors.Annotatef(err, "Failed to connect to store %d more than %d times", storeID, backupRetryTimes)
}
case resp, ok := <-respCh:
if !ok {
// Finished.
Expand Down Expand Up @@ -1271,12 +1301,22 @@ func (bc *Client) handleFineGrained(
storeID := targetPeer.GetStoreId()
lockResolver := bc.mgr.GetLockResolver()
client, err := bc.mgr.GetBackupClient(ctx, storeID)

// inject a disconnect failpoint
failpoint.Inject("disconnect", func(_ failpoint.Value) {
logutil.CL(ctx).Warn("This is a injected disconnection error")
err = berrors.ErrFailedToConnect
})

if err != nil {
if berrors.Is(err, berrors.ErrFailedToConnect) {
// When the leader store is died,
// 20s for the default max duration before the raft election timer fires.
logutil.CL(ctx).Warn("failed to connect to store, skipping", logutil.ShortError(err), zap.Uint64("storeID", storeID))
return 20000, nil
return disconnectRetryTimeout, &StoreBasedErr{
storeID: storeID,
err: err,
}
}

logutil.CL(ctx).Error("fail to connect store", zap.Uint64("StoreID", storeID))
Expand Down Expand Up @@ -1315,7 +1355,10 @@ func (bc *Client) handleFineGrained(
// When the leader store is died,
// 20s for the default max duration before the raft election timer fires.
logutil.CL(ctx).Warn("failed to connect to store, skipping", logutil.ShortError(err), zap.Uint64("storeID", storeID))
return 20000, nil
return disconnectRetryTimeout, &StoreBasedErr{
storeID: storeID,
err: err,
}
}
logutil.CL(ctx).Error("failed to send fine-grained backup", zap.Uint64("storeID", storeID), logutil.ShortError(err))
return 0, errors.Annotatef(err, "failed to send fine-grained backup [%s, %s)",
Expand Down
49 changes: 49 additions & 0 deletions br/tests/br_fine_grained_disconnect/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/bin/sh
#
# Copyright 2022 PingCAP, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -eu
DB="$TEST_NAME"
TABLE="usertable"
DB_COUNT=3
CUR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)

function create_db_with_table(){
for i in $(seq $DB_COUNT); do
run_sql "CREATE DATABASE $DB${i};"
go-ycsb load mysql -P $CUR/workload -p mysql.host=$TIDB_IP -p mysql.port=$TIDB_PORT -p mysql.user=root -p mysql.db=$DB${i}
done
}

function drop_db(){
for i in $(seq $DB_COUNT); do
run_sql "DROP DATABASE $DB${i};"
done
}

# Create dbs with table
create_db_with_table

export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/backup/noop-backup=100*return(1)"
export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/backup/disconnect=100"

output=$(run_br --pd $PD_ADDR backup full -s "local://$TEST_DIR/$DB/${CRYPTER_METHOD}_file" 2>&1)

if ! echo "$output" | grep -q "Failed to connect to store \d+ more than \d+ times"; then
exit 1
fi

# Drop dbs finally
drop_db
12 changes: 12 additions & 0 deletions br/tests/br_fine_grained_disconnect/workload
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
recordcount=1000
operationcount=0
workload=core

readallfields=true

readproportion=0
updateproportion=0
scanproportion=0
insertproportion=0

requestdistribution=uniform

0 comments on commit c0b86a3

Please sign in to comment.