Skip to content

Commit

Permalink
backup: retry on internal error and make more errors can be retried (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
3pointer authored May 5, 2022
1 parent b3ec891 commit 167d4eb
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 8 deletions.
27 changes: 21 additions & 6 deletions br/pkg/backup/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -934,9 +934,17 @@ func doSendBackup(
})
bCli, err := client.Backup(ctx, &req)
failpoint.Inject("reset-retryable-error", func(val failpoint.Value) {
if val.(bool) {
logutil.CL(ctx).Debug("failpoint reset-retryable-error injected.")
err = status.Error(codes.Unavailable, "Unavailable error")
switch val.(string) {
case "Unavaiable":
{
logutil.CL(ctx).Debug("failpoint reset-retryable-error unavailable injected.")
err = status.Error(codes.Unavailable, "Unavailable error")
}
case "Internal":
{
logutil.CL(ctx).Debug("failpoint reset-retryable-error internal injected.")
err = status.Error(codes.Internal, "Internal error")
}
}
})
failpoint.Inject("reset-not-retryable-error", func(val failpoint.Value) {
Expand Down Expand Up @@ -1030,16 +1038,23 @@ const (

// isRetryableError reports whether err is retryable, i.e. whether we should reset the gRPC connection and retry.
func isRetryableError(err error) bool {

if status.Code(err) == codes.Unavailable {
return true
// some errors can be retried
// https://github.com/pingcap/tidb/issues/34350
switch status.Code(err) {
case codes.Unavailable, codes.DeadlineExceeded,
codes.ResourceExhausted, codes.Aborted, codes.Internal:
{
log.Warn("backup met some errors, these errors can be retry 5 times", zap.Error(err))
return true
}
}

// There are at least two possible sources of a cancel() call:
// one from the backup range and another from gRPC. Here we retry only when gRPC cancels because the connection is closing.
if status.Code(err) == codes.Canceled {
if s, ok := status.FromError(err); ok {
if strings.Contains(s.Message(), gRPC_Cancel) {
log.Warn("backup met grpc cancel error, this errors can be retry 5 times", zap.Error(err))
return true
}
}
Expand Down
1 change: 1 addition & 0 deletions br/pkg/utils/retry.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ var retryableServerError = []string{
"body write aborted",
"error during dispatch",
"put object timeout",
"internalerror",
}

// RetryableFunc presents a retryable operation.
Expand Down
4 changes: 2 additions & 2 deletions br/tests/br_full/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ done

# backup full and kill tikv to test reset connection
echo "backup with limit start..."
export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/backup/reset-retryable-error=1*return(true)"
export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/backup/reset-retryable-error=1*return(\"Unavailable\")->1*return(\"Internal\")"
run_br --pd $PD_ADDR backup full -s "local://$TEST_DIR/$DB-limit" --concurrency 4
export GO_FAILPOINTS=""

Expand All @@ -49,7 +49,7 @@ fi

# backup full
echo "backup with lz4 start..."
export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/backup/backup-storage-error=1*return(\"connection refused\")"
export GO_FAILPOINTS="github.com/pingcap/tidb/br/pkg/backup/backup-storage-error=1*return(\"connection refused\")->1*return(\"InternalError\")"
run_br --pd $PD_ADDR backup full -s "local://$TEST_DIR/$DB-lz4" --concurrency 4 --compression lz4
export GO_FAILPOINTS=""
size_lz4=$(du -d 0 $TEST_DIR/$DB-lz4 | awk '{print $1}')
Expand Down

0 comments on commit 167d4eb

Please sign in to comment.