Skip to content

Commit

Permalink
utilccl: add ErrBreakerOpen to bulk retryable error
Browse files Browse the repository at this point in the history
In cockroachdb#68787
we saw a backup job failing on retry because of attempting
to plan a flow on a node that has been shutdown. This should
not usually happen since everytime we plan the flow we fetch
the nodes that can participate in the distsql flow.

In case we do encounter a `breaker open` error we can retry
hoping that the next time the flow is planned it doesn't
attempt to dial the dead node.

Release note: None
  • Loading branch information
adityamaru committed Aug 13, 2021
1 parent b365f26 commit 9f11b43
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 1 deletion.
1 change: 1 addition & 0 deletions pkg/ccl/utilccl/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ go_library(
"//pkg/util/grpcutil",
"//pkg/util/timeutil",
"//pkg/util/uuid",
"@com_github_cockroachdb_circuitbreaker//:circuitbreaker",
"@com_github_cockroachdb_errors//:errors",
"@com_github_cockroachdb_redact//:redact",
],
Expand Down
10 changes: 9 additions & 1 deletion pkg/ccl/utilccl/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,11 @@ package utilccl
import (
"strings"

circuit "github.com/cockroachdb/circuitbreaker"
"github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord"
"github.com/cockroachdb/cockroach/pkg/sql/flowinfra"
"github.com/cockroachdb/cockroach/pkg/util/grpcutil"
"github.com/cockroachdb/errors"
)

// IsDistSQLRetryableError returns true if the supplied error, or any of its parent
Expand All @@ -35,6 +37,11 @@ func IsDistSQLRetryableError(err error) bool {
return strings.Contains(errStr, `rpc error`)
}

// isBreakerOpenError returns true if err is a circuit.ErrBreakerOpen.
func isBreakerOpenError(err error) bool {
return errors.Is(err, circuit.ErrBreakerOpen)
}

// IsPermanentBulkJobError returns true if the error results in a permanent
// failure of a bulk job (IMPORT, BACKUP, RESTORE). This function is a allowlist
// instead of a blocklist: only known safe errors are confirmed to not be
Expand All @@ -47,5 +54,6 @@ func IsPermanentBulkJobError(err error) bool {
return !IsDistSQLRetryableError(err) &&
!grpcutil.IsClosedConnection(err) &&
!flowinfra.IsNoInboundStreamConnectionError(err) &&
!kvcoord.IsSendError(err)
!kvcoord.IsSendError(err) &&
!isBreakerOpenError(err)
}

0 comments on commit 9f11b43

Please sign in to comment.