Skip to content

Commit

Permalink
sql: retry all "rpc error" distributed errors as local
Browse files Browse the repository at this point in the history
This commit includes all errors that contain `rpc error` substring to be
retried-as-local. In particular, this allows us to avoid problems with
DistSQL using no-longer-live SQL pod after that pod is shutdown. (This
usage of the downed pod is currently expected given that the cache of
live instances isn't updated when the pod is shutdown.)

Release note: None
  • Loading branch information
yuzefovich committed Aug 10, 2023
1 parent 847722e commit 5c09eb1
Show file tree
Hide file tree
Showing 4 changed files with 27 additions and 23 deletions.
1 change: 1 addition & 0 deletions pkg/jobs/joberror/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ go_library(
deps = [
"//pkg/kv/kvclient/kvcoord",
"//pkg/sql/flowinfra",
"//pkg/sql/sqlerrors",
"//pkg/util/circuit",
"//pkg/util/grpcutil",
"//pkg/util/sysutil",
Expand Down
24 changes: 2 additions & 22 deletions pkg/jobs/joberror/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,36 +11,16 @@
package joberror

import (
"strings"

circuitbreaker "github.com/cockroachdb/circuitbreaker"
"github.com/cockroachdb/cockroach/pkg/kv/kvclient/kvcoord"
"github.com/cockroachdb/cockroach/pkg/sql/flowinfra"
"github.com/cockroachdb/cockroach/pkg/sql/sqlerrors"
"github.com/cockroachdb/cockroach/pkg/util/circuit"
"github.com/cockroachdb/cockroach/pkg/util/grpcutil"
"github.com/cockroachdb/cockroach/pkg/util/sysutil"
"github.com/cockroachdb/errors"
)

// IsDistSQLRetryableError returns true if the supplied error, or any of its parent
// causes is an rpc error.
// This is an unfortunate implementation that should be looking for a more
// specific error.
func IsDistSQLRetryableError(err error) bool {
if err == nil {
return false
}

// TODO(knz): this is a bad implementation. Make it go away
// by avoiding string comparisons.

errStr := err.Error()
// When a crdb node dies, any DistSQL flows with processors scheduled on
// it get an error with "rpc error" in the message from the call to
// `(*DistSQLPlanner).Run`.
return strings.Contains(errStr, `rpc error`)
}

// isBreakerOpenError returns true if err is a circuit.ErrBreakerOpen.
//
// NB: Two packages have ErrBreakerOpen error types. The cicruitbreaker package
Expand All @@ -57,7 +37,7 @@ func IsPermanentBulkJobError(err error) bool {
if err == nil {
return false
}
return !IsDistSQLRetryableError(err) &&
return !sqlerrors.IsDistSQLRetryableError(err) &&
!grpcutil.IsClosedConnection(err) &&
!flowinfra.IsFlowRetryableError(err) &&
!flowinfra.IsNoInboundStreamConnectionError(err) &&
Expand Down
4 changes: 3 additions & 1 deletion pkg/sql/distsql_running.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/sql/rowexec"
"github.com/cockroachdb/cockroach/pkg/sql/sem/tree"
"github.com/cockroachdb/cockroach/pkg/sql/sessiondatapb"
"github.com/cockroachdb/cockroach/pkg/sql/sqlerrors"
"github.com/cockroachdb/cockroach/pkg/sql/sqltelemetry"
"github.com/cockroachdb/cockroach/pkg/sql/types"
"github.com/cockroachdb/cockroach/pkg/util/buildutil"
Expand Down Expand Up @@ -1998,7 +1999,8 @@ func (dsp *DistSQLPlanner) PlanAndRun(
// cancellation has already occurred.
return
}
if !pgerror.IsSQLRetryableError(distributedErr) &&
if !sqlerrors.IsDistSQLRetryableError(distributedErr) &&
!pgerror.IsSQLRetryableError(distributedErr) &&
!flowinfra.IsFlowRetryableError(distributedErr) &&
!isDialErr(distributedErr) {
// Only re-run the query if we think there is a high chance of a
Expand Down
21 changes: 21 additions & 0 deletions pkg/sql/sqlerrors/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
package sqlerrors

import (
"strings"

"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/security/username"
"github.com/cockroachdb/cockroach/pkg/sql/catalog/descpb"
Expand Down Expand Up @@ -411,3 +413,22 @@ func errHasCode(err error, code ...pgcode.Code) bool {
}
return false
}

// IsDistSQLRetryableError returns true if the supplied error, or any of its parent
// causes is an rpc error.
// This is an unfortunate implementation that should be looking for a more
// specific error.
func IsDistSQLRetryableError(err error) bool {
if err == nil {
return false
}

// TODO(knz): this is a bad implementation. Make it go away
// by avoiding string comparisons.

errStr := err.Error()
// When a crdb node dies, any DistSQL flows with processors scheduled on
// it get an error with "rpc error" in the message from the call to
// `(*DistSQLPlanner).Run`.
return strings.Contains(errStr, `rpc error`)
}

0 comments on commit 5c09eb1

Please sign in to comment.