Skip to content

Commit

Permalink
Merge #120806
Browse files Browse the repository at this point in the history
120806: sql: treat memory monitor errors as retriable r=fqazi a=fqazi

Release note (bug fix): Memory exhaustion errors which can occur during schema changes are no longer considered to be permanent failures. These schema changes will now be retried rather than reverted.

This is just rebase of a PR by `@ajwerner:` #71816, with a minor change to improve test reliability.

Fixes: #120807

Co-authored-by: Andrew Werner <[email protected]>
  • Loading branch information
craig[bot] and ajwerner committed Mar 22, 2024
2 parents 3a5f364 + f280d53 commit 620aa02
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 1 deletion.
2 changes: 1 addition & 1 deletion pkg/sql/schema_changer.go
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ func IsPermanentSchemaChangeError(err error) bool {
}

switch pgerror.GetPGCode(err) {
case pgcode.SerializationFailure, pgcode.InternalConnectionFailure:
case pgcode.SerializationFailure, pgcode.InternalConnectionFailure, pgcode.OutOfMemory:
return false

case pgcode.Internal, pgcode.RangeUnavailable:
Expand Down
56 changes: 56 additions & 0 deletions pkg/sql/schema_changer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,14 @@ import (
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
"github.com/cockroachdb/cockroach/pkg/testutils/skip"
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
"github.com/cockroachdb/cockroach/pkg/testutils/testcluster"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/ctxgroup"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/json"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/cockroach/pkg/util/mon"
"github.com/cockroachdb/cockroach/pkg/util/protoutil"
"github.com/cockroachdb/cockroach/pkg/util/randutil"
"github.com/cockroachdb/cockroach/pkg/util/retry"
Expand Down Expand Up @@ -7765,3 +7767,57 @@ func TestLegacySchemaChangerWaitsForOtherSchemaChanges(t *testing.T) {
return errors.New("")
})
}

// TestMemoryMonitorErrorsDuringBackfillAreRetried tests that we properly classify memory
// monitor errors as retriable. It's a regression test to ensure that we don't end up
// trying to revert schema changes which encounter such errors. Prior to the commit which
// added this test, these errors would result in failures which looked like:
//
// reversing schema change \d+ due to irrecoverable error: memory budget exceeded: 1 bytes requested, 2 currently allocated, 2 bytes in budget
func TestMemoryMonitorErrorsDuringBackfillAreRetried(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)
ctx := context.Background()

// Run across both nodes to make sure that the error makes it across distsql
// boundaries.
testutils.RunTrueAndFalse(t, "local", func(t *testing.T, local bool) {
var shouldFail atomic.Int64
knobs := &execinfra.TestingKnobs{
RunBeforeBackfillChunk: func(sp roachpb.Span) error {
switch shouldFail.Add(1) {
case 1:
return mon.NewMemoryBudgetExceededError(1, 2, 2)
default:
return nil
}
},
}

var dataNode, otherNode int
if local {
dataNode, otherNode = 0, 1
} else {
dataNode, otherNode = 1, 0
}
tca := base.TestClusterArgs{
ReplicationMode: base.ReplicationManual,
ServerArgsPerNode: map[int]base.TestServerArgs{
otherNode: {},
dataNode: {Knobs: base.TestingKnobs{
DistSQL: knobs,
JobsTestingKnobs: jobs.NewTestingKnobsWithShortIntervals(),
}},
},
}
tc := testcluster.StartTestCluster(t, 2, tca)
defer tc.Stopper().Stop(ctx)
tdb := sqlutils.MakeSQLRunner(tc.ServerConn(0))
tdb.Exec(t, "CREATE TABLE foo (i INT PRIMARY KEY)")
tdb.Exec(t, "INSERT INTO foo VALUES (1)")
tdb.Exec(t, `ALTER TABLE foo EXPERIMENTAL_RELOCATE SELECT ARRAY[$1], 1`,
tc.Server(dataNode).GetFirstStoreID())
tdb.Exec(t, `ALTER TABLE foo ADD COLUMN j INT NOT NULL DEFAULT 42`)
require.Equalf(t, shouldFail.Load(), int64(2), "not all failure conditions were hit %d", shouldFail.Load())
})
}

0 comments on commit 620aa02

Please sign in to comment.