From b27a91605fb4ac53d20cc9e0391eb0a1e6400f5f Mon Sep 17 00:00:00 2001 From: Faizan Qazi Date: Mon, 19 Sep 2022 16:37:14 -0400 Subject: [PATCH 1/5] sql: resolveOID does not properly handle dropped descriptors Previously, when resolve OID ran into errors on dropped descriptors we would surface internal inactive descriptor errors to the caller. This was inadequate because these errors would not be properly ignored leading to unexpected failures, which did not exist on 22.1. To address this, this patch adds logic to ignore inactive descriptor errors like we would have in the past, returning that no OID has been resolved. Release note: None --- pkg/sql/resolve_oid.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pkg/sql/resolve_oid.go b/pkg/sql/resolve_oid.go index da9b820aad49..cdba7448764f 100644 --- a/pkg/sql/resolve_oid.go +++ b/pkg/sql/resolve_oid.go @@ -15,6 +15,7 @@ import ( "fmt" "github.com/cockroachdb/cockroach/pkg/kv" + "github.com/cockroachdb/cockroach/pkg/sql/catalog" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror" "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" @@ -77,7 +78,12 @@ func resolveOID( results, err := ie.QueryRowEx(ctx, "queryOid", txn, sessiondata.NoSessionDataOverride, q, toResolve) if err != nil { - if errors.HasType(err, (*tree.MultipleResultsError)(nil)) { + if catalog.HasInactiveDescriptorError(err) { + // Descriptor is either dropped or offline, so + // the OID does not exist. + return nil, true, pgerror.Newf(info.errType, + "%s %s does not exist", info.objName, toResolve) + } else if errors.HasType(err, (*tree.MultipleResultsError)(nil)) { return nil, false, pgerror.Newf(pgcode.AmbiguousAlias, "more than one %s named %s", info.objName, toResolve) } From ff30d4b411b5db86d26a81365d44017c39e8450b Mon Sep 17 00:00:00 2001 From: Faizan Qazi Date: Fri, 16 Sep 2022 20:18:42 -0400 Subject: [PATCH 2/5] workload/schemachanger: fix detect of potential exec errors Previously, when we refactored support for how statement generation and errors, we did not correctly update support for potential execution errors. As a result support for inserts / add foreign key constraints was regressed. This path, fixes the support for possible execution errors in both cases. Release note: None Release Justification: Low changes to re-enable an existing test case --- .../workload/schemachange/schema_change_external_test.go | 4 ++-- pkg/workload/schemachange/operation_generator.go | 7 ++----- pkg/workload/schemachange/schemachange.go | 5 ----- 3 files changed, 4 insertions(+), 12 deletions(-) diff --git a/pkg/ccl/testccl/workload/schemachange/schema_change_external_test.go b/pkg/ccl/testccl/workload/schemachange/schema_change_external_test.go index 9c8d1676cc2e..754dbce520b8 100644 --- a/pkg/ccl/testccl/workload/schemachange/schema_change_external_test.go +++ b/pkg/ccl/testccl/workload/schemachange/schema_change_external_test.go @@ -34,7 +34,7 @@ import ( func TestWorkload(t *testing.T) { defer leaktest.AfterTest(t)() defer utilccl.TestingEnableEnterprise()() - skip.WithIssue(t, 78478) + skip.UnderRace(t, "test connections can be too slow under race option.") dir := t.TempDir() ctx := context.Background() @@ -90,7 +90,7 @@ func TestWorkload(t *testing.T) { ql, err := wl.Ops(ctx, []string{pgURL.String()}, reg) require.NoError(t, err) - const N = 100 + const N = 800 workerFn := func(ctx context.Context, fn func(ctx context.Context) error) func() error { return func() error { for i := 0; i < N; i++ { diff --git a/pkg/workload/schemachange/operation_generator.go b/pkg/workload/schemachange/operation_generator.go index f4ca24d89ad6..f075e199533f 100644 --- a/pkg/workload/schemachange/operation_generator.go +++ b/pkg/workload/schemachange/operation_generator.go @@ -54,7 +54,6 @@ type operationGeneratorParams struct { // The OperationBuilder has the sole responsibility of generating ops type operationGenerator struct { params *operationGeneratorParams - potentialExecErrors errorCodeSet expectedCommitErrors errorCodeSet potentialCommitErrors errorCodeSet @@ -103,7 +102,6 @@ func makeOperationGenerator(params *operationGeneratorParams) *operationGenerato return &operationGenerator{ params: params, expectedCommitErrors: makeExpectedErrorSet(), - potentialExecErrors: makeExpectedErrorSet(), potentialCommitErrors: makeExpectedErrorSet(), candidateExpectedCommitErrors: makeExpectedErrorSet(), } @@ -112,7 +110,6 @@ func makeOperationGenerator(params *operationGeneratorParams) *operationGenerato // Reset internal state used per operation within a transaction func (og *operationGenerator) resetOpState() { og.candidateExpectedCommitErrors.reset() - og.potentialExecErrors.reset() og.opGenLog = strings.Builder{} } @@ -881,7 +878,7 @@ func (og *operationGenerator) addForeignKeyConstraint( // perfectly if an error is expected. We can confirm post transaction with a time // travel query. _ = rowsSatisfyConstraint - og.potentialExecErrors.add(pgcode.ForeignKeyViolation) + stmt.potentialExecErrors.add(pgcode.ForeignKeyViolation) og.potentialCommitErrors.add(pgcode.ForeignKeyViolation) // It's possible for the table to be dropped concurrently, while we are running @@ -2404,7 +2401,7 @@ func (og *operationGenerator) insertRow(ctx context.Context, tx pgx.Tx) (stmt *o return nil, err } // We may have errors that are possible, but not guaranteed. - potentialErrors.add(og.potentialExecErrors) + potentialErrors.add(stmt.potentialExecErrors) if invalidInsert { generatedErrors.add(stmt.expectedExecErrors) // We will be pessimistic and assume that other column related errors can diff --git a/pkg/workload/schemachange/schemachange.go b/pkg/workload/schemachange/schemachange.go index 93f3d1240e3c..36d65fa6f979 100644 --- a/pkg/workload/schemachange/schemachange.go +++ b/pkg/workload/schemachange/schemachange.go @@ -313,17 +313,12 @@ func (w *schemaChangeWorker) recordInHist(elapsed time.Duration, bin histBin) { func (w *schemaChangeWorker) getErrorState() string { return fmt.Sprintf("Dumping state before death:\n"+ - "Expected errors: %s"+ - "Potential exec errors: %s"+ "Expected commit errors: %s"+ "Potential commit errors: %s"+ "==========================="+ "Executed queries for generating errors: %s"+ "==========================="+ "Previous statements %s", - "", - // w.opGen.expectedExecErrors.String(), - w.opGen.potentialExecErrors.String(), w.opGen.expectedCommitErrors.String(), w.opGen.potentialCommitErrors.String(), w.opGen.GetOpGenLog(), From e1491dea61074f4dc4b275f905527e9d8fcf371a Mon Sep 17 00:00:00 2001 From: Faizan Qazi Date: Mon, 19 Sep 2022 22:27:52 -0400 Subject: [PATCH 3/5] workload/schemachange: detect expected errors for CTAS Previously, when a CTAS statement was executed we did not correctly detect if the SELECT portion could fail on generated columns. This could lead to test failures, even though the observed errors are correct. To address, this patch will execute the select independently to validate any generated column errors. Release note: None --- pkg/workload/schemachange/error_screening.go | 12 +++++++++--- pkg/workload/schemachange/operation_generator.go | 8 ++++++++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/pkg/workload/schemachange/error_screening.go b/pkg/workload/schemachange/error_screening.go index 6292014bee12..4d5ad403700d 100644 --- a/pkg/workload/schemachange/error_screening.go +++ b/pkg/workload/schemachange/error_screening.go @@ -560,14 +560,20 @@ SELECT count(*) > 0 return nil } +func getValidGenerationErrors() errorCodeSet { + return errorCodeSet{ + pgcode.NumericValueOutOfRange: true, + pgcode.FloatingPointException: true, + pgcode.InvalidTextRepresentation: true, + } +} + // isValidGenerationError these codes can be observed when evaluating values // for generated expressions. These are errors are not ignored, but added into // the expected set of errors. func isValidGenerationError(code string) bool { pgCode := pgcode.MakeCode(code) - return pgCode == pgcode.NumericValueOutOfRange || - pgCode == pgcode.FloatingPointException || - pgCode == pgcode.InvalidTextRepresentation + return getValidGenerationErrors().contains(pgCode) } // validateGeneratedExpressionsForInsert goes through generated expressions and diff --git a/pkg/workload/schemachange/operation_generator.go b/pkg/workload/schemachange/operation_generator.go index f075e199533f..c447fee36163 100644 --- a/pkg/workload/schemachange/operation_generator.go +++ b/pkg/workload/schemachange/operation_generator.go @@ -1335,6 +1335,14 @@ func (og *operationGenerator) createTableAs(ctx context.Context, tx pgx.Tx) (*op {code: pgcode.DuplicateColumn, condition: duplicateColumns}, }.add(opStmt.expectedExecErrors) + // Confirm the select itself doesn't run into any column generation errors, + // by executing it independently first until we add validation when adding + // generated columns. See issue: #81698?, which will allow us to remove this + // logic in the future. + if opStmt.expectedExecErrors.empty() { + opStmt.potentialExecErrors.merge(getValidGenerationErrors()) + } + opStmt.sql = fmt.Sprintf(`CREATE TABLE %s AS %s FETCH FIRST %d ROWS ONLY`, destTableName, selectStatement.String(), MaxRowsToConsume) return opStmt, nil From b3aa5332e7854709dc341d91b3ca6f6055e82a8c Mon Sep 17 00:00:00 2001 From: Faizan Qazi Date: Tue, 20 Sep 2022 16:35:47 -0400 Subject: [PATCH 4/5] workload/schemachange: fix watchdog stack dumping behaviour Previously, the watch dog threads inside the schema changer workload could dump the stacks way incorrectly, since it didn't give connections time to terminate. This change adapts the watch dog logic to have a delay. Release note: None --- pkg/ccl/testccl/workload/schemachange/BUILD.bazel | 3 ++- pkg/workload/schemachange/watch_dog.go | 10 +++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/pkg/ccl/testccl/workload/schemachange/BUILD.bazel b/pkg/ccl/testccl/workload/schemachange/BUILD.bazel index b5e6e0246d16..8341b8995be8 100644 --- a/pkg/ccl/testccl/workload/schemachange/BUILD.bazel +++ b/pkg/ccl/testccl/workload/schemachange/BUILD.bazel @@ -3,11 +3,12 @@ load("@io_bazel_rules_go//go:def.bzl", "go_test") go_test( name = "schemachange_test", + size = "large", srcs = [ "main_test.go", "schema_change_external_test.go", ], - args = ["-test.timeout=295s"], + args = ["-test.timeout=895s"], data = [ "//c-deps:libgeos", ], diff --git a/pkg/workload/schemachange/watch_dog.go b/pkg/workload/schemachange/watch_dog.go index da021dc5d3f6..18a983d5f929 100644 --- a/pkg/workload/schemachange/watch_dog.go +++ b/pkg/workload/schemachange/watch_dog.go @@ -93,7 +93,15 @@ func (w *schemaChangeWatchDog) watchLoop(ctx context.Context) { close(responseChannel) return case <-ctx.Done(): - panic("dumping stacks, we failed to terminate threads on time.") + // Give the connections a small amount of time to clean up, if they fail + // to do so, we will dump stacks. + select { + case <-w.cmdChannel: + return + case <-time.After(time.Second * 4): + panic("dumping stacks, we failed to terminate threads on time.") + + } case <-time.After(time.Second): // If the connection is making progress, the watch dog timer can be reset // again. From 2abce7100d4375197b9c859c4d07032c71da8d79 Mon Sep 17 00:00:00 2001 From: Faizan Qazi Date: Wed, 21 Sep 2022 13:01:41 -0400 Subject: [PATCH 5/5] workload/schemachange/random-load: enable as release blocker Fixes: #87615 Previously, this test was no longer marked as a release blocker because of how unstable it was. With the latest set of fixes we expect this to be far more stable. Release note: None --- pkg/cmd/roachtest/tests/schemachange_random_load.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/pkg/cmd/roachtest/tests/schemachange_random_load.go b/pkg/cmd/roachtest/tests/schemachange_random_load.go index e1171ff6531a..bced74144660 100644 --- a/pkg/cmd/roachtest/tests/schemachange_random_load.go +++ b/pkg/cmd/roachtest/tests/schemachange_random_load.go @@ -45,9 +45,6 @@ func registerSchemaChangeRandomLoad(r registry.Registry) { spec.Geo(), spec.Zones(geoZonesStr), ), - // This is set while development is still happening on the workload and we - // fix (or bypass) minor schema change bugs that are discovered. - NonReleaseBlocker: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { maxOps := 5000 concurrency := 20