From e2549335fae7d7bc5ef638886b2fc1d77bfadaea Mon Sep 17 00:00:00 2001 From: Faizan Qazi Date: Mon, 19 Sep 2022 16:37:14 -0400 Subject: [PATCH 1/5] sql: resolveOID does not properly handle dropped descriptors Previously, when resolve OID ran into errors on dropped descriptors we would surface internal inactive descriptor errors to the caller. This was inadequate because these errors would not be properly ignored leading to unexpected failures, which did not exist on 22.1. To address this, this patch adds logic to ignore inactive descriptor errors like we would have in the past, returning that no OID has been resolved. Release note: None --- pkg/sql/resolve_oid.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pkg/sql/resolve_oid.go b/pkg/sql/resolve_oid.go index da9b820aad49..cdba7448764f 100644 --- a/pkg/sql/resolve_oid.go +++ b/pkg/sql/resolve_oid.go @@ -15,6 +15,7 @@ import ( "fmt" "github.com/cockroachdb/cockroach/pkg/kv" + "github.com/cockroachdb/cockroach/pkg/sql/catalog" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgcode" "github.com/cockroachdb/cockroach/pkg/sql/pgwire/pgerror" "github.com/cockroachdb/cockroach/pkg/sql/sem/tree" @@ -77,7 +78,12 @@ func resolveOID( results, err := ie.QueryRowEx(ctx, "queryOid", txn, sessiondata.NoSessionDataOverride, q, toResolve) if err != nil { - if errors.HasType(err, (*tree.MultipleResultsError)(nil)) { + if catalog.HasInactiveDescriptorError(err) { + // Descriptor is either dropped or offline, so + // the OID does not exist. + return nil, true, pgerror.Newf(info.errType, + "%s %s does not exist", info.objName, toResolve) + } else if errors.HasType(err, (*tree.MultipleResultsError)(nil)) { return nil, false, pgerror.Newf(pgcode.AmbiguousAlias, "more than one %s named %s", info.objName, toResolve) } From 070825e8f59df41eed843672dab549f298b93e67 Mon Sep 17 00:00:00 2001 From: Faizan Qazi Date: Fri, 16 Sep 2022 20:18:42 -0400 Subject: [PATCH 2/5] workload/schemachanger: fix detect of potential exec errors Previously, when we refactored support for how statement generation and errors, we did not correctly update support for potential execution errors. As a result support for inserts / add foreign key constraints was regressed. This path, fixes the support for possible execution errors in both cases. Release note: None Release Justification: Low changes to re-enable an existing test case --- .../workload/schemachange/schema_change_external_test.go | 4 ++-- pkg/workload/schemachange/operation_generator.go | 9 +++------ pkg/workload/schemachange/schemachange.go | 1 - 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/pkg/ccl/testccl/workload/schemachange/schema_change_external_test.go b/pkg/ccl/testccl/workload/schemachange/schema_change_external_test.go index 9c8d1676cc2e..754dbce520b8 100644 --- a/pkg/ccl/testccl/workload/schemachange/schema_change_external_test.go +++ b/pkg/ccl/testccl/workload/schemachange/schema_change_external_test.go @@ -34,7 +34,7 @@ import ( func TestWorkload(t *testing.T) { defer leaktest.AfterTest(t)() defer utilccl.TestingEnableEnterprise()() - skip.WithIssue(t, 78478) + skip.UnderRace(t, "test connections can be too slow under race option.") dir := t.TempDir() ctx := context.Background() @@ -90,7 +90,7 @@ func TestWorkload(t *testing.T) { ql, err := wl.Ops(ctx, []string{pgURL.String()}, reg) require.NoError(t, err) - const N = 100 + const N = 800 workerFn := func(ctx context.Context, fn func(ctx context.Context) error) func() error { return func() error { for i := 0; i < N; i++ { diff --git a/pkg/workload/schemachange/operation_generator.go b/pkg/workload/schemachange/operation_generator.go index 88b471794cbb..64932d69b94d 100644 --- a/pkg/workload/schemachange/operation_generator.go +++ b/pkg/workload/schemachange/operation_generator.go @@ -55,7 +55,6 @@ type operationGeneratorParams struct { // The OperationBuilder has the sole responsibility of generating ops type operationGenerator struct { params *operationGeneratorParams - potentialExecErrors errorCodeSet expectedCommitErrors errorCodeSet potentialCommitErrors errorCodeSet @@ -126,7 +125,6 @@ func makeOperationGenerator(params *operationGeneratorParams) *operationGenerato return &operationGenerator{ params: params, expectedCommitErrors: makeExpectedErrorSet(), - potentialExecErrors: makeExpectedErrorSet(), potentialCommitErrors: makeExpectedErrorSet(), candidateExpectedCommitErrors: makeExpectedErrorSet(), } @@ -135,7 +133,6 @@ func makeOperationGenerator(params *operationGeneratorParams) *operationGenerato // Reset internal state used per operation within a transaction func (og *operationGenerator) resetOpState() { og.candidateExpectedCommitErrors.reset() - og.potentialExecErrors.reset() } // Reset internal state used per transaction @@ -903,7 +900,7 @@ func (og *operationGenerator) addForeignKeyConstraint( // perfectly if an error is expected. We can confirm post transaction with a time // travel query. _ = rowsSatisfyConstraint - og.potentialExecErrors.add(pgcode.ForeignKeyViolation) + stmt.potentialExecErrors.add(pgcode.ForeignKeyViolation) og.potentialCommitErrors.add(pgcode.ForeignKeyViolation) // It's possible for the table to be dropped concurrently, while we are running @@ -2496,7 +2493,7 @@ func (og *operationGenerator) insertRow(ctx context.Context, tx pgx.Tx) (stmt *o return nil, err } // We may have errors that are possible, but not guaranteed. - potentialErrors.add(og.potentialExecErrors) + potentialErrors.add(stmt.potentialExecErrors) if invalidInsert { generatedErrors.add(stmt.expectedExecErrors) // We will be pessimistic and assume that other column related errors can @@ -3607,7 +3604,7 @@ func (og *operationGenerator) typeFromTypeName( return nil, errors.Wrapf(err, "typeFromTypeName: %s", typeName) } typ, err := tree.ResolveType( - context.Background(), + ctx, stmt.AST.(*tree.Select).Select.(*tree.SelectClause).Exprs[0].Expr.(*tree.CastExpr).Type, &txTypeResolver{tx: tx}, ) diff --git a/pkg/workload/schemachange/schemachange.go b/pkg/workload/schemachange/schemachange.go index 82ea3eaac972..5c8c9e078752 100644 --- a/pkg/workload/schemachange/schemachange.go +++ b/pkg/workload/schemachange/schemachange.go @@ -319,7 +319,6 @@ func (w *schemaChangeWorker) WrapWithErrorState(err error) error { } return &ErrorState{ cause: err, - PotentialErrors: w.opGen.potentialExecErrors.StringSlice(), PotentialCommitErrors: w.opGen.potentialCommitErrors.StringSlice(), ExpectedCommitErrors: w.opGen.expectedCommitErrors.StringSlice(), QueriesForGeneratingErrors: w.opGen.GetOpGenLog(), From 99edb39cd5d2c789114988f377986a4e94c6cb64 Mon Sep 17 00:00:00 2001 From: Faizan Qazi Date: Mon, 19 Sep 2022 22:27:52 -0400 Subject: [PATCH 3/5] workload/schemachange: detect expected errors for CTAS Previously, when a CTAS statement was executed we did not correctly detect if the SELECT portion could fail on generated columns. This could lead to test failures, even though the observed errors are correct. To address, this patch will execute the select independently to validate any generated column errors. Release note: None --- pkg/workload/schemachange/error_screening.go | 12 +++++++++--- pkg/workload/schemachange/operation_generator.go | 8 ++++++++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/pkg/workload/schemachange/error_screening.go b/pkg/workload/schemachange/error_screening.go index ed502bb75690..4f0eeadb5b75 100644 --- a/pkg/workload/schemachange/error_screening.go +++ b/pkg/workload/schemachange/error_screening.go @@ -562,14 +562,20 @@ SELECT count(*) > 0 return nil } +func getValidGenerationErrors() errorCodeSet { + return errorCodeSet{ + pgcode.NumericValueOutOfRange: true, + pgcode.FloatingPointException: true, + pgcode.InvalidTextRepresentation: true, + } +} + // isValidGenerationError these codes can be observed when evaluating values // for generated expressions. These are errors are not ignored, but added into // the expected set of errors. func isValidGenerationError(code string) bool { pgCode := pgcode.MakeCode(code) - return pgCode == pgcode.NumericValueOutOfRange || - pgCode == pgcode.FloatingPointException || - pgCode == pgcode.InvalidTextRepresentation + return getValidGenerationErrors().contains(pgCode) } // validateGeneratedExpressionsForInsert goes through generated expressions and diff --git a/pkg/workload/schemachange/operation_generator.go b/pkg/workload/schemachange/operation_generator.go index 64932d69b94d..3aacb9a68cb8 100644 --- a/pkg/workload/schemachange/operation_generator.go +++ b/pkg/workload/schemachange/operation_generator.go @@ -1415,6 +1415,14 @@ func (og *operationGenerator) createTableAs(ctx context.Context, tx pgx.Tx) (*op {code: pgcode.DuplicateColumn, condition: duplicateColumns}, }.add(opStmt.expectedExecErrors) + // Confirm the select itself doesn't run into any column generation errors, + // by executing it independently first until we add validation when adding + // generated columns. See issue: #81698?, which will allow us to remove this + // logic in the future. + if opStmt.expectedExecErrors.empty() { + opStmt.potentialExecErrors.merge(getValidGenerationErrors()) + } + opStmt.sql = fmt.Sprintf(`CREATE TABLE %s AS %s FETCH FIRST %d ROWS ONLY`, destTableName, selectStatement.String(), MaxRowsToConsume) return opStmt, nil From 70b78f271911b7bb9e39df2ab37e8798b6e5cc85 Mon Sep 17 00:00:00 2001 From: Faizan Qazi Date: Tue, 20 Sep 2022 16:35:47 -0400 Subject: [PATCH 4/5] workload/schemachange: fix watchdog stack dumping behaviour Previously, the watch dog threads inside the schema changer workload could dump the stacks way incorrectly, since it didn't give connections time to terminate. This change adapts the watch dog logic to have a delay. Release note: None --- pkg/ccl/testccl/workload/schemachange/BUILD.bazel | 3 ++- pkg/workload/schemachange/watch_dog.go | 11 ++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pkg/ccl/testccl/workload/schemachange/BUILD.bazel b/pkg/ccl/testccl/workload/schemachange/BUILD.bazel index b5e6e0246d16..8341b8995be8 100644 --- a/pkg/ccl/testccl/workload/schemachange/BUILD.bazel +++ b/pkg/ccl/testccl/workload/schemachange/BUILD.bazel @@ -3,11 +3,12 @@ load("@io_bazel_rules_go//go:def.bzl", "go_test") go_test( name = "schemachange_test", + size = "large", srcs = [ "main_test.go", "schema_change_external_test.go", ], - args = ["-test.timeout=295s"], + args = ["-test.timeout=895s"], data = [ "//c-deps:libgeos", ], diff --git a/pkg/workload/schemachange/watch_dog.go b/pkg/workload/schemachange/watch_dog.go index da021dc5d3f6..f0137a910410 100644 --- a/pkg/workload/schemachange/watch_dog.go +++ b/pkg/workload/schemachange/watch_dog.go @@ -93,7 +93,16 @@ func (w *schemaChangeWatchDog) watchLoop(ctx context.Context) { close(responseChannel) return case <-ctx.Done(): - panic("dumping stacks, we failed to terminate threads on time.") + // Give the connections a small amount of time to clean up, if they fail + // to do so, we will dump stacks. + select { + case responseChannel := <-w.cmdChannel: + close(responseChannel) + return + case <-time.After(time.Second * 4): + panic("dumping stacks, we failed to terminate threads on time.") + + } case <-time.After(time.Second): // If the connection is making progress, the watch dog timer can be reset // again. From 85904e55c9fad44f29ca1c7a0e935c7d92831c9a Mon Sep 17 00:00:00 2001 From: Faizan Qazi Date: Wed, 21 Sep 2022 13:01:41 -0400 Subject: [PATCH 5/5] workload/schemachange/random-load: enable as release blocker Fixes: #87615 Previously, this test was no longer marked as a release blocker because of how unstable it was. With the latest set of fixes we expect this to be far more stable. Release note: None --- pkg/cmd/roachtest/tests/schemachange_random_load.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/pkg/cmd/roachtest/tests/schemachange_random_load.go b/pkg/cmd/roachtest/tests/schemachange_random_load.go index 1bfcc35d9c18..d0742a0b8811 100644 --- a/pkg/cmd/roachtest/tests/schemachange_random_load.go +++ b/pkg/cmd/roachtest/tests/schemachange_random_load.go @@ -46,9 +46,6 @@ func registerSchemaChangeRandomLoad(r registry.Registry) { spec.Zones(geoZonesStr), ), NativeLibs: registry.LibGEOS, - // This is set while development is still happening on the workload and we - // fix (or bypass) minor schema change bugs that are discovered. - NonReleaseBlocker: true, Run: func(ctx context.Context, t test.Test, c cluster.Cluster) { maxOps := 5000 concurrency := 20