Skip to content

Commit

Permalink
doctor: skip validation for dropped descriptors
Browse files Browse the repository at this point in the history
In some cases, dropped descriptors appear in our `system.descriptors`
table with dangling job mutations without an associated job.
This patch teaches debug doctor examine to skip validation
on such dropped descriptors.

Epic: none

Fixes: #123477
Fixes: #122956

Release note: none
  • Loading branch information
annrpom committed May 16, 2024
1 parent 2566787 commit 52a697b
Show file tree
Hide file tree
Showing 9 changed files with 94 additions and 28 deletions.
1 change: 1 addition & 0 deletions pkg/cli/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,7 @@ go_library(
"//pkg/sql/sem/eval",
"//pkg/sql/sem/tree",
"//pkg/sql/sqlstats",
"//pkg/sql/stats",
"//pkg/storage",
"//pkg/storage/enginepb",
"//pkg/storage/fs",
Expand Down
7 changes: 3 additions & 4 deletions pkg/cli/doctor.go
Original file line number Diff line number Diff line change
Expand Up @@ -529,7 +529,7 @@ func fromZipDir(
return errors.Wrap(err, "failed unmarshalling job payload")
}

// when can progress be null? if it exists but we have never started it?
// Skip NULL job payloads.
if fields[5] == "NULL" {
return nil
}
Expand All @@ -554,9 +554,8 @@ func fromZipDir(
Payload string `json:"hex_payload"`
Progress string `json:"hex_progress"`
}, 0)
// TODO(before merge): when does this happen?
if err := parseJSONFile(zipDirPath, "system.jobs.json", &jobsTableJSON); err != nil {
return nil, nil, nil, errors.Wrapf(err, "failed to parse system.jobs.json")
if err := parseJSONFile(zipDirPath, "crdb_internal.system_jobs.json", &jobsTableJSON); err != nil {
return nil, nil, nil, errors.Wrapf(err, "failed to parse crdb_internal.system_jobs.json")
}
for _, job := range jobsTableJSON {
row := jobs.JobMetadata{
Expand Down
71 changes: 59 additions & 12 deletions pkg/cli/doctor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,14 @@ import (
"github.com/cockroachdb/datadriven"
)

var descriptorWithFKMutation = `'{
func descriptorWithFKMutation(isDropped bool) string {
dropState := ``
dropTime := ``
if isDropped {
dropState = `"state": "DROP",`
dropTime = `"dropTime": "1713940113794672911",`
}
return `'{
"table": {
"columns": [
{
Expand All @@ -32,8 +39,9 @@ var descriptorWithFKMutation = `'{
"width": 64
}
}
],
"createAsOfTime": {
],` +
dropTime +
`"createAsOfTime": {
"logical": 1,
"wallTime": "1713940112376217646"
},
Expand Down Expand Up @@ -85,7 +93,7 @@ var descriptorWithFKMutation = `'{
"state": "DELETE_ONLY"
}
],
"name": "bar",
"name": "foo",
"nextColumnId": 2,
"nextConstraintId": 3,
"nextFamilyId": 1,
Expand All @@ -107,7 +115,7 @@ var descriptorWithFKMutation = `'{
1
],
"keyColumnNames": [
"col143_w3_144"
"i"
],
"name": "table_w3_143_pkey",
"partitioning": {},
Expand All @@ -133,15 +141,18 @@ var descriptorWithFKMutation = `'{
},
"replacementOf": {
"time": {}
},
"unexposedParentSchemaId": 381,
},` +
dropState +
`"unexposedParentSchemaId": 381,
"version": "2"
}
}'`
}

// This test doctoring a secure cluster.
func TestDoctorCluster(t *testing.T) {
defer leaktest.AfterTest(t)()
//
c := NewCLITest(TestCLIParams{T: t})
defer c.Cleanup()

Expand Down Expand Up @@ -174,16 +185,20 @@ func TestDoctorCluster(t *testing.T) {
// TestDoctorClusterBroken tests that debug doctor examine will pick up a multitude of issues on a corrupt descriptor.
func TestDoctorClusterBroken(t *testing.T) {
defer leaktest.AfterTest(t)()
c := NewCLITest(TestCLIParams{T: t})
c := NewCLITest(TestCLIParams{T: t, DisableAutoStats: true})
defer c.Cleanup()

jobDesc := fmt.Sprintf("SELECT crdb_internal.unsafe_upsert_descriptor('foo'::regclass::oid::int,"+
"crdb_internal.json_to_pb('cockroach.sql.sqlbase.Descriptor', %s::jsonb), true)", descriptorWithFKMutation)
desc := fmt.Sprintf("SELECT crdb_internal.unsafe_upsert_descriptor('foo'::regclass::oid::int,"+
"crdb_internal.json_to_pb('cockroach.sql.sqlbase.Descriptor', %s::jsonb), true)", descriptorWithFKMutation(false /* isDropped */))

// Introduce a descriptor with an attached job mutation (along with other issues).
// Introduce a descriptor with an attached job mutation (along with other issues). We want to ensure that the number of
// jobs created is deterministic (auto table stats will be collected due to the "schema change" on foo); therefore,
// we should disable automatic stats collection and instead create our own.
c.RunWithArgs([]string{"sql", "-e", strings.Join([]string{
"CREATE TABLE foo (i INT)",
jobDesc,
desc,
"CREATE STATISTICS foo_stats FROM foo",
"SELECT pg_catalog.pg_sleep(1)",
}, ";\n"),
})

Expand All @@ -200,6 +215,38 @@ func TestDoctorClusterBroken(t *testing.T) {
})
}

// TestDoctorClusterDropped tests that debug doctor examine will avoid validating dropped descriptors.
func TestDoctorClusterDropped(t *testing.T) {
defer leaktest.AfterTest(t)()
c := NewCLITest(TestCLIParams{T: t})
defer c.Cleanup()

desc := fmt.Sprintf("SELECT crdb_internal.unsafe_upsert_descriptor('foo'::regclass::oid::int,"+
"crdb_internal.json_to_pb('cockroach.sql.sqlbase.Descriptor', %s::jsonb), true)", descriptorWithFKMutation(true /* isDropped */))
// Introduce a dropped descriptor with an attached job mutation (along with other issues).
c.RunWithArgs([]string{"sql", "-e", strings.Join([]string{
"CREATE TABLE foo (i INT)",
desc,
"INSERT INTO system.users VALUES ('node', NULL, true, 3)",
"GRANT node TO root",
"DELETE FROM system.namespace WHERE name = 'foo'",
"SELECT pg_catalog.pg_sleep(1)",
}, ";\n"),
})

t.Run("examine", func(t *testing.T) {
out, err := c.RunWithCapture("debug doctor examine cluster")
if err != nil {
t.Fatal(err)
}

// Using datadriven allows TESTFLAGS=-rewrite.
datadriven.RunTest(t, datapathutils.TestDataPath(t, "doctor", "test_examine_cluster_dropped"), func(t *testing.T, td *datadriven.TestData) string {
return out
})
})
}

// TestDoctorZipDir tests the operation of zip over secure clusters.
func TestDoctorZipDir(t *testing.T) {
defer leaktest.AfterTest(t)()
Expand Down
6 changes: 6 additions & 0 deletions pkg/cli/testdata/doctor/test_examine_cluster_dropped
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
debug doctor examine cluster
----
debug doctor examine cluster
Examining 62 descriptors and 62 namespace entries...
Examining 8 jobs...
No problems found!
10 changes: 6 additions & 4 deletions pkg/cli/testdata/doctor/test_examine_cluster_jobs
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,10 @@ debug doctor examine cluster
----
debug doctor examine cluster
Examining 62 descriptors and 63 namespace entries...
ParentID 183, ParentSchemaID 381: relation "bar" (104): index "table_w3_143_pkey" key column ID 1 should have name "i", but found name "col143_w3_144"
ParentID 183, ParentSchemaID 381: relation "bar" (104): mutation job 962952277419655169: job 962952277419655169 not found
ParentID 100, ParentSchemaID 101: namespace entry "foo" (104): mismatched name "bar" in relation descriptor
Examining 60 jobs...
ParentID 183, ParentSchemaID 381: relation "foo" (104): referenced database ID 183: referenced descriptor not found
ParentID 183, ParentSchemaID 381: relation "foo" (104): referenced schema ID 381: referenced descriptor not found
ParentID 183, ParentSchemaID 381: relation "foo" (104): expected matching namespace entry, found none
ParentID 183, ParentSchemaID 381: relation "foo" (104): mutation job 962952277419655169: job 962952277419655169 not found
ParentID 100, ParentSchemaID 101: namespace entry "foo" (104): mismatched name "foo" in relation descriptor
Examining 8 jobs...
ERROR: validation failed
11 changes: 11 additions & 0 deletions pkg/cli/testutils.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/security/securitytest"
"github.com/cockroachdb/cockroach/pkg/security/username"
"github.com/cockroachdb/cockroach/pkg/sql/sqlstats"
"github.com/cockroachdb/cockroach/pkg/sql/stats"
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/errors"
Expand Down Expand Up @@ -100,6 +101,10 @@ type TestCLIParams struct {
// UseSystemTenant is used to force the test to target the system tenant
// in a shared process multitenant test.
UseSystemTenant bool

// DisableAutoStats is used to disable the collection of automatic table statistics
// for the entire cluster.
DisableAutoStats bool
}

// testTempFilePrefix is a sentinel marker to be used as the prefix of a
Expand Down Expand Up @@ -145,6 +150,11 @@ func newCLITestWithArgs(params TestCLIParams, argsFn func(args *base.TestServerA

c.cleanupFunc = func() error { return nil }

settings := makeClusterSettings()
if params.DisableAutoStats {
stats.AutomaticStatisticsClusterMode.Override(context.Background(), &settings.SV, false)
}

if !params.NoServer {
if !params.Insecure {
c.cleanupFunc = securitytest.CreateTestCerts(certsDir)
Expand All @@ -154,6 +164,7 @@ func newCLITestWithArgs(params TestCLIParams, argsFn func(args *base.TestServerA
DefaultTestTenant: base.TestControlsTenantsExplicitly,

Insecure: params.Insecure,
Settings: settings,
SSLCertsDir: c.certsDir,
StoreSpecs: params.StoreSpecs,
Locality: params.Locality,
Expand Down
4 changes: 4 additions & 0 deletions pkg/sql/doctor/doctor.go
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,10 @@ func ExamineDescriptors(
for _, row := range descTable {
id := descpb.ID(row.ID)
desc := descLookupFn(id)
// No need to validate dropped descriptors
if desc.Dropped() {
continue
}
ve := cb.ValidateWithRecover(ctx, version, desc)
for _, err := range ve {
problemsFound = true
Expand Down
12 changes: 4 additions & 8 deletions pkg/sql/doctor/doctor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -196,10 +196,8 @@ func TestExamineDescriptors(t *testing.T) {
}}),
},
},
expected: `Examining 1 descriptors and 0 namespace entries...
ParentID 0, ParentSchemaID 29: relation "foo" (1): invalid parent ID 0
ParentID 0, ParentSchemaID 29: relation "foo" (1): table must contain at least 1 column
`,
valid: true,
expected: "Examining 1 descriptors and 0 namespace entries...\n",
},
{ // 5
descTable: doctor.DescriptorTable{
Expand Down Expand Up @@ -415,12 +413,10 @@ func TestExamineDescriptors(t *testing.T) {
},
},
namespaceTable: doctor.NamespaceTable{
{NameInfo: descpb.NameInfo{ParentID: 52, ParentSchemaID: 29, Name: "t"}, ID: 51},
{NameInfo: descpb.NameInfo{Name: "db"}, ID: 52},
},
expected: `Examining 2 descriptors and 2 namespace entries...
ParentID 52, ParentSchemaID 29: namespace entry "t" (51): descriptor is being dropped
`,
valid: true,
expected: "Examining 2 descriptors and 1 namespace entries...\n",
},
{ // 17
descTable: doctor.DescriptorTable{
Expand Down

0 comments on commit 52a697b

Please sign in to comment.