Skip to content

Commit

Permalink
sql: scaffolding for db metadata update job
Browse files Browse the repository at this point in the history
This commit creates a new forever running BG job called
UpdateCachedTableMetadata. This is just some scaffolding -
the job doesn't do anything right now but it will be used
to populate the system table storing cached table metadata
used by obs surfaces.

Epic: none

Release note: None
  • Loading branch information
xinhaoz committed Aug 22, 2024
1 parent 4625d8b commit 163400d
Show file tree
Hide file tree
Showing 17 changed files with 182 additions and 24 deletions.
12 changes: 12 additions & 0 deletions docs/generated/metrics/metrics.html
Original file line number Diff line number Diff line change
Expand Up @@ -1362,6 +1362,18 @@
<tr><td>APPLICATION</td><td>jobs.typedesc_schema_change.resume_completed</td><td>Number of typedesc_schema_change jobs which successfully resumed to completion</td><td>jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.typedesc_schema_change.resume_failed</td><td>Number of typedesc_schema_change jobs which failed with a non-retriable error</td><td>jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.typedesc_schema_change.resume_retry_error</td><td>Number of typedesc_schema_change jobs which failed with a retriable error</td><td>jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.update_cached_table_metadata.currently_idle</td><td>Number of update_cached_table_metadata jobs currently considered Idle and can be freely shut down</td><td>jobs</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>jobs.update_cached_table_metadata.currently_paused</td><td>Number of update_cached_table_metadata jobs currently considered Paused</td><td>jobs</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>jobs.update_cached_table_metadata.currently_running</td><td>Number of update_cached_table_metadata jobs currently running in Resume or OnFailOrCancel state</td><td>jobs</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>jobs.update_cached_table_metadata.expired_pts_records</td><td>Number of expired protected timestamp records owned by update_cached_table_metadata jobs</td><td>records</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.update_cached_table_metadata.fail_or_cancel_completed</td><td>Number of update_cached_table_metadata jobs which successfully completed their failure or cancelation process</td><td>jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.update_cached_table_metadata.fail_or_cancel_failed</td><td>Number of update_cached_table_metadata jobs which failed with a non-retriable error on their failure or cancelation process</td><td>jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.update_cached_table_metadata.fail_or_cancel_retry_error</td><td>Number of update_cached_table_metadata jobs which failed with a retriable error on their failure or cancelation process</td><td>jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.update_cached_table_metadata.protected_age_sec</td><td>The age of the oldest PTS record protected by update_cached_table_metadata jobs</td><td>seconds</td><td>GAUGE</td><td>SECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>jobs.update_cached_table_metadata.protected_record_count</td><td>Number of protected timestamp records held by update_cached_table_metadata jobs</td><td>records</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>jobs.update_cached_table_metadata.resume_completed</td><td>Number of update_cached_table_metadata jobs which successfully resumed to completion</td><td>jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.update_cached_table_metadata.resume_failed</td><td>Number of update_cached_table_metadata jobs which failed with a non-retriable error</td><td>jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.update_cached_table_metadata.resume_retry_error</td><td>Number of update_cached_table_metadata jobs which failed with a retriable error</td><td>jobs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>kv.protectedts.reconciliation.errors</td><td>number of errors encountered during reconciliation runs on this node</td><td>Count</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>kv.protectedts.reconciliation.num_runs</td><td>number of successful reconciliation runs on this node</td><td>Count</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>kv.protectedts.reconciliation.records_processed</td><td>number of records processed without error during reconciliation on this node</td><td>Count</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
Expand Down
2 changes: 1 addition & 1 deletion pkg/cli/testdata/doctor/test_examine_cluster
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ debug doctor examine cluster
debug doctor examine cluster
Examining 64 descriptors and 63 namespace entries...
ParentID 100, ParentSchemaID 101: relation "foo" (105): expected matching namespace entry, found none
Examining 10 jobs...
Examining 11 jobs...
ERROR: validation failed
2 changes: 1 addition & 1 deletion pkg/cli/testdata/doctor/test_examine_cluster_dropped
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ debug doctor examine cluster
----
debug doctor examine cluster
Examining 63 descriptors and 63 namespace entries...
Examining 8 jobs...
Examining 9 jobs...
No problems found!
2 changes: 1 addition & 1 deletion pkg/cli/testdata/doctor/test_examine_cluster_jobs
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ Examining 63 descriptors and 64 namespace entries...
ParentID 183, ParentSchemaID 381: relation "foo" (104): expected matching namespace entry, found none
ParentID 183, ParentSchemaID 381: relation "foo" (104): mutation job 962952277419655169: job 962952277419655169 not found
ParentID 100, ParentSchemaID 101: namespace entry "foo" (104): mismatched name "foo" in relation descriptor
Examining 8 jobs...
Examining 9 jobs...
ERROR: validation failed
9 changes: 5 additions & 4 deletions pkg/jobs/jobs_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -216,10 +216,11 @@ func (rts *registryTestSuite) setUp(t *testing.T) func() {
ManagerDisableJobCreation: true,
}
args.Knobs.UpgradeManager = &upgradebase.TestingKnobs{
DontUseJobs: true,
SkipJobMetricsPollingJobBootstrap: true,
SkipUpdateSQLActivityJobBootstrap: true,
SkipMVCCStatisticsJobBootstrap: true,
DontUseJobs: true,
SkipJobMetricsPollingJobBootstrap: true,
SkipUpdateSQLActivityJobBootstrap: true,
SkipMVCCStatisticsJobBootstrap: true,
SkipUpdateCachedTableMetadataBootstrap: true,
}
args.Knobs.KeyVisualizer = &keyvisualizer.TestingKnobs{SkipJobBootstrap: true}

Expand Down
6 changes: 6 additions & 0 deletions pkg/jobs/jobspb/jobs.proto
Original file line number Diff line number Diff line change
Expand Up @@ -1334,6 +1334,9 @@ message MVCCStatisticsJobProgress {

}

message UpdateCachedTableMetadataDetails {}
message UpdateCachedTableMetadataProgress {}

message ImportRollbackDetails {
// TableID is the descriptor ID of table that should be rolled back.
//
Expand Down Expand Up @@ -1413,6 +1416,7 @@ message Payload {
ImportRollbackDetails import_rollback_details = 46;
HistoryRetentionDetails history_retention_details = 47;
LogicalReplicationDetails logical_replication_details = 48;
UpdateCachedTableMetadataDetails update_cached_table_metadata_details = 49;
}
reserved 26;
// PauseReason is used to describe the reason that the job is currently paused
Expand Down Expand Up @@ -1490,6 +1494,7 @@ message Progress {
ImportRollbackProgress import_rollback_progress = 34;
HistoryRetentionProgress HistoryRetentionProgress = 35;
LogicalReplicationProgress LogicalReplication = 36;
UpdateCachedTableMetadataProgress cached_db_metadata = 37;
}

uint64 trace_id = 21 [(gogoproto.nullable) = false, (gogoproto.customname) = "TraceID", (gogoproto.customtype) = "github.com/cockroachdb/cockroach/pkg/util/tracing/tracingpb.TraceID"];
Expand Down Expand Up @@ -1530,6 +1535,7 @@ enum Type {
HISTORY_RETENTION = 26 [(gogoproto.enumvalue_customname) = "TypeHistoryRetention"];
LOGICAL_REPLICATION = 27 [(gogoproto.enumvalue_customname) = "TypeLogicalReplication"];
AUTO_CREATE_PARTIAL_STATS = 28 [(gogoproto.enumvalue_customname) = "TypeAutoCreatePartialStats"];
UPDATE_CACHED_TABLE_METADATA = 29 [(gogoproto.enumvalue_customname) = "TypeUpdateCachedTableMetadata"];
}

message Job {
Expand Down
16 changes: 15 additions & 1 deletion pkg/jobs/jobspb/wrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ var (
_ Details = ImportRollbackDetails{}
_ Details = HistoryRetentionDetails{}
_ Details = LogicalReplicationDetails{}
_ Details = UpdateCachedTableMetadataDetails{}
)

// ProgressDetails is a marker interface for job progress details proto structs.
Expand Down Expand Up @@ -77,6 +78,7 @@ var (
_ ProgressDetails = ImportRollbackProgress{}
_ ProgressDetails = HistoryRetentionProgress{}
_ ProgressDetails = LogicalReplicationProgress{}
_ ProgressDetails = UpdateCachedTableMetadataProgress{}
)

// Type returns the payload's job type and panics if the type is invalid.
Expand Down Expand Up @@ -169,6 +171,7 @@ var AutomaticJobTypes = [...]Type{
TypeKeyVisualizer,
TypeAutoUpdateSQLActivity,
TypeMVCCStatisticsUpdate,
TypeUpdateCachedTableMetadata,
}

// DetailsType returns the type for a payload detail.
Expand Down Expand Up @@ -232,6 +235,8 @@ func DetailsType(d isPayload_Details) (Type, error) {
return TypeHistoryRetention, nil
case *Payload_LogicalReplicationDetails:
return TypeLogicalReplication, nil
case *Payload_UpdateCachedTableMetadataDetails:
return TypeUpdateCachedTableMetadata, nil
default:
return TypeUnspecified, errors.Newf("Payload.Type called on a payload with an unknown details type: %T", d)
}
Expand Down Expand Up @@ -283,6 +288,7 @@ var JobDetailsForEveryJobType = map[Type]Details{
TypeImportRollback: ImportRollbackDetails{},
TypeHistoryRetention: HistoryRetentionDetails{},
TypeLogicalReplication: LogicalReplicationDetails{},
TypeUpdateCachedTableMetadata: UpdateCachedTableMetadataDetails{},
}

// WrapProgressDetails wraps a ProgressDetails object in the protobuf wrapper
Expand Down Expand Up @@ -346,6 +352,8 @@ func WrapProgressDetails(details ProgressDetails) interface {
return &Progress_HistoryRetentionProgress{HistoryRetentionProgress: &d}
case LogicalReplicationProgress:
return &Progress_LogicalReplication{LogicalReplication: &d}
case UpdateCachedTableMetadataProgress:
return &Progress_CachedDbMetadata{CachedDbMetadata: &d}
default:
panic(errors.AssertionFailedf("WrapProgressDetails: unknown progress type %T", d))
}
Expand Down Expand Up @@ -407,6 +415,8 @@ func (p *Payload) UnwrapDetails() Details {
return *d.HistoryRetentionDetails
case *Payload_LogicalReplicationDetails:
return *d.LogicalReplicationDetails
case *Payload_UpdateCachedTableMetadataDetails:
return *d.UpdateCachedTableMetadataDetails
default:
return nil
}
Expand Down Expand Up @@ -468,6 +478,8 @@ func (p *Progress) UnwrapDetails() ProgressDetails {
return *d.HistoryRetentionProgress
case *Progress_LogicalReplication:
return *d.LogicalReplication
case *Progress_CachedDbMetadata:
return *d.CachedDbMetadata
default:
return nil
}
Expand Down Expand Up @@ -553,6 +565,8 @@ func WrapPayloadDetails(details Details) interface {
return &Payload_HistoryRetentionDetails{HistoryRetentionDetails: &d}
case LogicalReplicationDetails:
return &Payload_LogicalReplicationDetails{LogicalReplicationDetails: &d}
case UpdateCachedTableMetadataDetails:
return &Payload_UpdateCachedTableMetadataDetails{UpdateCachedTableMetadataDetails: &d}
default:
panic(errors.AssertionFailedf("jobs.WrapPayloadDetails: unknown details type %T", d))
}
Expand Down Expand Up @@ -588,7 +602,7 @@ const (
func (Type) SafeValue() {}

// NumJobTypes is the number of jobs types.
const NumJobTypes = 29
const NumJobTypes = 30

// ChangefeedDetailsMarshaler allows for dependency injection of
// cloud.SanitizeExternalStorageURI to avoid the dependency from this
Expand Down
2 changes: 2 additions & 0 deletions pkg/jobs/registry.go
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,8 @@ const (
// MVCCStatisticsJobID A static job ID used for the MVCC statistics update
// job.
MVCCStatisticsJobID = jobspb.JobID(104)

UpdateCachedTableMetadataJobID = jobspb.JobID(105)
)

// MakeJobID generates a new job ID.
Expand Down
25 changes: 14 additions & 11 deletions pkg/jobs/registry_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -126,9 +126,10 @@ func TestRegistryGC(t *testing.T) {
},
UpgradeManager: &upgradebase.TestingKnobs{
// This test wants to look at job records.
DontUseJobs: true,
SkipJobMetricsPollingJobBootstrap: true,
SkipMVCCStatisticsJobBootstrap: true,
DontUseJobs: true,
SkipJobMetricsPollingJobBootstrap: true,
SkipMVCCStatisticsJobBootstrap: true,
SkipUpdateCachedTableMetadataBootstrap: true,
},
KeyVisualizer: &keyvisualizer.TestingKnobs{
SkipJobBootstrap: true,
Expand Down Expand Up @@ -284,10 +285,11 @@ func TestRegistryGCPagination(t *testing.T) {
},
UpgradeManager: &upgradebase.TestingKnobs{
// This test wants to count job records.
DontUseJobs: true,
SkipJobMetricsPollingJobBootstrap: true,
SkipUpdateSQLActivityJobBootstrap: true,
SkipMVCCStatisticsJobBootstrap: true,
DontUseJobs: true,
SkipJobMetricsPollingJobBootstrap: true,
SkipUpdateSQLActivityJobBootstrap: true,
SkipMVCCStatisticsJobBootstrap: true,
SkipUpdateCachedTableMetadataBootstrap: true,
},
KeyVisualizer: &keyvisualizer.TestingKnobs{
SkipJobBootstrap: true,
Expand Down Expand Up @@ -758,10 +760,11 @@ func TestRetriesWithExponentialBackoff(t *testing.T) {
ManagerDisableJobCreation: true,
},
UpgradeManager: &upgradebase.TestingKnobs{
DontUseJobs: true,
SkipJobMetricsPollingJobBootstrap: true,
SkipUpdateSQLActivityJobBootstrap: true,
SkipMVCCStatisticsJobBootstrap: true,
DontUseJobs: true,
SkipJobMetricsPollingJobBootstrap: true,
SkipUpdateSQLActivityJobBootstrap: true,
SkipMVCCStatisticsJobBootstrap: true,
SkipUpdateCachedTableMetadataBootstrap: true,
},
KeyVisualizer: &keyvisualizer.TestingKnobs{
SkipJobBootstrap: true,
Expand Down
1 change: 1 addition & 0 deletions pkg/sql/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,7 @@ go_library(
"unsplit.go",
"unsupported_vars.go",
"update.go",
"update_cached_table_metadata_job.go",
"upsert.go",
"user.go",
"values.go",
Expand Down
65 changes: 65 additions & 0 deletions pkg/sql/update_cached_table_metadata_job.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// Copyright 2024 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package sql

import (
"context"

"github.com/cockroachdb/cockroach/pkg/jobs"
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/errors"
)

// tableMetadataUpdateJobResumer is the jobs.Resumer registered in this file
// for jobs of type jobspb.TypeUpdateCachedTableMetadata.
type tableMetadataUpdateJobResumer struct {
	// job is the job handed to the constructor registered in init; Resume
	// marks it idle.
	job *jobs.Job
}

// Compile-time check that the resumer satisfies jobs.Resumer.
var _ jobs.Resumer = (*tableMetadataUpdateJobResumer)(nil)

// Resume implements jobs.Resumer. The job currently performs no work: it logs
// that it has started, marks itself idle, and then parks until the supplied
// context is done, at which point it returns nil.
func (j *tableMetadataUpdateJobResumer) Resume(ctx context.Context, execCtxI interface{}) error {
	log.Infof(ctx, "starting table metadata update job")

	// There is nothing to do yet, so report the job as idle right away.
	j.job.MarkIdle(true)

	// Park here until the context is done.
	select {
	case <-ctx.Done():
		return nil
	}
}

// OnFailOrCancel implements jobs.Resumer.
//
// The job is not expected to be canceled. If jobErr reports a cancellation
// (per jobs.HasErrJobCanceled), an assertion error wrapping jobErr is logged.
// No cleanup is performed and the method always returns nil.
func (j *tableMetadataUpdateJobResumer) OnFailOrCancel(
	ctx context.Context, execCtx interface{}, jobErr error,
) error {
	if jobs.HasErrJobCanceled(jobErr) {
		// The message must name this job; it previously named the MVCC
		// statistics update job, which made the log entry misleading.
		err := errors.NewAssertionErrorWithWrappedErrf(
			jobErr, "update cached table metadata job is not cancelable",
		)
		log.Errorf(ctx, "%v", err)
	}
	return nil
}

// CollectProfile implements jobs.Resumer. It is a no-op for this job: both
// arguments are ignored and nil is always returned.
func (j *tableMetadataUpdateJobResumer) CollectProfile(
	ctx context.Context, execCtx interface{},
) error {
	return nil
}

// init registers the resumer constructor for jobs of type
// jobspb.TypeUpdateCachedTableMetadata, passing the
// jobs.DisablesTenantCostControl option to the registration.
func init() {
	// newResumer wraps the given job in a tableMetadataUpdateJobResumer; the
	// cluster settings are not used.
	newResumer := func(job *jobs.Job, settings *cluster.Settings) jobs.Resumer {
		return &tableMetadataUpdateJobResumer{job: job}
	}

	jobs.RegisterConstructor(
		jobspb.TypeUpdateCachedTableMetadata, newResumer, jobs.DisablesTenantCostControl,
	)
}
2 changes: 2 additions & 0 deletions pkg/upgrade/upgradebase/testing_knobs.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ type TestingKnobs struct {
SkipUpdateSQLActivityJobBootstrap bool

SkipMVCCStatisticsJobBootstrap bool

SkipUpdateCachedTableMetadataBootstrap bool
}

// ModuleTestingKnobs makes TestingKnobs a base.ModuleTestingKnobs.
Expand Down
1 change: 1 addition & 0 deletions pkg/upgrade/upgrades/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ go_library(
"descriptor_utils.go",
"first_upgrade.go",
"permanent_create_jobs_metrics_polling_job.go",
"permanent_create_update_cached_table_metadata_job.go",
"permanent_ensure_sql_schema_telemetry_schedule.go",
"permanent_key_visualizer_migration.go",
"permanent_mvcc_statistics_migration.go",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// Copyright 2024 The Cockroach Authors.
//
// Use of this software is governed by the Business Source License
// included in the file licenses/BSL.txt.
//
// As of the Change Date specified in that file, in accordance with
// the Business Source License, use of this software will be governed
// by the Apache License, Version 2.0, included in the file
// licenses/APL.txt.

package upgrades

import (
"context"

"github.com/cockroachdb/cockroach/pkg/clusterversion"
"github.com/cockroachdb/cockroach/pkg/jobs"
"github.com/cockroachdb/cockroach/pkg/jobs/jobspb"
_ "github.com/cockroachdb/cockroach/pkg/jobs/metricspoller" // Ensure job implementation is linked.
"github.com/cockroachdb/cockroach/pkg/security/username"
"github.com/cockroachdb/cockroach/pkg/sql/isql"
"github.com/cockroachdb/cockroach/pkg/upgrade"
)

// createUpdateCachedTableMetadataJob creates the update-cached-table-metadata
// job under the static ID jobs.UpdateCachedTableMetadataJobID, inside a single
// transaction, via CreateIfNotExistAdoptableJobWithTxn. The record is marked
// non-cancelable and attributed to the node user.
//
// The step is skipped entirely (returning nil) when the
// SkipUpdateCachedTableMetadataBootstrap testing knob is set.
func createUpdateCachedTableMetadataJob(
	ctx context.Context, _ clusterversion.ClusterVersion, d upgrade.TenantDeps,
) error {
	if knobs := d.TestingKnobs; knobs != nil && knobs.SkipUpdateCachedTableMetadataBootstrap {
		return nil
	}

	// createJob builds the job record and hands it to the registry within txn.
	createJob := func(ctx context.Context, txn isql.Txn) error {
		record := jobs.Record{
			JobID:         jobs.UpdateCachedTableMetadataJobID,
			Description:   jobspb.TypeUpdateCachedTableMetadata.String(),
			Details:       jobspb.UpdateCachedTableMetadataDetails{},
			Progress:      jobspb.UpdateCachedTableMetadataProgress{},
			CreatedBy:     &jobs.CreatedByInfo{Name: username.NodeUser, ID: username.NodeUserID},
			Username:      username.NodeUserName(),
			NonCancelable: true,
		}
		return d.JobRegistry.CreateIfNotExistAdoptableJobWithTxn(ctx, record, txn)
	}

	return d.DB.Txn(ctx, createJob)
}
1 change: 1 addition & 0 deletions pkg/upgrade/upgrades/permanent_upgrades.go
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ func bootstrapCluster(
{"create jobs metrics polling job", createJobsMetricsPollingJob},
{"create sql activity updater job", createActivityUpdateJobMigration},
{"create mvcc stats job", createMVCCStatisticsJob},
{"create update cached table metadata job", createUpdateCachedTableMetadataJob},
} {
log.Infof(ctx, "executing bootstrap step %q", u.name)
if err := u.fn(ctx, cv, deps); err != nil {
Expand Down
4 changes: 2 additions & 2 deletions pkg/upgrade/upgrades/upgrades.go
Original file line number Diff line number Diff line change
Expand Up @@ -108,10 +108,10 @@ var upgrades = []upgradebase.Upgrade{
),

upgrade.NewTenantUpgrade(
"add new table_metadata table to the system tenant",
"add new table_metadata table and job to the system tenant",
clusterversion.V24_3_TableMetadata.Version(),
upgrade.NoPrecondition,
addTableMetadataTable,
addTableMetadataTableAndJob,
upgrade.RestoreActionNotRequired("cluster restore does not restore this table"),
),

Expand Down
Loading

0 comments on commit 163400d

Please sign in to comment.