diff --git a/pkg/jobs/BUILD.bazel b/pkg/jobs/BUILD.bazel index 598f95505bfd..f1bec751295d 100644 --- a/pkg/jobs/BUILD.bazel +++ b/pkg/jobs/BUILD.bazel @@ -30,6 +30,7 @@ go_library( "//pkg/roachpb", "//pkg/scheduledjobs", "//pkg/security", + "//pkg/server/telemetry", "//pkg/settings", "//pkg/settings/cluster", "//pkg/sql/catalog", diff --git a/pkg/jobs/metrics.go b/pkg/jobs/metrics.go index 064ef97bdbd8..00d46df58484 100644 --- a/pkg/jobs/metrics.go +++ b/pkg/jobs/metrics.go @@ -16,6 +16,7 @@ import ( "time" "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" + "github.com/cockroachdb/cockroach/pkg/server/telemetry" "github.com/cockroachdb/cockroach/pkg/util/metric" io_prometheus_client "github.com/prometheus/client_model/go" ) @@ -150,3 +151,39 @@ func (m *Metrics) init(histogramWindowInterval time.Duration) { // MakeChangefeedMetricsHook allows for registration of changefeed metrics from // ccl code. var MakeChangefeedMetricsHook func(time.Duration) metric.Struct + +// JobTelemetryMetrics is a telemetry metrics for individual job types. +type JobTelemetryMetrics struct { + Successful telemetry.Counter + Failed telemetry.Counter + Canceled telemetry.Counter +} + +// newJobTelemetryMetrics creates a new JobTelemetryMetrics object +// for a given job type name. +func newJobTelemetryMetrics(jobName string) *JobTelemetryMetrics { + return &JobTelemetryMetrics{ + Successful: telemetry.GetCounterOnce(fmt.Sprintf("job.%s.successful", jobName)), + Failed: telemetry.GetCounterOnce(fmt.Sprintf("job.%s.failed", jobName)), + Canceled: telemetry.GetCounterOnce(fmt.Sprintf("job.%s.canceled", jobName)), + } +} + +// getJobTelemetryMetricsArray initializes an array of job related telemetry +// metrics +func getJobTelemetryMetricsArray() [jobspb.NumJobTypes]*JobTelemetryMetrics { + var metrics [jobspb.NumJobTypes]*JobTelemetryMetrics + for i := 0; i < jobspb.NumJobTypes; i++ { + jt := jobspb.Type(i) + if jt == jobspb.TypeUnspecified { // do not track TypeUnspecified + continue + } + typeStr := strings.ToLower(strings.Replace(jt.String(), " ", "_", -1)) + metrics[i] = newJobTelemetryMetrics(typeStr) + } + return metrics +} + +// TelemetryMetrics contains telemetry metrics for different +// job types. +var TelemetryMetrics = getJobTelemetryMetricsArray() diff --git a/pkg/jobs/registry.go b/pkg/jobs/registry.go index 101cf0a3fb6e..cb02b5670c78 100644 --- a/pkg/jobs/registry.go +++ b/pkg/jobs/registry.go @@ -24,6 +24,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/kv" "github.com/cockroachdb/cockroach/pkg/security" + "github.com/cockroachdb/cockroach/pkg/server/telemetry" "github.com/cockroachdb/cockroach/pkg/settings" "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/sql/catalog" @@ -1218,6 +1219,7 @@ func (r *Registry) stepThroughStateMachine( // restarted during the next adopt loop and reverting will be retried. return errors.Wrapf(err, "job %d: could not mark as canceled: %v", job.ID(), jobErr) } + telemetry.Inc(TelemetryMetrics[jobType].Canceled) return errors.WithSecondaryError(errors.Errorf("job %s", status), jobErr) case StatusSucceeded: if jobErr != nil { @@ -1232,6 +1234,7 @@ func (r *Registry) stepThroughStateMachine( // better. return r.stepThroughStateMachine(ctx, execCtx, resumer, job, StatusReverting, errors.Wrapf(err, "could not mark job %d as succeeded", job.ID())) } + telemetry.Inc(TelemetryMetrics[jobType].Successful) return nil case StatusReverting: if err := job.reverted(ctx, nil /* txn */, jobErr, nil /* fn */); err != nil { @@ -1286,6 +1289,7 @@ func (r *Registry) stepThroughStateMachine( // restarted during the next adopt loop and reverting will be retried. return errors.Wrapf(err, "job %d: could not mark as failed: %s", job.ID(), jobErr) } + telemetry.Inc(TelemetryMetrics[jobType].Failed) return jobErr default: return errors.NewAssertionErrorWithWrappedErrf(jobErr, diff --git a/pkg/sql/logictest/testdata/logic_test/alter_table b/pkg/sql/logictest/testdata/logic_test/alter_table index 0063789d8011..fa3271970a1d 100644 --- a/pkg/sql/logictest/testdata/logic_test/alter_table +++ b/pkg/sql/logictest/testdata/logic_test/alter_table @@ -1688,3 +1688,14 @@ SELECT count(descriptor_id) WHERE descriptor_id = ('test.public.t45985'::REGCLASS)::INT8; ---- 0 + +# Validate that the schema_change_successful metric +query T +SELECT feature_name FROM crdb_internal.feature_usage +WHERE feature_name IN ('job.schema_change.successful', +'job.schema_change.failed') AND +usage_count > 0 +ORDER BY feature_name DESC +---- +job.schema_change.successful +job.schema_change.failed diff --git a/pkg/sql/logictest/testdata/logic_test/distsql_stats b/pkg/sql/logictest/testdata/logic_test/distsql_stats index cfb8ce4f3eb7..eae91a2e8fce 100644 --- a/pkg/sql/logictest/testdata/logic_test/distsql_stats +++ b/pkg/sql/logictest/testdata/logic_test/distsql_stats @@ -1065,3 +1065,18 @@ SHOW STATISTICS USING JSON FOR TABLE greeting_stats statement ok ALTER TABLE greeting_stats INJECT STATISTICS '$stats' + +# Validate that the schema_change_successful metric +query T +SELECT feature_name FROM crdb_internal.feature_usage +WHERE feature_name in ('job.typedesc_schema_change.successful', +'job.schema_change.successful', +'job.create_stats.successful', +'job.auto_create_stats.successful') AND +usage_count > 0 +ORDER BY feature_name DESC +---- +job.typedesc_schema_change.successful +job.schema_change.successful +job.create_stats.successful +job.auto_create_stats.successful diff --git a/pkg/sql/logictest/testdata/logic_test/jobs b/pkg/sql/logictest/testdata/logic_test/jobs index 21214acf5197..79af2e7012cd 100644 --- a/pkg/sql/logictest/testdata/logic_test/jobs +++ b/pkg/sql/logictest/testdata/logic_test/jobs @@ -130,3 +130,14 @@ user testuser # testuser should no longer have the ability to control jobs. statement error pq: user testuser does not have CONTROLJOB privilege PAUSE JOB (SELECT job_id FROM [SHOW JOBS] WHERE user_name = 'testuser2' AND job_type = 'SCHEMA CHANGE GC') + +user root + +# Validate that the schema_change_successful metric +query T +SELECT feature_name FROM crdb_internal.feature_usage +WHERE feature_name in ('job.schema_change.successful') AND +usage_count > 0 +ORDER BY feature_name DESC +---- +job.schema_change.successful