From 69b285c8221675fe7500416fe29cc5e410f689c0 Mon Sep 17 00:00:00 2001 From: David Taylor Date: Sun, 23 Jun 2024 18:07:37 +0000 Subject: [PATCH] jobs/jobsprofiler: limit retained dsp diagrams Release note (ops change): Fixed a bug where collection of debug information for very long-running jobs could use excessive space in the job_info system table and/or cause some interactions with the jobs system to become slow. Epic: none. --- pkg/jobs/jobsprofiler/BUILD.bazel | 1 + pkg/jobs/jobsprofiler/profiler.go | 25 +++++++++++++++++++ pkg/jobs/jobsprofiler/profiler_test.go | 24 ++++++++++++++++++ .../profilerconstants/constants.go | 3 +++ 4 files changed, 53 insertions(+) diff --git a/pkg/jobs/jobsprofiler/BUILD.bazel b/pkg/jobs/jobsprofiler/BUILD.bazel index 35cd25c25f68..193bb0c8f58d 100644 --- a/pkg/jobs/jobsprofiler/BUILD.bazel +++ b/pkg/jobs/jobsprofiler/BUILD.bazel @@ -38,6 +38,7 @@ go_test( "//pkg/sql", "//pkg/sql/execinfrapb", "//pkg/sql/isql", + "//pkg/sql/physicalplan", "//pkg/testutils", "//pkg/testutils/jobutils", "//pkg/testutils/serverutils", diff --git a/pkg/jobs/jobsprofiler/profiler.go b/pkg/jobs/jobsprofiler/profiler.go index fd365d17964c..17ccfd8f43f2 100644 --- a/pkg/jobs/jobsprofiler/profiler.go +++ b/pkg/jobs/jobsprofiler/profiler.go @@ -25,6 +25,15 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/timeutil" ) +const MaxRetainedDSPDiagramsPerJob = 5 + +// dspDiagMaxCulledPerWrite limits how many old diagrams writing a new one will +// cull to try to maintain the limit of 5; typically it would cull no more than +// one in the steady-state but an upgrading cluster that has accumulated many +// old rows might try to cull more, so we bound how many are eligible at a time +// to some large but finite upper-bound. +const dspDiagMaxCulledPerWrite = 100 + // StorePlanDiagram stores the DistSQL diagram generated from p in the job info // table. The generation of the plan diagram and persistence to the info table // are done asynchronously and this method does not block on their completion. @@ -46,6 +55,22 @@ func StorePlanDiagram( dspKey := profilerconstants.MakeDSPDiagramInfoKey(timeutil.Now().UnixNano()) infoStorage := jobs.InfoStorageForJob(txn, jobID) + + // Limit total retained diagrams by culling older ones as needed. + count, err := infoStorage.Count(ctx, profilerconstants.DSPDiagramInfoKeyPrefix, profilerconstants.DSPDiagramInfoKeyMax) + if err != nil { + return err + } + const keep = MaxRetainedDSPDiagramsPerJob - 1 + if toCull := min(count-keep, dspDiagMaxCulledPerWrite); toCull > 0 { + if err := infoStorage.DeleteRange( + ctx, profilerconstants.DSPDiagramInfoKeyPrefix, profilerconstants.DSPDiagramInfoKeyMax, toCull, + ); err != nil { + return err + } + } + + // Write the new diagram. return infoStorage.Write(ctx, dspKey, []byte(diagURL.String())) }) // Don't log the error if the context has been canceled. This will likely be diff --git a/pkg/jobs/jobsprofiler/profiler_test.go b/pkg/jobs/jobsprofiler/profiler_test.go index 413a485ccbc4..0cf467d5486e 100644 --- a/pkg/jobs/jobsprofiler/profiler_test.go +++ b/pkg/jobs/jobsprofiler/profiler_test.go @@ -27,6 +27,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/sql" "github.com/cockroachdb/cockroach/pkg/sql/execinfrapb" "github.com/cockroachdb/cockroach/pkg/sql/isql" + "github.com/cockroachdb/cockroach/pkg/sql/physicalplan" "github.com/cockroachdb/cockroach/pkg/testutils" "github.com/cockroachdb/cockroach/pkg/testutils/jobutils" "github.com/cockroachdb/cockroach/pkg/testutils/serverutils" @@ -51,6 +52,29 @@ func TestProfilerStorePlanDiagram(t *testing.T) { ctx := context.Background() defer s.Stopper().Stop(ctx) + // First verify that directly calling StorePlanDiagram writes n times causes + // the expected number of persisted rows in job_info, respecting the limit. + db := s.ExecutorConfig().(sql.ExecutorConfig).InternalDB + const fakeJobID = 4567 + plan := &sql.PhysicalPlan{PhysicalPlan: physicalplan.MakePhysicalPlan(&physicalplan.PhysicalInfrastructure{})} + for i := 1; i < 10; i++ { + jobsprofiler.StorePlanDiagram(ctx, s.ApplicationLayer().AppStopper(), plan, db, fakeJobID) + testutils.SucceedsSoon(t, func() error { + var count int + if err := sqlDB.QueryRow( + `SELECT count(*) FROM system.job_info WHERE job_id = $1`, fakeJobID, + ).Scan(&count); err != nil { + return err + } + if expected := min(i, jobsprofiler.MaxRetainedDSPDiagramsPerJob); count != expected { + return errors.Errorf("expected %d rows, got %d", expected, count) + } + return nil + }) + } + + // Now run various jobs that have been extended to persist diagrams and make + // sure that they also create persisted diagram rows. _, err := sqlDB.Exec(`CREATE DATABASE test`) require.NoError(t, err) _, err = sqlDB.Exec(`CREATE TABLE foo (id INT PRIMARY KEY)`) diff --git a/pkg/jobs/jobsprofiler/profilerconstants/constants.go b/pkg/jobs/jobsprofiler/profilerconstants/constants.go index 22532607f39c..d0da48c31f47 100644 --- a/pkg/jobs/jobsprofiler/profilerconstants/constants.go +++ b/pkg/jobs/jobsprofiler/profilerconstants/constants.go @@ -21,6 +21,9 @@ import ( const DSPDiagramInfoKeyPrefix = "~dsp-diag-url-" +// DSPDiagramInfoKeyMax sorts after any diagram info key, because `:“ > [0-9]. +const DSPDiagramInfoKeyMax = DSPDiagramInfoKeyPrefix + ":" + // MakeDSPDiagramInfoKey constructs an ephemeral DSP diagram info key. func MakeDSPDiagramInfoKey(timestampInNanos int64) string { return fmt.Sprintf("%s%d", DSPDiagramInfoKeyPrefix, timestampInNanos)