Skip to content

Commit

Permalink
jobs/jobsprofiler: limit retained dsp diagrams
Browse files Browse the repository at this point in the history
Release note (ops change): Fixed a bug where collection of debug information
for very long-running jobs could use excessive space in the job_info system table
and/or cause some interactions with the jobs system to become slow.

Epic: none.
  • Loading branch information
dt committed Jun 24, 2024
1 parent 281ddcf commit 69b285c
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 0 deletions.
1 change: 1 addition & 0 deletions pkg/jobs/jobsprofiler/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ go_test(
"//pkg/sql",
"//pkg/sql/execinfrapb",
"//pkg/sql/isql",
"//pkg/sql/physicalplan",
"//pkg/testutils",
"//pkg/testutils/jobutils",
"//pkg/testutils/serverutils",
Expand Down
25 changes: 25 additions & 0 deletions pkg/jobs/jobsprofiler/profiler.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,15 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
)

const MaxRetainedDSPDiagramsPerJob = 5

// dspDiagMaxCulledPerWrite limits how many old diagrams writing a new one will
// cull to try to maintain the limit of 5; typically it would cull no more than
// one in the steady-state but an upgrading cluster that has accumulated many
// old rows might try to cull more, so we bound how many are eligible at a time
// to some large but finite upper-bound.
const dspDiagMaxCulledPerWrite = 100

// StorePlanDiagram stores the DistSQL diagram generated from p in the job info
// table. The generation of the plan diagram and persistence to the info table
// are done asynchronously and this method does not block on their completion.
Expand All @@ -46,6 +55,22 @@ func StorePlanDiagram(

dspKey := profilerconstants.MakeDSPDiagramInfoKey(timeutil.Now().UnixNano())
infoStorage := jobs.InfoStorageForJob(txn, jobID)

// Limit total retained diagrams by culling older ones as needed.
count, err := infoStorage.Count(ctx, profilerconstants.DSPDiagramInfoKeyPrefix, profilerconstants.DSPDiagramInfoKeyMax)
if err != nil {
return err
}
const keep = MaxRetainedDSPDiagramsPerJob - 1
if toCull := min(count-keep, dspDiagMaxCulledPerWrite); toCull > 0 {
if err := infoStorage.DeleteRange(
ctx, profilerconstants.DSPDiagramInfoKeyPrefix, profilerconstants.DSPDiagramInfoKeyMax, toCull,
); err != nil {
return err
}
}

// Write the new diagram.
return infoStorage.Write(ctx, dspKey, []byte(diagURL.String()))
})
// Don't log the error if the context has been canceled. This will likely be
Expand Down
24 changes: 24 additions & 0 deletions pkg/jobs/jobsprofiler/profiler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/sql"
"github.com/cockroachdb/cockroach/pkg/sql/execinfrapb"
"github.com/cockroachdb/cockroach/pkg/sql/isql"
"github.com/cockroachdb/cockroach/pkg/sql/physicalplan"
"github.com/cockroachdb/cockroach/pkg/testutils"
"github.com/cockroachdb/cockroach/pkg/testutils/jobutils"
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
Expand All @@ -51,6 +52,29 @@ func TestProfilerStorePlanDiagram(t *testing.T) {
ctx := context.Background()
defer s.Stopper().Stop(ctx)

// First verify that directly calling StorePlanDiagram writes n times causes
// the expected number of persisted rows in job_info, respecting the limit.
db := s.ExecutorConfig().(sql.ExecutorConfig).InternalDB
const fakeJobID = 4567
plan := &sql.PhysicalPlan{PhysicalPlan: physicalplan.MakePhysicalPlan(&physicalplan.PhysicalInfrastructure{})}
for i := 1; i < 10; i++ {
jobsprofiler.StorePlanDiagram(ctx, s.ApplicationLayer().AppStopper(), plan, db, fakeJobID)
testutils.SucceedsSoon(t, func() error {
var count int
if err := sqlDB.QueryRow(
`SELECT count(*) FROM system.job_info WHERE job_id = $1`, fakeJobID,
).Scan(&count); err != nil {
return err
}
if expected := min(i, jobsprofiler.MaxRetainedDSPDiagramsPerJob); count != expected {
return errors.Errorf("expected %d rows, got %d", expected, count)
}
return nil
})
}

// Now run various jobs that have been extended to persist diagrams and make
// sure that they also create persisted diagram rows.
_, err := sqlDB.Exec(`CREATE DATABASE test`)
require.NoError(t, err)
_, err = sqlDB.Exec(`CREATE TABLE foo (id INT PRIMARY KEY)`)
Expand Down
3 changes: 3 additions & 0 deletions pkg/jobs/jobsprofiler/profilerconstants/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,9 @@ import (

const DSPDiagramInfoKeyPrefix = "~dsp-diag-url-"

// DSPDiagramInfoKeyMax sorts after any diagram info key, because `:“ > [0-9].
const DSPDiagramInfoKeyMax = DSPDiagramInfoKeyPrefix + ":"

// MakeDSPDiagramInfoKey constructs an ephemeral DSP diagram info key.
func MakeDSPDiagramInfoKey(timestampInNanos int64) string {
return fmt.Sprintf("%s%d", DSPDiagramInfoKeyPrefix, timestampInNanos)
Expand Down

0 comments on commit 69b285c

Please sign in to comment.