server/status: add running non-idle jobs metric
Previously, serverless used the SQL jobs running metric to determine
whether a tenant process is idle and can be shut down. With the
introduction of continuously running jobs, that metric is no longer a
good indicator. A recent addition introduced per-job metrics that report
whether each job is running or idle, but the autoscaler doesn't care
about individual jobs; it only cares about the total number of jobs that
are running and haven't reported as idle. The pull rate is also very
high, so retrieving the individual running/idle metrics for every job
type isn't optimal. This PR therefore adds a single metric that
aggregates and tracks the total count of jobs that are running and not
idle.

Release justification: Bug fixes and low-risk updates to new functionality
Release note: None
darinpp committed Apr 5, 2022
1 parent b7b37f4 commit 09e5af0
Showing 4 changed files with 36 additions and 3 deletions.
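
To make the consumer side of the commit message concrete, the sketch below shows how an autoscaler might read the new aggregated gauge in a single scrape instead of walking every per-job-type running/idle metric. This is a hedged illustration only: the endpoint URL, the exported metric name (`jobs_running_non_idle`), and the shutdown rule are assumptions, not the actual serverless autoscaler code.

```go
// Rough sketch of the consumer side: scrape one aggregated gauge from a
// Prometheus text-format endpoint. The URL, metric name, and decision rule
// are illustrative assumptions.
package main

import (
	"bufio"
	"fmt"
	"net/http"
	"strconv"
	"strings"
)

// nonIdleJobCount returns the value reported for the aggregated non-idle
// jobs gauge on a Prometheus text-format endpoint.
func nonIdleJobCount(metricsURL string) (float64, error) {
	resp, err := http.Get(metricsURL)
	if err != nil {
		return 0, err
	}
	defer resp.Body.Close()

	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		line := scanner.Text()
		// Skip comments; match the gauge by its assumed export name.
		if strings.HasPrefix(line, "#") || !strings.HasPrefix(line, "jobs_running_non_idle") {
			continue
		}
		fields := strings.Fields(line)
		return strconv.ParseFloat(fields[len(fields)-1], 64)
	}
	if err := scanner.Err(); err != nil {
		return 0, err
	}
	return 0, fmt.Errorf("jobs_running_non_idle not found")
}

func main() {
	// Hypothetical tenant metrics endpoint; the real path and port differ.
	count, err := nonIdleJobCount("http://localhost:8080/metrics")
	if err != nil {
		fmt.Println("scrape failed:", err)
		return
	}
	if count == 0 {
		fmt.Println("no running non-idle jobs; tenant is a shutdown candidate")
	} else {
		fmt.Printf("%.0f non-idle job(s) still running\n", count)
	}
}
```

One gauge read replaces scanning two time series per job type at every poll, which is the point of the aggregation.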
13 changes: 13 additions & 0 deletions pkg/jobs/metrics.go
@@ -24,6 +24,8 @@ import (
// Metrics are for production monitoring of each job type.
type Metrics struct {
JobMetrics [jobspb.NumJobTypes]*JobTypeMetrics
// RunningNonIdleJobs is the total number of running jobs that are not idle.
RunningNonIdleJobs *metric.Gauge

RowLevelTTL metric.Struct
Changefeed metric.Struct
@@ -173,6 +175,16 @@ var (
Unit: metric.Unit_COUNT,
MetricType: io_prometheus_client.MetricType_GAUGE,
}

// MetaRunningNonIdleJobs is the count of currently running jobs that are not
// reporting as being idle.
MetaRunningNonIdleJobs = metric.Metadata{
Name: "jobs.running_non_idle",
Help: "number of running jobs that are not idle",
Measurement: "jobs",
Unit: metric.Unit_COUNT,
MetricType: io_prometheus_client.MetricType_GAUGE,
}
)

// MetricStruct implements the metric.Struct interface.
@@ -192,6 +204,7 @@ func (m *Metrics) init(histogramWindowInterval time.Duration) {
m.AdoptIterations = metric.NewCounter(metaAdoptIterations)
m.ClaimedJobs = metric.NewCounter(metaClaimedJobs)
m.ResumedJobs = metric.NewCounter(metaResumedClaimedJobs)
m.RunningNonIdleJobs = metric.NewGauge(MetaRunningNonIdleJobs)
for i := 0; i < jobspb.NumJobTypes; i++ {
jt := jobspb.Type(i)
if jt == jobspb.TypeUnspecified { // do not track TypeUnspecified
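
For readers outside the CockroachDB tree, here is a toy stand-in for the shape of this metrics.go change: per-job-type metrics plus one aggregate gauge. The helper at the bottom shows the per-scrape arithmetic the autoscaler would otherwise need across every job type, which the single jobs.running_non_idle gauge replaces. Gauge, numJobTypes, and the constructor are placeholders, not the util/metric package used by the real code.

```go
// Toy stand-in for the metrics.go change. Gauge is a minimal atomic wrapper,
// not CockroachDB's util/metric.Gauge; numJobTypes stands in for
// jobspb.NumJobTypes.
package jobsketch

import "sync/atomic"

// Gauge is a 64-bit value that can move up and down.
type Gauge struct{ v int64 }

func (g *Gauge) Inc(d int64)  { atomic.AddInt64(&g.v, d) }
func (g *Gauge) Dec(d int64)  { atomic.AddInt64(&g.v, -d) }
func (g *Gauge) Value() int64 { return atomic.LoadInt64(&g.v) }

const numJobTypes = 16 // placeholder

// JobTypeMetrics holds the per-type gauges the registry already exports.
type JobTypeMetrics struct {
	CurrentlyRunning Gauge // jobs of this type currently executing
	CurrentlyIdle    Gauge // subset of the above that reported as idle
}

// Metrics mirrors the patched struct: per-type metrics plus one aggregate.
type Metrics struct {
	JobMetrics         [numJobTypes]*JobTypeMetrics
	RunningNonIdleJobs *Gauge
}

// newMetrics initializes every per-type slot plus the aggregate gauge.
func newMetrics() *Metrics {
	m := &Metrics{RunningNonIdleJobs: &Gauge{}}
	for i := range m.JobMetrics {
		m.JobMetrics[i] = &JobTypeMetrics{}
	}
	return m
}

// aggregateFromPerType is the per-scrape arithmetic the aggregate gauge makes
// unnecessary: running minus idle, summed across every job type.
func aggregateFromPerType(m *Metrics) int64 {
	var total int64
	for _, jm := range m.JobMetrics {
		total += jm.CurrentlyRunning.Value() - jm.CurrentlyIdle.Value()
	}
	return total
}
```

Whether derived this way or tracked directly, the value is the same; tracking it directly just means the scraper touches one time series instead of two per job type.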
14 changes: 12 additions & 2 deletions pkg/jobs/registry.go
@@ -1125,7 +1125,11 @@ func (r *Registry) stepThroughStateMachine(
var err error
func() {
jm.CurrentlyRunning.Inc(1)
defer jm.CurrentlyRunning.Dec(1)
r.metrics.RunningNonIdleJobs.Inc(1)
defer func() {
jm.CurrentlyRunning.Dec(1)
r.metrics.RunningNonIdleJobs.Dec(1)
}()
err = resumer.Resume(resumeCtx, execCtx)
}()

@@ -1219,7 +1223,11 @@ func (r *Registry) stepThroughStateMachine(
var err error
func() {
jm.CurrentlyRunning.Inc(1)
defer jm.CurrentlyRunning.Dec(1)
r.metrics.RunningNonIdleJobs.Inc(1)
defer func() {
jm.CurrentlyRunning.Dec(1)
r.metrics.RunningNonIdleJobs.Dec(1)
}()
err = resumer.OnFailOrCancel(onFailOrCancelCtx, execCtx)
}()
if successOnFailOrCancel := err == nil; successOnFailOrCancel {
@@ -1307,8 +1315,10 @@ func (r *Registry) MarkIdle(job *Job, isIdle bool) {
if aj.isIdle != isIdle {
log.Infof(r.serverCtx, "%s job %d: toggling idleness to %+v", jobType, job.ID(), isIdle)
if isIdle {
r.metrics.RunningNonIdleJobs.Dec(1)
jm.CurrentlyIdle.Inc(1)
} else {
r.metrics.RunningNonIdleJobs.Inc(1)
jm.CurrentlyIdle.Dec(1)
}
aj.isIdle = isIdle
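
The registry.go hunks maintain one invariant: the aggregate gauge equals the number of jobs currently executing that have not flagged themselves idle. The self-contained toy below sketches that bookkeeping; the registry, adoptedJob, and MarkIdle names imitate the patch, but this is not the real jobs package, and it assumes (as in the test below) that a job is non-idle again by the time it completes.

```go
// Toy sketch of the registry.go bookkeeping: increment the aggregate around
// job execution, and move it in MarkIdle only when the idle flag actually
// changes, so repeated calls stay idempotent.
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

type adoptedJob struct{ isIdle bool }

type registry struct {
	mu                 sync.Mutex
	adopted            map[int64]*adoptedJob
	runningNonIdleJobs int64 // stand-in for the new aggregate gauge
}

// runJob mirrors stepThroughStateMachine: count the job as non-idle for the
// duration of its resume function, whatever the outcome.
func (r *registry) runJob(id int64, resume func() error) error {
	r.mu.Lock()
	r.adopted[id] = &adoptedJob{}
	r.mu.Unlock()

	atomic.AddInt64(&r.runningNonIdleJobs, 1)
	defer func() {
		// As in the patch, the decrement is unconditional; the sketch assumes
		// the job is non-idle again by the time it returns.
		atomic.AddInt64(&r.runningNonIdleJobs, -1)
		r.mu.Lock()
		delete(r.adopted, id)
		r.mu.Unlock()
	}()
	return resume()
}

// MarkIdle toggles a job's idle flag; the gauge only moves when the flag
// actually changes, so repeated identical calls cannot skew the count.
func (r *registry) MarkIdle(id int64, isIdle bool) {
	r.mu.Lock()
	defer r.mu.Unlock()
	aj, ok := r.adopted[id]
	if !ok || aj.isIdle == isIdle {
		return // unknown job or no state change: the gauge must not move
	}
	if isIdle {
		atomic.AddInt64(&r.runningNonIdleJobs, -1)
	} else {
		atomic.AddInt64(&r.runningNonIdleJobs, 1)
	}
	aj.isIdle = isIdle
}

func main() {
	r := &registry{adopted: map[int64]*adoptedJob{}}
	_ = r.runJob(1, func() error {
		r.MarkIdle(1, true)  // waiting for work: gauge drops to 0
		r.MarkIdle(1, true)  // repeated call: no double decrement
		r.MarkIdle(1, false) // back to work: gauge returns to 1
		return nil
	})
	fmt.Println("running non-idle jobs:", atomic.LoadInt64(&r.runningNonIdleJobs)) // 0
}
```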
6 changes: 5 additions & 1 deletion pkg/jobs/registry_test.go
@@ -1038,22 +1038,26 @@ func TestJobIdleness(t *testing.T) {
job2 := createJob()

require.False(t, r.TestingIsJobIdle(job1.ID()))

require.EqualValues(t, 2, r.metrics.RunningNonIdleJobs.Value())
r.MarkIdle(job1, true)
r.MarkIdle(job2, true)
require.True(t, r.TestingIsJobIdle(job1.ID()))
require.Equal(t, int64(2), currentlyIdle.Value())
require.EqualValues(t, 0, r.metrics.RunningNonIdleJobs.Value())

// Repeated calls should not increase metric
r.MarkIdle(job1, true)
r.MarkIdle(job1, true)
require.Equal(t, int64(2), currentlyIdle.Value())
require.EqualValues(t, 0, r.metrics.RunningNonIdleJobs.Value())

r.MarkIdle(job1, false)
require.Equal(t, int64(1), currentlyIdle.Value())
require.False(t, r.TestingIsJobIdle(job1.ID()))
require.EqualValues(t, 1, r.metrics.RunningNonIdleJobs.Value())
r.MarkIdle(job2, false)
require.Equal(t, int64(0), currentlyIdle.Value())
require.EqualValues(t, 2, r.metrics.RunningNonIdleJobs.Value())

// Let the jobs complete
resumeErrChan <- nil
6 changes: 6 additions & 0 deletions pkg/ts/catalog/chart_catalog.go
@@ -2859,6 +2859,12 @@ var charts = []sectionDescription{
{
Organization: [][]string{{Jobs, "Execution"}},
Charts: []chartDescription{
{
Title: "Active",
Metrics: []string{
"jobs.running_non_idle",
},
},
{
Title: "Currently Running",
Metrics: []string{
