Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

disconnected clients: Observability plumbing #12141

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions nomad/leader.go
Original file line number Diff line number Diff line change
Expand Up @@ -980,6 +980,8 @@ func (s *Server) iterateJobSummaryMetrics(summary *structs.JobSummary) {
float32(tgSummary.Starting), labels)
metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "lost"},
float32(tgSummary.Lost), labels)
metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "unknown"},
float32(tgSummary.Unknown), labels)
}
}

Expand Down
8 changes: 8 additions & 0 deletions nomad/state/state_store.go
Original file line number Diff line number Diff line change
Expand Up @@ -4468,6 +4468,8 @@ func (s *StateStore) ReconcileJobSummaries(index uint64) error {
tg.Failed += 1
case structs.AllocClientStatusLost:
tg.Lost += 1
case structs.AllocClientStatusUnknown:
tg.Unknown += 1
case structs.AllocClientStatusComplete:
tg.Complete += 1
case structs.AllocClientStatusRunning:
Expand Down Expand Up @@ -5025,6 +5027,8 @@ func (s *StateStore) updateSummaryWithAlloc(index uint64, alloc *structs.Allocat
tgSummary.Complete += 1
case structs.AllocClientStatusLost:
tgSummary.Lost += 1
case structs.AllocClientStatusUnknown:
tgSummary.Unknown += 1
}

// Decrementing the count of the bin of the last state
Expand All @@ -5041,6 +5045,10 @@ func (s *StateStore) updateSummaryWithAlloc(index uint64, alloc *structs.Allocat
if tgSummary.Lost > 0 {
tgSummary.Lost -= 1
}
case structs.AllocClientStatusUnknown:
if tgSummary.Unknown > 0 {
tgSummary.Unknown -= 1
}
case structs.AllocClientStatusFailed, structs.AllocClientStatusComplete:
default:
s.logger.Error("invalid old client status for allocation",
Expand Down
9 changes: 8 additions & 1 deletion nomad/state/state_store_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4925,7 +4925,13 @@ func TestStateStore_ReconcileJobSummary(t *testing.T) {
alloc11 := alloc10.Copy()
alloc11.ClientStatus = structs.AllocClientStatusLost

state.UpsertAllocs(structs.MsgTypeTestSetup, 130, []*structs.Allocation{alloc4, alloc6, alloc8, alloc10})
alloc12 := mock.Alloc()
alloc12.JobID = alloc.JobID
alloc12.Job = alloc.Job
alloc12.TaskGroup = "db"
alloc12.ClientStatus = structs.AllocClientStatusUnknown

state.UpsertAllocs(structs.MsgTypeTestSetup, 130, []*structs.Allocation{alloc4, alloc6, alloc8, alloc10, alloc12})

state.UpdateAllocsFromClient(structs.MsgTypeTestSetup, 150, []*structs.Allocation{alloc5, alloc7, alloc9, alloc11})

Expand All @@ -4949,6 +4955,7 @@ func TestStateStore_ReconcileJobSummary(t *testing.T) {
Failed: 1,
Complete: 1,
Lost: 1,
Unknown: 1,
},
},
CreateIndex: 100,
Expand Down
1 change: 1 addition & 0 deletions nomad/structs/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -4756,6 +4756,7 @@ type TaskGroupSummary struct {
Running int
Starting int
Lost int
Unknown int
}

const (
Expand Down
6 changes: 4 additions & 2 deletions scheduler/reconcile.go
Original file line number Diff line number Diff line change
Expand Up @@ -145,8 +145,8 @@ type delayedRescheduleInfo struct {
}

func (r *reconcileResults) GoString() string {
base := fmt.Sprintf("Total changes: (place %d) (destructive %d) (inplace %d) (stop %d)",
len(r.place), len(r.destructiveUpdate), len(r.inplaceUpdate), len(r.stop))
base := fmt.Sprintf("Total changes: (place %d) (destructive %d) (inplace %d) (stop %d) (disconnect %d) (reconnect %d)",
len(r.place), len(r.destructiveUpdate), len(r.inplaceUpdate), len(r.stop), len(r.disconnectUpdates), len(r.reconnectUpdates))
Comment on lines +148 to +149
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Possibly nitpicky: because this report is for "Total changes", are `disconnect` and `reconnect` the right names for these changes? To me they read as though the scheduler is asking for the allocs to be disconnected or reconnected. (Might be a good question to throw to the team for bikeshedding 😀 )


if r.deployment != nil {
base += fmt.Sprintf("\nCreated Deployment: %q", r.deployment.ID)
Expand Down Expand Up @@ -1266,6 +1266,8 @@ func (a *allocReconciler) createTimeoutLaterEvals(disconnecting allocSet, tgName
allocIDToFollowupEvalID[timeoutInfo.allocID] = eval.ID
}

emitRescheduleInfo(timeoutInfo.alloc, eval)

// Create updates that will be applied to the allocs to mark the FollowupEvalID
// and the unknown ClientStatus.
updatedAlloc := timeoutInfo.alloc.Copy()
Expand Down
1 change: 1 addition & 0 deletions website/content/docs/operations/metrics-reference.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,7 @@ Job summary metrics are emitted by the Nomad leader server.
| `nomad.nomad.job_summary.complete` | Number of complete allocations for a job | Integer | Gauge | host, job, namespace, task_group |
| `nomad.nomad.job_summary.failed` | Number of failed allocations for a job | Integer | Gauge | host, job, namespace, task_group |
| `nomad.nomad.job_summary.lost` | Number of lost allocations for a job | Integer | Gauge | host, job, namespace, task_group |
| `nomad.nomad.job_summary.unknown` | Number of unknown allocations for a job | Integer | Gauge | host, job, namespace, task_group |
| `nomad.nomad.job_summary.queued` | Number of queued allocations for a job | Integer | Gauge | host, job, namespace, task_group |
| `nomad.nomad.job_summary.running` | Number of running allocations for a job | Integer | Gauge | host, job, namespace, task_group |
| `nomad.nomad.job_summary.starting` | Number of starting allocations for a job | Integer | Gauge | host, job, namespace, task_group |
Expand Down