diff --git a/nomad/leader.go b/nomad/leader.go index ecffabc85c7..dcfeeff8720 100644 --- a/nomad/leader.go +++ b/nomad/leader.go @@ -980,6 +980,8 @@ func (s *Server) iterateJobSummaryMetrics(summary *structs.JobSummary) { float32(tgSummary.Starting), labels) metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "lost"}, float32(tgSummary.Lost), labels) + metrics.SetGaugeWithLabels([]string{"nomad", "job_summary", "unknown"}, + float32(tgSummary.Unknown), labels) } } diff --git a/nomad/state/state_store.go b/nomad/state/state_store.go index afc8ccd8495..a7d5d69262f 100644 --- a/nomad/state/state_store.go +++ b/nomad/state/state_store.go @@ -4468,6 +4468,8 @@ func (s *StateStore) ReconcileJobSummaries(index uint64) error { tg.Failed += 1 case structs.AllocClientStatusLost: tg.Lost += 1 + case structs.AllocClientStatusUnknown: + tg.Unknown += 1 case structs.AllocClientStatusComplete: tg.Complete += 1 case structs.AllocClientStatusRunning: @@ -5025,6 +5027,8 @@ func (s *StateStore) updateSummaryWithAlloc(index uint64, alloc *structs.Allocat tgSummary.Complete += 1 case structs.AllocClientStatusLost: tgSummary.Lost += 1 + case structs.AllocClientStatusUnknown: + tgSummary.Unknown += 1 } // Decrementing the count of the bin of the last state @@ -5041,6 +5045,10 @@ func (s *StateStore) updateSummaryWithAlloc(index uint64, alloc *structs.Allocat if tgSummary.Lost > 0 { tgSummary.Lost -= 1 } + case structs.AllocClientStatusUnknown: + if tgSummary.Unknown > 0 { + tgSummary.Unknown -= 1 + } case structs.AllocClientStatusFailed, structs.AllocClientStatusComplete: default: s.logger.Error("invalid old client status for allocation", diff --git a/nomad/state/state_store_test.go b/nomad/state/state_store_test.go index 561765d9413..54c8aa5f4dc 100644 --- a/nomad/state/state_store_test.go +++ b/nomad/state/state_store_test.go @@ -4925,7 +4925,13 @@ func TestStateStore_ReconcileJobSummary(t *testing.T) { alloc11 := alloc10.Copy() alloc11.ClientStatus = structs.AllocClientStatusLost - state.UpsertAllocs(structs.MsgTypeTestSetup, 130, []*structs.Allocation{alloc4, alloc6, alloc8, alloc10}) + alloc12 := mock.Alloc() + alloc12.JobID = alloc.JobID + alloc12.Job = alloc.Job + alloc12.TaskGroup = "db" + alloc12.ClientStatus = structs.AllocClientStatusUnknown + + state.UpsertAllocs(structs.MsgTypeTestSetup, 130, []*structs.Allocation{alloc4, alloc6, alloc8, alloc10, alloc12}) state.UpdateAllocsFromClient(structs.MsgTypeTestSetup, 150, []*structs.Allocation{alloc5, alloc7, alloc9, alloc11}) @@ -4949,6 +4955,7 @@ func TestStateStore_ReconcileJobSummary(t *testing.T) { Failed: 1, Complete: 1, Lost: 1, + Unknown: 1, }, }, CreateIndex: 100, diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 96c5d19f4b8..6b619909915 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -4756,6 +4756,7 @@ type TaskGroupSummary struct { Running int Starting int Lost int + Unknown int } const ( diff --git a/scheduler/reconcile.go b/scheduler/reconcile.go index 777ef7fbdb1..0f08c504bbf 100644 --- a/scheduler/reconcile.go +++ b/scheduler/reconcile.go @@ -145,8 +145,8 @@ type delayedRescheduleInfo struct { } func (r *reconcileResults) GoString() string { - base := fmt.Sprintf("Total changes: (place %d) (destructive %d) (inplace %d) (stop %d)", - len(r.place), len(r.destructiveUpdate), len(r.inplaceUpdate), len(r.stop)) + base := fmt.Sprintf("Total changes: (place %d) (destructive %d) (inplace %d) (stop %d) (disconnect %d) (reconnect %d)", + len(r.place), len(r.destructiveUpdate), len(r.inplaceUpdate), len(r.stop), len(r.disconnectUpdates), len(r.reconnectUpdates)) if r.deployment != nil { base += fmt.Sprintf("\nCreated Deployment: %q", r.deployment.ID) @@ -1266,6 +1266,8 @@ func (a *allocReconciler) createTimeoutLaterEvals(disconnecting allocSet, tgName allocIDToFollowupEvalID[timeoutInfo.allocID] = eval.ID } + emitRescheduleInfo(timeoutInfo.alloc, eval) + // Create updates that will be applied to the allocs to mark the FollowupEvalID // and the unknown ClientStatus. updatedAlloc := timeoutInfo.alloc.Copy() diff --git a/website/content/docs/operations/metrics-reference.mdx b/website/content/docs/operations/metrics-reference.mdx index 61311a25e28..11ebdc0227b 100644 --- a/website/content/docs/operations/metrics-reference.mdx +++ b/website/content/docs/operations/metrics-reference.mdx @@ -211,6 +211,7 @@ Job summary metrics are emitted by the Nomad leader server. | `nomad.nomad.job_summary.complete` | Number of complete allocations for a job | Integer | Gauge | host, job, namespace, task_group | | `nomad.nomad.job_summary.failed` | Number of failed allocations for a job | Integer | Gauge | host, job, namespace, task_group | | `nomad.nomad.job_summary.lost` | Number of lost allocations for a job | Integer | Gauge | host, job, namespace, task_group | +| `nomad.nomad.job_summary.unknown` | Number of unknown allocations for a job | Integer | Gauge | host, job, namespace, task_group | | `nomad.nomad.job_summary.queued` | Number of queued allocations for a job | Integer | Gauge | host, job, namespace, task_group | | `nomad.nomad.job_summary.running` | Number of running allocations for a job | Integer | Gauge | host, job, namespace, task_group | | `nomad.nomad.job_summary.starting` | Number of starting allocations for a job | Integer | Gauge | host, job, namespace, task_group |