Skip to content

Commit

Permalink
feat: add node termination metrics (#2139)
Browse files Browse the repository at this point in the history
* feat: added node termination metrics

* moved termination metrics to termination controller
  • Loading branch information
jmdeal authored Jul 17, 2022
1 parent 875d323 commit 7a818e0
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 6 deletions.
4 changes: 2 additions & 2 deletions cmd/controller/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ import (
"github.com/aws/karpenter/pkg/controllers/counter"
metricspod "github.com/aws/karpenter/pkg/controllers/metrics/pod"
metricsprovisioner "github.com/aws/karpenter/pkg/controllers/metrics/provisioner"
statemetrics "github.com/aws/karpenter/pkg/controllers/metrics/state"
metricsstate "github.com/aws/karpenter/pkg/controllers/metrics/state"
"github.com/aws/karpenter/pkg/controllers/node"
"github.com/aws/karpenter/pkg/controllers/provisioning"
"github.com/aws/karpenter/pkg/controllers/state"
Expand Down Expand Up @@ -114,7 +114,7 @@ func main() {
recorder := events.NewDedupeRecorder(events.NewRecorder(manager.GetEventRecorderFor(appName)))
cluster := state.NewCluster(cfg, manager.GetClient(), cloudProvider)

statemetrics.StartMetricScraper(ctx, cluster)
metricsstate.StartMetricScraper(ctx, cluster)

if err := manager.RegisterControllers(ctx,
provisioning.NewController(ctx, cfg, manager.GetClient(), clientSet.CoreV1(), recorder, cloudProvider, cluster),
Expand Down
13 changes: 9 additions & 4 deletions designs/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,10 @@ Scheduling statistics consist of actions not representable by a single controlle
termination and pod startup. The visualizations will be the same as those for individual controller performance. The
following metrics will be instrumented to implement these visualizations:

| Name | Type | Labels | Description |
|--------------------------------------------|---------|-----------------------------------------------------------------|-------------------------------------------|
| `karpenter_nodes_termination_time_seconds` | Summary | `provisioner`, `zone`, `arch`, `capacity_type`, `instance_type` | [Measurement Definitions](#measurment-definitions) |
| `karpenter_pods_startup_time_seconds` | Summary | `provisioner`, `zone`, `arch`, `capacity_type`, `instance_type` | [Measurement Definitions](#measurment-definitions) |
| Name | Type | Labels | Description |
|--------------------------------------------------------|-----------|-----------------------------------------------------------------|----------------------------------------------------|
| `karpenter_nodes_termination_time_seconds` | Summary | None | [Measurement Definitions](#measurment-definitions) |
| `karpenter_pods_startup_time_seconds` | Summary | `provisioner`, `zone`, `arch`, `capacity_type`, `instance_type` | [Measurement Definitions](#measurment-definitions) |

API statistics will consist of API call latency, call rate, call method, return code, and payload size. These statistics will be
separated into Kubernetes API and cloudprovider API statistics. Call latency and call rate will be represented the same
Expand All @@ -101,6 +101,11 @@ represents the number of occurrences that were less than that buckets value. Sin
histograms are a poor choice for unbounded data. Summaries instead directly track quantiles. This involves more
computation but works better for unbounded data.

Prometheus `Summary` metric types compute running quantiles of a given metric for given label values. Unlike
histograms, summaries cannot be aggregated, since doing so can produce statistically nonsensical results. In theory,
summaries could be tracked across many labels, but that would require a number of summary series equal to the size of
the cartesian product of all the label values, which is not viable.

### Cluster State

The metrics used by the cluster capacity dashboard will not be populated by continuously polling objects in the cluster,
Expand Down
25 changes: 25 additions & 0 deletions pkg/controllers/termination/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,17 +30,38 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller"
"sigs.k8s.io/controller-runtime/pkg/manager"
crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
"sigs.k8s.io/controller-runtime/pkg/reconcile"

"github.com/prometheus/client_golang/prometheus"

provisioning "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5"
"github.com/aws/karpenter/pkg/cloudprovider"
"github.com/aws/karpenter/pkg/events"
"github.com/aws/karpenter/pkg/metrics"
"github.com/aws/karpenter/pkg/utils/functional"
"github.com/aws/karpenter/pkg/utils/injection"
)

// controllerName is the name of the termination controller.
const controllerName = "termination"

var (
	// terminationSummaryVec is the karpenter_nodes_termination_time_seconds
	// summary, observing the time between a node's deletion request and the
	// removal of its finalizer. It carries no labels: per the metrics design
	// doc, summaries cannot be meaningfully aggregated across label values,
	// so a single unlabeled series is tracked.
	terminationSummaryVec = prometheus.NewSummaryVec(
		prometheus.SummaryOpts{
			Namespace:  "karpenter",
			Subsystem:  "nodes",
			Name:       "termination_time_seconds",
			Help:       "The time taken between a node's deletion request and the removal of its finalizer",
			Objectives: metrics.SummaryObjectives(),
		},
		[]string{}, // no labels — see note above on summary aggregation
	)
)

// init registers the termination summary with the controller-runtime metrics
// registry so it is exposed alongside the manager's other metrics.
func init() {
	crmetrics.Registry.MustRegister(terminationSummaryVec)
}

// Controller for the resource
type Controller struct {
Terminator *Terminator
Expand Down Expand Up @@ -99,6 +120,10 @@ func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reco
if err := c.Terminator.terminate(ctx, node); err != nil {
return reconcile.Result{}, fmt.Errorf("terminating node %s, %w", node.Name, err)
}

// 6. Record termination duration (time between deletion timestamp and finalizer removal)
terminationSummaryVec.With(prometheus.Labels{}).Observe(time.Since(node.DeletionTimestamp.Time).Seconds())

return reconcile.Result{}, nil
}

Expand Down
10 changes: 10 additions & 0 deletions pkg/metrics/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,16 @@ func DurationBuckets() []float64 {
1.25, 1.5, 1.75, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 60}
}

// SummaryObjectives returns the default objectives (quantile-error pairs) for
// karpenter summary metrics: the minimum (0.0), median (0.5), 90th, 99th, and
// maximum (1.0) quantiles, each tracked with an absolute error of 0.01.
func SummaryObjectives() map[float64]float64 {
	const epsilon = 0.01
	quantiles := []float64{0.0, 0.5, 0.9, 0.99, 1.0}
	// Pre-size the map since the number of objectives is known.
	objectives := make(map[float64]float64, len(quantiles))
	for _, quantile := range quantiles {
		objectives[quantile] = epsilon
	}
	return objectives
}

// Measure returns a deferrable function that observes the duration between the
// defer statement and the end of the function.
func Measure(observer prometheus.Observer) func() {
Expand Down

0 comments on commit 7a818e0

Please sign in to comment.