From 72c9ab2c24fe62b6153af7ae69291c379627dbbe Mon Sep 17 00:00:00 2001 From: Abdullah Gharaibeh Date: Tue, 12 Jan 2021 14:09:20 -0500 Subject: [PATCH] addressed comments --- keps/sig-apps/592-ttl-after-finish/README.md | 15 ++++++++++----- keps/sig-apps/592-ttl-after-finish/kep.yaml | 11 ++++++----- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/keps/sig-apps/592-ttl-after-finish/README.md b/keps/sig-apps/592-ttl-after-finish/README.md index 0d3e40adddac..b292af3154bf 100644 --- a/keps/sig-apps/592-ttl-after-finish/README.md +++ b/keps/sig-apps/592-ttl-after-finish/README.md @@ -288,17 +288,18 @@ fields of API types, flags, etc.?** _This section must be completed when targeting beta graduation to a release._ * **How can an operator determine if the feature is in use by workloads?** - - The `workqueue_adds_total` tracks the number of finished Jobs with ttlSecondsAfterFinished set. + - The `workqueue_adds_total{name="ttl_jobs_to_delete"}` tracks the number of + finished Jobs with ttlSecondsAfterFinished set. - Listing jobs in the cluster and checking if any has ttlSecondsAfterFinished field set. * **What are the SLIs (Service Level Indicators) an operator can use to determine the health of the service?** - [x] Metrics - Metric name: `ttl_after_finished_controller_rate_limiter_use` - - Metric name: `workqueue_adds_total` - - Metric name: `workqueue_depth` - - Metric name: `workqueue_queue_duration_seconds` - - Metric name: `workqueue_retries_total` + - Metric name: `workqueue_adds_total{name="ttl_jobs_to_delete"}` + - Metric name: `workqueue_depth{name="ttl_jobs_to_delete"}` + - Metric name: `workqueue_queue_duration_seconds{name="ttl_jobs_to_delete"}` + - Metric name: `workqueue_retries_total{name="ttl_jobs_to_delete"}` - Components exposing the metric: `kube-controller-manager` - Metric name: `etcd_object_counts{resource="jobs.batch"}` - Components exposing the metric: `kube-apiserver`. 
@@ -307,6 +308,10 @@ the health of the service?** 99% of the jobs that needs cleanup are deleted within X minutes. +This can be implemented using the `workqueue_queue_duration_seconds{name="ttl_jobs_to_delete"}` +histogram. Note that only jobs that are finished and have the ttlSecondsAfterFinished field set +are added to the queue. + * **Are there any missing metrics that would be useful to have to improve observability of this feature?** diff --git a/keps/sig-apps/592-ttl-after-finish/kep.yaml b/keps/sig-apps/592-ttl-after-finish/kep.yaml index d2c1c66f847b..72787c504ff6 100644 --- a/keps/sig-apps/592-ttl-after-finish/kep.yaml +++ b/keps/sig-apps/592-ttl-after-finish/kep.yaml @@ -2,6 +2,7 @@ title: TTL After Finished kep-number: 592 authors: - "@janetkuo" + - "@ahg-g" owning-sig: sig-apps participating-sigs: - sig-api-machinery @@ -44,8 +45,8 @@ feature-gates: - kube-controller-manager disable-supported: true metrics: - - "ttl_after_finished_controller_rate_limiter_use" - - "workqueue_adds_total" - - "workqueue_depth" - - "workqueue_queue_duration_seconds" - - "workqueue_retries_total" + - "ttl_after_finished_controller_rate_limiter_use" + - 'workqueue_adds_total{name="ttl_jobs_to_delete"}' + - 'workqueue_depth{name="ttl_jobs_to_delete"}' + - 'workqueue_queue_duration_seconds{name="ttl_jobs_to_delete"}' + - 'workqueue_retries_total{name="ttl_jobs_to_delete"}'