From 72c9ab2c24fe62b6153af7ae69291c379627dbbe Mon Sep 17 00:00:00 2001
From: Abdullah Gharaibeh <ahg@google.com>
Date: Tue, 12 Jan 2021 14:09:20 -0500
Subject: [PATCH] addressed comments

---
 keps/sig-apps/592-ttl-after-finish/README.md | 15 ++++++++++-----
 keps/sig-apps/592-ttl-after-finish/kep.yaml  | 11 ++++++-----
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/keps/sig-apps/592-ttl-after-finish/README.md b/keps/sig-apps/592-ttl-after-finish/README.md
index 0d3e40adddac..b292af3154bf 100644
--- a/keps/sig-apps/592-ttl-after-finish/README.md
+++ b/keps/sig-apps/592-ttl-after-finish/README.md
@@ -288,17 +288,18 @@ fields of API types, flags, etc.?**
 _This section must be completed when targeting beta graduation to a release._
 
 * **How can an operator determine if the feature is in use by workloads?**
-  - The `workqueue_adds_total` tracks the number of finished Jobs with ttlSecondsAfterFinished set.
+  - The `workqueue_adds_total{name="ttl_jobs_to_delete"}` metric tracks the number of
+    finished Jobs with ttlSecondsAfterFinished set.
   - Listing jobs in the cluster and checking if any has ttlSecondsAfterFinished field set.
 
 * **What are the SLIs (Service Level Indicators) an operator can use to determine 
 the health of the service?**
  - [x] Metrics
     - Metric name: `ttl_after_finished_controller_rate_limiter_use`
-    - Metric name: `workqueue_adds_total`
-    - Metric name: `workqueue_depth`
-    - Metric name: `workqueue_queue_duration_seconds`
-    - Metric name: `workqueue_retries_total`
+    - Metric name: `workqueue_adds_total{name="ttl_jobs_to_delete"}`
+    - Metric name: `workqueue_depth{name="ttl_jobs_to_delete"}`
+    - Metric name: `workqueue_queue_duration_seconds{name="ttl_jobs_to_delete"}`
+    - Metric name: `workqueue_retries_total{name="ttl_jobs_to_delete"}`
     - Components exposing the metric: `kube-controller-manager`
     - Metric name: `etcd_object_counts{resource="jobs.batch"}`
     - Components exposing the metric: `kube-apiserver`.
@@ -307,6 +308,10 @@ the health of the service?**
 
 99% of the jobs that needs cleanup are deleted within X minutes.
 
+This can be implemented using the `workqueue_queue_duration_seconds{name="ttl_jobs_to_delete"}` 
+histogram. Note that only jobs that are finished and have the ttlSecondsAfterFinished field set
+are added to the queue.
+
 * **Are there any missing metrics that would be useful to have to improve observability 
 of this feature?**
 
diff --git a/keps/sig-apps/592-ttl-after-finish/kep.yaml b/keps/sig-apps/592-ttl-after-finish/kep.yaml
index d2c1c66f847b..72787c504ff6 100644
--- a/keps/sig-apps/592-ttl-after-finish/kep.yaml
+++ b/keps/sig-apps/592-ttl-after-finish/kep.yaml
@@ -2,6 +2,7 @@ title: TTL After Finished
 kep-number: 592
 authors:
   - "@janetkuo"
+  - "@ahg-g"
 owning-sig: sig-apps
 participating-sigs:
   - sig-api-machinery
@@ -44,8 +45,8 @@ feature-gates:
       - kube-controller-manager
 disable-supported: true
 metrics:
-    - "ttl_after_finished_controller_rate_limiter_use"
-    - "workqueue_adds_total"
-    - "workqueue_depth"
-    - "workqueue_queue_duration_seconds"
-    - "workqueue_retries_total"
+  - "ttl_after_finished_controller_rate_limiter_use"
+  - 'workqueue_adds_total{name="ttl_jobs_to_delete"}'
+  - 'workqueue_depth{name="ttl_jobs_to_delete"}'
+  - 'workqueue_queue_duration_seconds{name="ttl_jobs_to_delete"}'
+  - 'workqueue_retries_total{name="ttl_jobs_to_delete"}'