From d0745526107a5f3477c613ed08351d1193cf9402 Mon Sep 17 00:00:00 2001
From: willgraf <7930703+willgraf@users.noreply.github.com>
Date: Sat, 28 Mar 2020 15:23:52 -0700
Subject: [PATCH] Update prometheus helmfiles and rules to better use metric labels (#304)

* Update prometheus-operator, prometheus-adapter, and prometheus-redis-exporter helm charts and remove stale default values
* Relabel redis-exporter metrics with `deployment=$QUEUE-consumer` and change the key to `$QUEUE`
* Rename zip-consumer to segmentation-zip-consumer to match the new labels
* Use 0.75 instead of 0.9 for the backoff coefficient
---
 conf/addons/hpa.yaml                          |  20 +-
 .../0240.segmentation-zip-consumer.yaml       |   4 +-
 conf/helmfile.d/0600.prometheus-operator.yaml | 240 +++++++-----------
 3 files changed, 88 insertions(+), 176 deletions(-)

diff --git a/conf/addons/hpa.yaml b/conf/addons/hpa.yaml
index ea678e4f..87ba243e 100644
--- a/conf/addons/hpa.yaml
+++ b/conf/addons/hpa.yaml
@@ -22,24 +22,6 @@ spec:
       name: tf_serving_gpu_usage
       targetValue: 70
 ---
-# apiVersion: autoscaling/v2beta1
-# kind: HorizontalPodAutoscaler
-# metadata:
-#   name: data-processing
-#   namespace: deepcell
-# spec:
-#   scaleTargetRef:
-#     apiVersion: apps/v1
-#     kind: Deployment
-#     name: data-processing
-#   minReplicas: 1
-#   maxReplicas: {{ mul $max_gpus 20 }}
-#   metrics:
-#   - type: Resource
-#     resource:
-#       name: cpu
-#       targetAverageUtilization: 80
-# ---
 apiVersion: autoscaling/v2beta1
 kind: HorizontalPodAutoscaler
 metadata:
@@ -100,7 +82,7 @@ spec:
       apiVersion: v1
       kind: Namespace
       name: segmentation_zip_consumer_key_ratio
-      targetValue: 2
+      targetValue: 1
 ---
 apiVersion: autoscaling/v2beta1
 kind: HorizontalPodAutoscaler
diff --git a/conf/helmfile.d/0240.segmentation-zip-consumer.yaml b/conf/helmfile.d/0240.segmentation-zip-consumer.yaml
index 3d8022fb..80ee7d0d 100644
--- a/conf/helmfile.d/0240.segmentation-zip-consumer.yaml
+++ b/conf/helmfile.d/0240.segmentation-zip-consumer.yaml
@@ -13,8 +13,8 @@ releases:
 # References:
 # - https://github.com/vanvalenlab/kiosk-console/tree/master/conf/charts/redis-consumer
 #
-- name: segmentation-zip-consumer
-  namespace: deepcell
+- name: "segmentation-zip-consumer"
+  namespace: "deepcell"
   labels:
     chart: redis-consumer
     component: deepcell
diff --git a/conf/helmfile.d/0600.prometheus-operator.yaml b/conf/helmfile.d/0600.prometheus-operator.yaml
index 79c4fecb..4dd8aa22 100644
--- a/conf/helmfile.d/0600.prometheus-operator.yaml
+++ b/conf/helmfile.d/0600.prometheus-operator.yaml
@@ -116,165 +116,95 @@ releases:
           deployment: tracking-consumer
           namespace: deepcell
 
-    ## Using default values from https://github.com/helm/charts/blob/master/stable/grafana/values.yaml
-    ##
-    grafana:
-      enabled: true
-
-      ## Deploy default dashboards.
-      ##
-      defaultDashboardsEnabled: true
-
-      adminPassword: '{{ env "GRAFANA_PASSWORD" | default "prom-operator" }}'
-
-      dashboards:
-        default:
-          prometheus-stats:
-            # Ref: https://grafana.com/dashboards/2
-            gnetId: 2
-            revision: 2
-            datasource: Prometheus
-          prometheus-redis:
-            # Ref: https://grafana.com/dashboards/763
-            gnetId: 763
-            revision: 2
-            datasource: Prometheus
-
-    ## Deploy a Prometheus instance
-    ##
-    prometheus:
-
-      ## Settings affecting prometheusSpec
-      ## ref: https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#prometheusspec
+        ## Using default values from https://github.com/helm/charts/blob/master/stable/grafana/values.yaml
         ##
-      prometheusSpec:
-
-        ## Interval between consecutive scrapes.
-        ##
-        scrapeInterval: 15s
-
-        ## Interval between consecutive evaluations.
-        ##
-        evaluationInterval: 15s
-
+        grafana:
 
-        ## Resource limits & requests
-        ##
-        # resources:
-        #   requests:
-        #     memory: 1Gi
-        #   limits:
-        #     memory: 1Gi
-
-        ## Enable compression of the write-ahead log using Snappy.
-        ##
-        walCompression: true
-
-        ## Prometheus StorageSpec for persistent data
-        ## ref: https://github.com/coreos/prometheus-operator/blob/master/Documentation/user-guides/storage.md
-        ##
-        storageSpec: {}
-        # volumeClaimTemplate:
-        #   spec:
-        #     storageClassName: gluster
-        #     accessModes: ["ReadWriteOnce"]
-        #     resources:
-        #       requests:
-        #         storage: 50Gi
-        #     selector: {}
+          enabled: true
 
-        ## AdditionalScrapeConfigs allows specifying additional Prometheus scrape configurations. Scrape configurations
-        ## are appended to the configurations generated by the Prometheus Operator. Job configurations must have the form
-        ## as specified in the official Prometheus documentation:
-        ## https://prometheus.io/docs/prometheus/latest/configuration/configuration/#. As scrape configs are
-        ## appended, the user is responsible to make sure it is valid. Note that using this feature may expose the possibility
-        ## to break upgrades of Prometheus. It is advised to review Prometheus release notes to ensure that no incompatible
-        ## scrape configs are going to break Prometheus after the upgrade.
+          ## Deploy default dashboards.
           ##
-        additionalScrapeConfigs:
-        - job_name: redis_exporter
-          static_configs:
-          - targets: ['prometheus-redis-exporter:9121']
-          # create new label "deployment" matching with the queue's conumser
-          metric_relabel_configs:
-          - source_labels: ['key']
-            regex: '(^.*$)'
-            replacement: '${1}-consumer'
-            target_label: deployment
-
-        - job_name: tensorflow
-          metrics_path: /monitoring/prometheus/metrics
-          static_configs:
-          - targets: ['tf-serving.deepcell:8501']
-
-    ## Component scraping the kubelet and kubelet-hosted cAdvisor
-    ##
-    kubelet:
-      serviceMonitor:
-        # Metric relabellings to apply to samples before ingestion
         ##
-        cAdvisorMetricRelabelings:
-        - sourceLabels: [__name__, image]
-          separator: ;
-          regex: container_([a-z_]+);
-          replacement: $1
-          action: drop
-        - sourceLabels: [__name__]
-          separator: ;
-          regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s|memory_failures_total|fs_reads_total|fs_writes_total)
-          replacement: $1
-          action: drop
-
-        metricRelabelings:
-        - sourceLabels: [__name__, image]
-          separator: ;
-          regex: container_([a-z_]+);
-          replacement: $1
-          action: drop
-        - sourceLabels: [__name__]
-          separator: ;
-          regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s|memory_failures_total|fs_reads_total|fs_writes_total)
-          replacement: $1
-          action: drop
-        - sourceLabels: [__name__]
-          separator: ;
-          regex: kubelet_(runtime_operations_duration_seconds_bucket|docker_operations_duration_seconds_bucket)
-          replacement: $1
-          action: drop
-        - sourceLabels: [__name__]
-          separator: ;
-          regex: storage_operation_duration_seconds_bucket
-          replacement: $1
-          action: drop
-        - sourceLabels: [__name__]
-          regex: rest_client_request_(latency_seconds_bucket|duration_seconds_bucket)
-          replacement: $1
-          action: drop
+          defaultDashboardsEnabled: true
+
+          adminPassword: prom-operator
+
+          dashboards:
+            default:
+              prometheus-stats:
+                # Ref: https://grafana.com/dashboards/2
+                gnetId: 2
+                revision: 2
+                datasource: Prometheus
+              prometheus-redis:
+                # Ref: https://grafana.com/dashboards/763
+                gnetId: 763
+                revision: 2
+                datasource: Prometheus
+
+        ## Deploy a Prometheus instance
+        ##
+        prometheus:
 
-    ## Component scraping the kube api server
-    ##
-    kubeApiServer:
-      serviceMonitor:
-        ## metric relabel configs to apply to samples before ingestion.
+          ## Settings affecting prometheusSpec
+          ## ref: https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#prometheusspec
           ##
-        metricRelabelings:
-        - sourceLabels: [__name__]
-          regex: apiserver_admission_controller_admission_latencies_seconds_(.*)
-          replacement: $1
-          action: drop
-        - sourceLabels: [__name__]
-          regex: apiserver_admission_step_admission_latencies_seconds_(.*)
-          replacement: $1
-          action: drop
-        - sourceLabels: [__name__]
-          regex: apiserver_request_duration_seconds_(.*)
-          replacement: $1
-          action: drop
-        - sourceLabels: [__name__]
-          regex: apiserver_request_latencies_(.*)
-          replacement: $1
-          action: drop
-        - sourceLabels: [__name__]
-          regex: apiserver_response_size_buckets
-          replacement: $1
-          action: drop
+          prometheusSpec:
+
+            ## Interval between consecutive scrapes.
+            ##
+            scrapeInterval: 15s
+
+            ## Interval between consecutive evaluations.
+            ##
+            evaluationInterval: 15s
+
+            ## Resource limits & requests
+            ##
+            # resources:
+            #   requests:
+            #     memory: 1Gi
+            #   limits:
+            #     memory: 1Gi
+
+            ## Enable compression of the write-ahead log using Snappy.
+            ##
+            walCompression: true
+
+            ## Prometheus StorageSpec for persistent data
+            ## ref: https://github.com/coreos/prometheus-operator/blob/master/Documentation/user-guides/storage.md
+            ##
+            storageSpec: {}
+            # volumeClaimTemplate:
+            #   spec:
+            #     storageClassName: gluster
+            #     accessModes: ["ReadWriteOnce"]
+            #     resources:
+            #       requests:
+            #         storage: 50Gi
+            #     selector: {}
+
+            ## AdditionalScrapeConfigs allows specifying additional Prometheus scrape configurations. Scrape configurations
+            ## are appended to the configurations generated by the Prometheus Operator. Job configurations must have the form
+            ## as specified in the official Prometheus documentation:
+            ## https://prometheus.io/docs/prometheus/latest/configuration/configuration/#. As scrape configs are
+            ## appended, the user is responsible to make sure it is valid. Note that using this feature may expose the possibility
+            ## to break upgrades of Prometheus. It is advised to review Prometheus release notes to ensure that no incompatible
+            ## scrape configs are going to break Prometheus after the upgrade.
+            ##
+            additionalScrapeConfigs:
+            - job_name: redis_exporter
+              static_configs:
+              - targets: ['prometheus-redis-exporter:9121']
+              # create new label "deployment" matching the queue's consumer
+              metric_relabel_configs:
+              - source_labels: ['key']
+                regex: '(^.*$)'
+                replacement: '${1}-consumer'
+                target_label: deployment
+
+            - job_name: tensorflow
+              scrape_interval: 5s
+              metrics_path: /monitoring/prometheus/metrics
+              static_configs:
+              - targets: ['tf-serving.deepcell:8501']
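
Note on the redis_exporter relabeling above: the added metric_relabel_configs turns each queue key into a matching `deployment` label (for example, key="segmentation-zip" becomes deployment="segmentation-zip-consumer"), which is why the consumer release is renamed to line up with it. The sketch below shows the kind of join this enables; it assumes the exporter is configured to check the queue keys (so that it exports `redis_key_size`) and that `kube_deployment_status_replicas` is available from the kube-state-metrics bundled with prometheus-operator. Neither metric name comes from this patch.

    # Queue backlog per consumer replica, joined on the "deployment" label
    # created by the metric_relabel_configs above (metric names are assumptions).
    sum by (deployment) (redis_key_size)
      / on (deployment)
        kube_deployment_status_replicas{namespace="deepcell"}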
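
Note on the HPA change in conf/addons/hpa.yaml: the target for segmentation_zip_consumer_key_ratio drops from 2 to 1, so the autoscaler now aims for roughly one queued item per consumer replica rather than two. The prometheus-adapter rule that defines that metric is not part of this diff; assuming it is a per-replica backlog ratio, as the name suggests, it would reduce to something like the query below (the key name and both metric names are assumptions, shown only to make the new target concrete).

    # Assumed shape of segmentation_zip_consumer_key_ratio: items waiting in
    # the "segmentation-zip" queue per running consumer pod.
    sum(redis_key_size{key="segmentation-zip"})
      / scalar(clamp_min(kube_deployment_status_replicas{namespace="deepcell", deployment="segmentation-zip-consumer"}, 1))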