From d0745526107a5f3477c613ed08351d1193cf9402 Mon Sep 17 00:00:00 2001
From: willgraf <7930703+willgraf@users.noreply.github.com>
Date: Sat, 28 Mar 2020 15:23:52 -0700
Subject: [PATCH] Update prometheus helmfiles and rules to better use metric labels (#304)

* Update prometheus-operator, prometheus-adapter, and prometheus-redis-exporter helm charts and remove stale default values
* Relabel redis-exporter metrics with `deployment=$QUEUE-consumer` and change the key to `$QUEUE`
* Rename zip-consumer to segmentation-zip-consumer to match the new labels
* Use 0.75 instead of 0.9 for the backoff coefficient
---
 conf/addons/hpa.yaml                          |  20 +-
 .../0240.segmentation-zip-consumer.yaml       |   4 +-
 conf/helmfile.d/0600.prometheus-operator.yaml | 240 +++++++-----------
 3 files changed, 88 insertions(+), 176 deletions(-)

diff --git a/conf/addons/hpa.yaml b/conf/addons/hpa.yaml
index ea678e4f..87ba243e 100644
--- a/conf/addons/hpa.yaml
+++ b/conf/addons/hpa.yaml
@@ -22,24 +22,6 @@ spec:
       name: tf_serving_gpu_usage
       targetValue: 70
 ---
-# apiVersion: autoscaling/v2beta1
-# kind: HorizontalPodAutoscaler
-# metadata:
-#   name: data-processing
-#   namespace: deepcell
-# spec:
-#   scaleTargetRef:
-#     apiVersion: apps/v1
-#     kind: Deployment
-#     name: data-processing
-#   minReplicas: 1
-#   maxReplicas: {{ mul $max_gpus 20 }}
-#   metrics:
-#   - type: Resource
-#     resource:
-#       name: cpu
-#       targetAverageUtilization: 80
-# ---
 apiVersion: autoscaling/v2beta1
 kind: HorizontalPodAutoscaler
 metadata:
@@ -100,7 +82,7 @@ spec:
       apiVersion: v1
       kind: Namespace
       name: segmentation_zip_consumer_key_ratio
-      targetValue: 2
+      targetValue: 1
 ---
 apiVersion: autoscaling/v2beta1
 kind: HorizontalPodAutoscaler
diff --git a/conf/helmfile.d/0240.segmentation-zip-consumer.yaml b/conf/helmfile.d/0240.segmentation-zip-consumer.yaml
index 3d8022fb..80ee7d0d 100644
--- a/conf/helmfile.d/0240.segmentation-zip-consumer.yaml
+++ b/conf/helmfile.d/0240.segmentation-zip-consumer.yaml
@@ -13,8 +13,8 @@ releases:
 # References:
 # - https://github.com/vanvalenlab/kiosk-console/tree/master/conf/charts/redis-consumer
 #
-- name: segmentation-zip-consumer
-  namespace: deepcell
+- name: "segmentation-zip-consumer"
+  namespace: "deepcell"
   labels:
     chart: redis-consumer
     component: deepcell
diff --git a/conf/helmfile.d/0600.prometheus-operator.yaml b/conf/helmfile.d/0600.prometheus-operator.yaml
index 79c4fecb..4dd8aa22 100644
--- a/conf/helmfile.d/0600.prometheus-operator.yaml
+++ b/conf/helmfile.d/0600.prometheus-operator.yaml
@@ -116,165 +116,95 @@ releases:
           deployment: tracking-consumer
           namespace: deepcell
 
-    ## Using default values from https://github.com/helm/charts/blob/master/stable/grafana/values.yaml
-    ##
-    grafana:
-      enabled: true
-
-      ## Deploy default dashboards.
-      ##
-      defaultDashboardsEnabled: true
-
-      adminPassword: '{{ env "GRAFANA_PASSWORD" | default "prom-operator" }}'
-
-      dashboards:
-        default:
-          prometheus-stats:
-            # Ref: https://grafana.com/dashboards/2
-            gnetId: 2
-            revision: 2
-            datasource: Prometheus
-          prometheus-redis:
-            # Ref: https://grafana.com/dashboards/763
-            gnetId: 763
-            revision: 2
-            datasource: Prometheus
-
-    ## Deploy a Prometheus instance
-    ##
-    prometheus:
-
-      ## Settings affecting prometheusSpec
-      ## ref: https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#prometheusspec
+        ## Using default values from https://github.com/helm/charts/blob/master/stable/grafana/values.yaml
         ##
-      prometheusSpec:
-
-        ## Interval between consecutive scrapes.
-        ##
-        scrapeInterval: 15s
-
-        ## Interval between consecutive evaluations.
-        ##
-        evaluationInterval: 15s
-
+        grafana:
 
-        ## Resource limits & requests
-        ##
-        # resources:
-        #   requests:
-        #     memory: 1Gi
-        #   limits:
-        #     memory: 1Gi
-
-        ## Enable compression of the write-ahead log using Snappy.
-        ##
-        walCompression: true
-
-        ## Prometheus StorageSpec for persistent data
-        ## ref: https://github.com/coreos/prometheus-operator/blob/master/Documentation/user-guides/storage.md
-        ##
-        storageSpec: {}
-        # volumeClaimTemplate:
-        #   spec:
-        #     storageClassName: gluster
-        #     accessModes: ["ReadWriteOnce"]
-        #     resources:
-        #       requests:
-        #         storage: 50Gi
-        #     selector: {}
+          enabled: true
 
-        ## AdditionalScrapeConfigs allows specifying additional Prometheus scrape configurations. Scrape configurations
-        ## are appended to the configurations generated by the Prometheus Operator. Job configurations must have the form
-        ## as specified in the official Prometheus documentation:
-        ## https://prometheus.io/docs/prometheus/latest/configuration/configuration/#. As scrape configs are
-        ## appended, the user is responsible to make sure it is valid. Note that using this feature may expose the possibility
-        ## to break upgrades of Prometheus. It is advised to review Prometheus release notes to ensure that no incompatible
-        ## scrape configs are going to break Prometheus after the upgrade.
+          ## Deploy default dashboards.
           ##
-        additionalScrapeConfigs:
-        - job_name: redis_exporter
-          static_configs:
-          - targets: ['prometheus-redis-exporter:9121']
-          # create new label "deployment" matching with the queue's conumser
-          metric_relabel_configs:
-          - source_labels: ['key']
-            regex: '(^.*$)'
-            replacement: '${1}-consumer'
-            target_label: deployment
-
-        - job_name: tensorflow
-          metrics_path: /monitoring/prometheus/metrics
-          static_configs:
-          - targets: ['tf-serving.deepcell:8501']
-
-    ## Component scraping the kubelet and kubelet-hosted cAdvisor
-    ##
-    kubelet:
-      serviceMonitor:
-        # Metric relabellings to apply to samples before ingestion
         ##
-        cAdvisorMetricRelabelings:
-        - sourceLabels: [__name__, image]
-          separator: ;
-          regex: container_([a-z_]+);
-          replacement: $1
-          action: drop
-        - sourceLabels: [__name__]
-          separator: ;
-          regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s|memory_failures_total|fs_reads_total|fs_writes_total)
-          replacement: $1
-          action: drop
-
-        metricRelabelings:
-        - sourceLabels: [__name__, image]
-          separator: ;
-          regex: container_([a-z_]+);
-          replacement: $1
-          action: drop
-        - sourceLabels: [__name__]
-          separator: ;
-          regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s|memory_failures_total|fs_reads_total|fs_writes_total)
-          replacement: $1
-          action: drop
-        - sourceLabels: [__name__]
-          separator: ;
-          regex: kubelet_(runtime_operations_duration_seconds_bucket|docker_operations_duration_seconds_bucket)
-          replacement: $1
-          action: drop
-        - sourceLabels: [__name__]
-          separator: ;
-          regex: storage_operation_duration_seconds_bucket
-          replacement: $1
-          action: drop
-        - sourceLabels: [__name__]
-          regex: rest_client_request_(latency_seconds_bucket|duration_seconds_bucket)
-          replacement: $1
-          action: drop
+          defaultDashboardsEnabled: true
+
+          adminPassword: prom-operator
+
+          dashboards:
+            default:
+              prometheus-stats:
+                # Ref: https://grafana.com/dashboards/2
+                gnetId: 2
+                revision: 2
+                datasource: Prometheus
+              prometheus-redis:
+                # Ref: https://grafana.com/dashboards/763
+                gnetId: 763
+                revision: 2
+                datasource: Prometheus
+
+        ## Deploy a Prometheus instance
+        ##
+        prometheus:
 
-    ## Component scraping the kube api server
-    ##
-    kubeApiServer:
-      serviceMonitor:
-        ## metric relabel configs to apply to samples before ingestion.
+          ## Settings affecting prometheusSpec
+          ## ref: https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#prometheusspec
           ##
-        metricRelabelings:
-        - sourceLabels: [__name__]
-          regex: apiserver_admission_controller_admission_latencies_seconds_(.*)
-          replacement: $1
-          action: drop
-        - sourceLabels: [__name__]
-          regex: apiserver_admission_step_admission_latencies_seconds_(.*)
-          replacement: $1
-          action: drop
-        - sourceLabels: [__name__]
-          regex: apiserver_request_duration_seconds_(.*)
-          replacement: $1
-          action: drop
-        - sourceLabels: [__name__]
-          regex: apiserver_request_latencies_(.*)
-          replacement: $1
-          action: drop
-        - sourceLabels: [__name__]
-          regex: apiserver_response_size_buckets
-          replacement: $1
-          action: drop
+          prometheusSpec:
+
+            ## Interval between consecutive scrapes.
+            ##
+            scrapeInterval: 15s
+
+            ## Interval between consecutive evaluations.
+            ##
+            evaluationInterval: 15s
+
+            ## Resource limits & requests
+            ##
+            # resources:
+            #   requests:
+            #     memory: 1Gi
+            #   limits:
+            #     memory: 1Gi
+
+            ## Enable compression of the write-ahead log using Snappy.
+            ##
+            walCompression: true
+
+            ## Prometheus StorageSpec for persistent data
+            ## ref: https://github.com/coreos/prometheus-operator/blob/master/Documentation/user-guides/storage.md
+            ##
+            storageSpec: {}
+            # volumeClaimTemplate:
+            #   spec:
+            #     storageClassName: gluster
+            #     accessModes: ["ReadWriteOnce"]
+            #     resources:
+            #       requests:
+            #         storage: 50Gi
+            #     selector: {}
+
+            ## AdditionalScrapeConfigs allows specifying additional Prometheus scrape configurations. Scrape configurations
+            ## are appended to the configurations generated by the Prometheus Operator. Job configurations must have the form
+            ## as specified in the official Prometheus documentation:
+            ## https://prometheus.io/docs/prometheus/latest/configuration/configuration/#. As scrape configs are
+            ## appended, the user is responsible to make sure it is valid. Note that using this feature may expose the possibility
+            ## to break upgrades of Prometheus. It is advised to review Prometheus release notes to ensure that no incompatible
+            ## scrape configs are going to break Prometheus after the upgrade.
+            ##
+            additionalScrapeConfigs:
+            - job_name: redis_exporter
+              static_configs:
+              - targets: ['prometheus-redis-exporter:9121']
+              # create new label "deployment" matching the queue's consumer
+              metric_relabel_configs:
+              - source_labels: ['key']
+                regex: '(^.*$)'
+                replacement: '${1}-consumer'
+                target_label: deployment
+
+            - job_name: tensorflow
+              scrape_interval: 5s
+              metrics_path: /monitoring/prometheus/metrics
+              static_configs:
+              - targets: ['tf-serving.deepcell:8501']
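
Note on the redis_exporter relabeling above: the added metric_relabel_configs turns each queue key into a matching `deployment` label (for example, key="segmentation-zip" becomes deployment="segmentation-zip-consumer"), which is why the consumer release is renamed to line up with it. The sketch below shows the kind of join this enables; it assumes the exporter is configured to check the queue keys (so that it exports `redis_key_size`) and that `kube_deployment_status_replicas` is available from the kube-state-metrics bundled with prometheus-operator. Neither metric name comes from this patch.

    # Queue backlog per consumer replica, joined on the "deployment" label
    # created by the metric_relabel_configs above (metric names are assumptions).
    sum by (deployment) (redis_key_size)
      / on (deployment)
        kube_deployment_status_replicas{namespace="deepcell"}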
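
Note on the HPA change in conf/addons/hpa.yaml: the target for segmentation_zip_consumer_key_ratio drops from 2 to 1, so the autoscaler now aims for roughly one queued item per consumer replica rather than two. The prometheus-adapter rule that defines that metric is not part of this diff; assuming it is a per-replica backlog ratio, as the name suggests, it would reduce to something like the query below (the key name and both metric names are assumptions, shown only to make the new target concrete).

    # Assumed shape of segmentation_zip_consumer_key_ratio: items waiting in
    # the "segmentation-zip" queue per running consumer pod.
    sum(redis_key_size{key="segmentation-zip"})
      / scalar(clamp_min(kube_deployment_status_replicas{namespace="deepcell", deployment="segmentation-zip-consumer"}, 1))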