Update prometheus helmfiles and rules to better use metric labels (#304)
* Update prometheus-operator, prometheus-adapter, and prometheus-redis-exporter helm charts and remove stale default values

* Relabel redis-exporter with `deployment=$QUEUE-consumer` and change the key to `$QUEUE` (sketched below)

* Rename zip-consumer to segmentation-zip-consumer to match labels.

* Use 0.75 instead of 0.9 for the backoff coefficient.
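
For context, the relabeling added here works roughly as follows; the `redis_key_size` series and its value are illustrative, not taken from a live cluster:

# Illustrative input series from the redis-exporter (hypothetical value):
#   redis_key_size{key="segmentation-zip"}  42
metric_relabel_configs:
  - source_labels: ['key']        # read the queue name from the "key" label
    regex: '(^.*$)'
    replacement: '${1}-consumer'  # append "-consumer" to it
    target_label: deployment      # write the result to a new "deployment" label
# Result: redis_key_size{key="segmentation-zip", deployment="segmentation-zip-consumer"}  42
# Queue metrics now carry the name of the consumer deployment that drains them,
# so autoscaling rules can select them by deployment instead of by raw Redis key.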
willgraf committed May 23, 2020
1 parent 9a0e661 commit d074552
Showing 3 changed files with 88 additions and 176 deletions.
20 changes: 1 addition & 19 deletions conf/addons/hpa.yaml
@@ -22,24 +22,6 @@ spec:
name: tf_serving_gpu_usage
targetValue: 70
---
# apiVersion: autoscaling/v2beta1
# kind: HorizontalPodAutoscaler
# metadata:
# name: data-processing
# namespace: deepcell
# spec:
# scaleTargetRef:
# apiVersion: apps/v1
# kind: Deployment
# name: data-processing
# minReplicas: 1
# maxReplicas: {{ mul $max_gpus 20 }}
# metrics:
# - type: Resource
# resource:
# name: cpu
# targetAverageUtilization: 80
# ---
apiVersion: autoscaling/v2beta1
kind: HorizontalPodAutoscaler
metadata:
@@ -100,7 +82,7 @@ spec:
apiVersion: v1
kind: Namespace
name: segmentation_zip_consumer_key_ratio
targetValue: 2
targetValue: 1
---
apiVersion: autoscaling/v2beta1
kind: HorizontalPodAutoscaler
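
The HPA above scales the segmentation-zip consumer on the namespace-scoped custom metric segmentation_zip_consumer_key_ratio, now targeting 1 instead of 2. A minimal sketch of a Prometheus recording rule that could produce such a ratio from the relabeled series; the metric names and the kube-state-metrics join are assumptions for illustration, not copied from this repository:

groups:
  - name: consumer-autoscaling.rules
    rules:
      # items waiting in the queue per currently running consumer replica
      - record: segmentation_zip_consumer_key_ratio
        expr: |
          redis_key_size{key="segmentation-zip"}
            / on(deployment)
          kube_deployment_status_replicas{deployment="segmentation-zip-consumer"}

Under that assumed definition, a targetValue of 1 asks the HPA to keep roughly one consumer replica per queued item.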
4 changes: 2 additions & 2 deletions conf/helmfile.d/0240.segmentation-zip-consumer.yaml
@@ -13,8 +13,8 @@ releases:
# References:
# - https://github.com/vanvalenlab/kiosk-console/tree/master/conf/charts/redis-consumer
#
- name: segmentation-zip-consumer
namespace: deepcell
- name: "segmentation-zip-consumer"
namespace: "deepcell"
labels:
chart: redis-consumer
component: deepcell
240 changes: 85 additions & 155 deletions conf/helmfile.d/0600.prometheus-operator.yaml
@@ -116,165 +116,95 @@ releases:
deployment: tracking-consumer
namespace: deepcell

## Using default values from https://github.com/helm/charts/blob/master/stable/grafana/values.yaml
##
grafana:

enabled: true

## Deploy default dashboards.
##
defaultDashboardsEnabled: true

adminPassword: '{{ env "GRAFANA_PASSWORD" | default "prom-operator" }}'

dashboards:
default:
prometheus-stats:
# Ref: https://grafana.com/dashboards/2
gnetId: 2
revision: 2
datasource: Prometheus
prometheus-redis:
# Ref: https://grafana.com/dashboards/763
gnetId: 763
revision: 2
datasource: Prometheus

## Deploy a Prometheus instance
##
prometheus:

## Settings affecting prometheusSpec
## ref: https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#prometheusspec
## Using default values from https://github.com/helm/charts/blob/master/stable/grafana/values.yaml
##
prometheusSpec:

## Interval between consecutive scrapes.
##
scrapeInterval: 15s

## Interval between consecutive evaluations.
##
evaluationInterval: 15s
grafana:

## Resource limits & requests
##
# resources:
# requests:
# memory: 1Gi
# limits:
# memory: 1Gi

## Enable compression of the write-ahead log using Snappy.
##
walCompression: true

## Prometheus StorageSpec for persistent data
## ref: https://github.com/coreos/prometheus-operator/blob/master/Documentation/user-guides/storage.md
##
storageSpec: {}
# volumeClaimTemplate:
# spec:
# storageClassName: gluster
# accessModes: ["ReadWriteOnce"]
# resources:
# requests:
# storage: 50Gi
# selector: {}
enabled: true

## AdditionalScrapeConfigs allows specifying additional Prometheus scrape configurations. Scrape configurations
## are appended to the configurations generated by the Prometheus Operator. Job configurations must have the form
## as specified in the official Prometheus documentation:
## https://prometheus.io/docs/prometheus/latest/configuration/configuration/#<scrape_config>. As scrape configs are
## appended, the user is responsible to make sure it is valid. Note that using this feature may expose the possibility
## to break upgrades of Prometheus. It is advised to review Prometheus release notes to ensure that no incompatible
## scrape configs are going to break Prometheus after the upgrade.
## Deploy default dashboards.
##
additionalScrapeConfigs:
- job_name: redis_exporter
static_configs:
- targets: ['prometheus-redis-exporter:9121']
# create a new label "deployment" matching the queue's consumer
metric_relabel_configs:
- source_labels: ['key']
regex: '(^.*$)'
replacement: '${1}-consumer'
target_label: deployment

- job_name: tensorflow
metrics_path: /monitoring/prometheus/metrics
static_configs:
- targets: ['tf-serving.deepcell:8501']

## Component scraping the kubelet and kubelet-hosted cAdvisor
##
kubelet:
serviceMonitor:
# Metric relabellings to apply to samples before ingestion
##
cAdvisorMetricRelabelings:
- sourceLabels: [__name__, image]
separator: ;
regex: container_([a-z_]+);
replacement: $1
action: drop
- sourceLabels: [__name__]
separator: ;
regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s|memory_failures_total|fs_reads_total|fs_writes_total)
replacement: $1
action: drop

metricRelabelings:
- sourceLabels: [__name__, image]
separator: ;
regex: container_([a-z_]+);
replacement: $1
action: drop
- sourceLabels: [__name__]
separator: ;
regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s|memory_failures_total|fs_reads_total|fs_writes_total)
replacement: $1
action: drop
- sourceLabels: [__name__]
separator: ;
regex: kubelet_(runtime_operations_duration_seconds_bucket|docker_operations_duration_seconds_bucket)
replacement: $1
action: drop
- sourceLabels: [__name__]
separator: ;
regex: storage_operation_duration_seconds_bucket
replacement: $1
action: drop
- sourceLabels: [__name__]
regex: rest_client_request_(latency_seconds_bucket|duration_seconds_bucket)
replacement: $1
action: drop
defaultDashboardsEnabled: true

adminPassword: prom-operator

dashboards:
default:
prometheus-stats:
# Ref: https://grafana.com/dashboards/2
gnetId: 2
revision: 2
datasource: Prometheus
prometheus-redis:
# Ref: https://grafana.com/dashboards/763
gnetId: 763
revision: 2
datasource: Prometheus

## Deploy a Prometheus instance
##
prometheus:

## Component scraping the kube api server
##
kubeApiServer:
serviceMonitor:
## metric relabel configs to apply to samples before ingestion.
## Settings affecting prometheusSpec
## ref: https://github.com/coreos/prometheus-operator/blob/master/Documentation/api.md#prometheusspec
##
metricRelabelings:
- sourceLabels: [__name__]
regex: apiserver_admission_controller_admission_latencies_seconds_(.*)
replacement: $1
action: drop
- sourceLabels: [__name__]
regex: apiserver_admission_step_admission_latencies_seconds_(.*)
replacement: $1
action: drop
- sourceLabels: [__name__]
regex: apiserver_request_duration_seconds_(.*)
replacement: $1
action: drop
- sourceLabels: [__name__]
regex: apiserver_request_latencies_(.*)
replacement: $1
action: drop
- sourceLabels: [__name__]
regex: apiserver_response_size_buckets
replacement: $1
action: drop
prometheusSpec:

## Interval between consecutive scrapes.
##
scrapeInterval: 15s

## Interval between consecutive evaluations.
##
evaluationInterval: 15s

## Resource limits & requests
##
# resources:
# requests:
# memory: 1Gi
# limits:
# memory: 1Gi

## Enable compression of the write-ahead log using Snappy.
##
walCompression: true

## Prometheus StorageSpec for persistent data
## ref: https://github.com/coreos/prometheus-operator/blob/master/Documentation/user-guides/storage.md
##
storageSpec: {}
# volumeClaimTemplate:
# spec:
# storageClassName: gluster
# accessModes: ["ReadWriteOnce"]
# resources:
# requests:
# storage: 50Gi
# selector: {}

## AdditionalScrapeConfigs allows specifying additional Prometheus scrape configurations. Scrape configurations
## are appended to the configurations generated by the Prometheus Operator. Job configurations must have the form
## as specified in the official Prometheus documentation:
## https://prometheus.io/docs/prometheus/latest/configuration/configuration/#<scrape_config>. As scrape configs are
## appended, the user is responsible to make sure it is valid. Note that using this feature may expose the possibility
## to break upgrades of Prometheus. It is advised to review Prometheus release notes to ensure that no incompatible
## scrape configs are going to break Prometheus after the upgrade.
##
additionalScrapeConfigs:
- job_name: redis_exporter
static_configs:
- targets: ['prometheus-redis-exporter:9121']
# create a new label "deployment" matching the queue's consumer
metric_relabel_configs:
- source_labels: ['key']
regex: '(^.*$)'
replacement: '${1}-consumer'
target_label: deployment

- job_name: tensorflow
scrape_interval: 5s
metrics_path: /monitoring/prometheus/metrics
static_configs:
- targets: ['tf-serving.deepcell:8501']
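
The prometheus-adapter bump mentioned in the commit message is what surfaces series like the ratio above to the HPA through the custom metrics API. A hypothetical adapter rule, assuming the ratio series carries a namespace label and using the rules.custom layout of the stable/prometheus-adapter chart values:

rules:
  custom:
    - seriesQuery: 'segmentation_zip_consumer_key_ratio'
      resources:
        # expose the metric on the Namespace object the HPA references
        overrides:
          namespace: {resource: "namespace"}
      name:
        as: "segmentation_zip_consumer_key_ratio"
      metricsQuery: 'sum(<<.Series>>{<<.LabelMatchers>>}) by (<<.GroupBy>>)'

Once the adapter serves it, kubectl get --raw "/apis/custom.metrics.k8s.io/v1beta1/namespaces/deepcell/metrics/segmentation_zip_consumer_key_ratio" should return the value the HPA acts on.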
