Skip to content

Commit

Permalink
Refactor report metrics to use aggregation_over_time expressions (kub…
Browse files Browse the repository at this point in the history
…e-burner#415)

* Make elapsed second precise

Signed-off-by: Raul Sevilla <[email protected]>

* Remove references to aggregations and update docs

Signed-off-by: Raul Sevilla <[email protected]>

---------

Signed-off-by: Raul Sevilla <[email protected]>
  • Loading branch information
rsevilla87 authored Aug 10, 2023
1 parent 03a2e15 commit f05d61d
Show file tree
Hide file tree
Showing 5 changed files with 176 additions and 171 deletions.
239 changes: 157 additions & 82 deletions cmd/kube-burner/ocp-config/metrics-report.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,175 +2,250 @@
# Kubelet & CRI-O

# Average CPU usage of the kubelet across all worker nodes
- query: irate(process_cpu_seconds_total{service="kubelet",job="kubelet"}[2m]) and on (node) kube_node_role{role="worker"}
- query: avg(avg_over_time(irate(process_cpu_seconds_total{service="kubelet",job="kubelet"}[2m])[{{.elapsed}}:]) and on (node) kube_node_role{role="worker"})
metricName: cpu-kubelet
aggregations: [avg]
instant: true

# Average memory usage of the kubelet across all worker nodes
- query: process_resident_memory_bytes{service="kubelet",job="kubelet"} and on (node) kube_node_role{role="worker"}
- query: avg(avg_over_time(process_resident_memory_bytes{service="kubelet",job="kubelet"}[{{.elapsed}}:]) and on (node) kube_node_role{role="worker"})
metricName: memory-kubelet
aggregations: [avg, max]
instant: true

# Maximum memory usage of the kubelet across all worker nodes
- query: max(max_over_time(process_resident_memory_bytes{service="kubelet",job="kubelet"}[{{.elapsed}}:]) and on (node) kube_node_role{role="worker"})
metricName: max-memory-kubelet
instant: true

# Average CPU usage of CRI-O across all worker nodes
- query: irate(process_cpu_seconds_total{service="kubelet",job="crio"}[2m]) and on (node) kube_node_role{role="worker"}
- query: avg(avg_over_time(irate(process_cpu_seconds_total{service="kubelet",job="crio"}[2m])[{{.elapsed}}:]) and on (node) kube_node_role{role="worker"})
metricName: cpu-crio
aggregations: [avg]
instant: true

# Average memory usage of CRI-O across all worker nodes
- query: process_resident_memory_bytes{service="kubelet",job="crio"} and on (node) kube_node_role{role="worker"}
- query: avg(avg_over_time(process_resident_memory_bytes{service="kubelet",job="crio"}[{{.elapsed}}:]) and on (node) kube_node_role{role="worker"})
metricName: memory-crio
aggregations: [avg, max]
instant: true

# Maximum memory usage of CRI-O across all worker nodes
- query: max(max_over_time(process_resident_memory_bytes{service="kubelet",job="crio"}[{{.elapsed}}:]) and on (node) kube_node_role{role="worker"})
metricName: max-memory-crio
instant: true


# Etcd

- query: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m]))
- query: avg(avg_over_time(histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m]))[{{.elapsed}}:]))
metricName: 99thEtcdDiskBackendCommit
aggregations: [avg]
instant: true

- query: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))
- query: avg(avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[{{.elapsed}}:]))
metricName: 99thEtcdDiskWalFsync
aggregations: [avg]
instant: true

- query: histogram_quantile(0.99, irate(etcd_network_peer_round_trip_time_seconds_bucket[2m]))
- query: avg(avg_over_time(histogram_quantile(0.99, irate(etcd_network_peer_round_trip_time_seconds_bucket[2m]))[{{.elapsed}}:]))
metricName: 99thEtcdRoundTripTime
aggregations: [avg]
instant: true

# Control-plane

- query: topk(1, sum(rate(container_cpu_usage_seconds_total{namespace="openshift-kube-controller-manager"}[2m])) by (pod))
- query: avg(avg_over_time(topk(1, sum(rate(container_cpu_usage_seconds_total{namespace="openshift-kube-controller-manager"}[2m])) by (pod))[{{.elapsed}}:]))
metricName: cpu-kube-controller-manager
aggregations: [avg]
instant: true

- query: topk(1, sum(container_memory_rss{namespace="openshift-kube-controller-manager"}) by (pod))
- query: avg(avg_over_time(topk(1, sum(container_memory_rss{namespace="openshift-kube-controller-manager"}) by (pod))[{{.elapsed}}:]))
metricName: memory-kube-controller-manager
aggregations: [avg, max]
instant: true

- query: topk(3, sum(irate(container_cpu_usage_seconds_total{namespace="openshift-kube-apiserver"}[2m])) by (pod))
- query: max(max_over_time(topk(1, sum(container_memory_rss{namespace="openshift-kube-controller-manager"}) by (pod))[{{.elapsed}}:]))
metricName: max-memory-kube-controller-manager
instant: true

- query: avg(avg_over_time(topk(3, sum(irate(container_cpu_usage_seconds_total{namespace="openshift-kube-apiserver"}[2m])) by (pod))[{{.elapsed}}:]))
metricName: cpu-kube-apiserver
aggregations: [avg]
instant: true

- query: topk(3, sum(container_memory_rss{namespace="openshift-kube-apiserver"}) by (pod))
- query: avg(avg_over_time(topk(3, sum(container_memory_rss{namespace="openshift-kube-apiserver"}) by (pod))[{{.elapsed}}:]))
metricName: memory-kube-apiserver
aggregations: [avg, max]
instant: true

- query: max(max_over_time(topk(3, sum(container_memory_rss{namespace="openshift-kube-apiserver"}) by (pod))[{{.elapsed}}:]))
metricName: max-memory-kube-apiserver
instant: true

- query: topk(3, sum(irate(container_cpu_usage_seconds_total{namespace="openshift-apiserver"}[2m])) by (pod))
- query: avg(avg_over_time(topk(3, sum(irate(container_cpu_usage_seconds_total{namespace="openshift-apiserver"}[2m])) by (pod))[{{.elapsed}}:]))
metricName: cpu-openshift-apiserver
aggregations: [avg]
instant: true

- query: topk(3, sum(container_memory_rss{namespace="openshift-apiserver"}) by (pod))
- query: avg(avg_over_time(topk(3, sum(container_memory_rss{namespace="openshift-apiserver"}) by (pod))[{{.elapsed}}:]))
metricName: memory-openshift-apiserver
aggregations: [avg, max]
instant: true

- query: max(max_over_time(topk(3, sum(container_memory_rss{namespace="openshift-apiserver"}) by (pod))[{{.elapsed}}:]))
metricName: max-memory-openshift-apiserver
instant: true

- query: topk(3, sum(irate(container_cpu_usage_seconds_total{namespace="openshift-etcd"}[2m])) by (pod))
- query: avg(avg_over_time(topk(3, sum(irate(container_cpu_usage_seconds_total{namespace="openshift-etcd"}[2m])) by (pod))[{{.elapsed}}:]))
metricName: cpu-etcd
aggregations: [avg]
instant: true

- query: topk(1, sum(irate(container_cpu_usage_seconds_total{namespace="openshift-controller-manager"}[2m])) by (pod))
- query: avg(avg_over_time(topk(1, sum(irate(container_cpu_usage_seconds_total{namespace="openshift-controller-manager"}[2m])) by (pod))[{{.elapsed}}:]))
metricName: cpu-openshift-controller-manager
aggregations: [avg]
instant: true

- query: topk(1, sum(container_memory_rss{namespace="openshift-controller-manager"}) by (pod))
- query: avg(avg_over_time(topk(1, sum(container_memory_rss{namespace="openshift-controller-manager"}) by (pod))[{{.elapsed}}:]))
metricName: memory-openshift-controller-manager
aggregations: [avg, max]
instant: true

- query: topk(3,sum(container_memory_rss{namespace="openshift-etcd", name!=""}) by (pod))
- query: max(max_over_time(topk(1, sum(container_memory_rss{namespace="openshift-controller-manager"}) by (pod))[{{.elapsed}}:]))
metricName: max-memory-openshift-controller-manager
instant: true

- query: avg(avg_over_time(topk(3,sum(container_memory_rss{namespace="openshift-etcd", name!=""}) by (pod))[{{.elapsed}}:]))
metricName: memory-etcd
aggregations: [avg, max]
instant: true

- query: max(max_over_time(topk(3,sum(container_memory_rss{namespace="openshift-etcd", name!=""}) by (pod))[{{.elapsed}}:]))
metricName: max-memory-etcd
instant: true

# OVNKubernetes

- query: irate(container_cpu_usage_seconds_total{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-master.+",container="sbdb"}[2m])
- query: avg(avg_over_time(irate(container_cpu_usage_seconds_total{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-master.+",container="sbdb"}[2m])[{{.elapsed}}:]))
metricName: cpu-ovnkube-master-sbdb
aggregations: [avg]
instant: true

- query: container_memory_rss{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-master.+",container="sbdb"}
- query: avg(avg_over_time(container_memory_rss{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-master.+",container="sbdb"}[{{.elapsed}}:]))
metricName: memory-ovnkube-master-sbdb
aggregations: [avg, max]
instant: true

- query: max(max_over_time(container_memory_rss{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-master.+",container="sbdb"}[{{.elapsed}}:]))
metricName: max-memory-ovnkube-master-sbdb
instant: true

- query: irate(container_cpu_usage_seconds_total{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-master.+",container="nbdb"}[2m])
- query: avg(avg_over_time(irate(container_cpu_usage_seconds_total{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-master.+",container="nbdb"}[2m])[{{.elapsed}}:]))
metricName: cpu-ovnkube-master-nbdb
aggregations: [avg]
instant: true

- query: container_memory_rss{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-master.+",container="nbdb"}
- query: avg(avg_over_time(container_memory_rss{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-master.+",container="nbdb"}[{{.elapsed}}:]))
metricName: memory-ovnkube-master-nbdb
aggregations: [avg, max]
instant: true

- query: irate(container_cpu_usage_seconds_total{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-master.+",container="northd"}[2m])
- query: max(max_over_time(container_memory_rss{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-master.+",container="nbdb"}[{{.elapsed}}:]))
metricName: max-memory-ovnkube-master-nbdb
instant: true

- query: avg(avg_over_time(irate(container_cpu_usage_seconds_total{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-master.+",container="northd"}[2m])[{{.elapsed}}:]))
metricName: cpu-ovnkube-master-northd
aggregations: [avg]
instant: true

- query: container_memory_rss{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-master.+",container="northd"}
- query: avg(avg_over_time(container_memory_rss{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-master.+",container="northd"}[{{.elapsed}}:]))
metricName: memory-ovnkube-master-northd
aggregations: [avg, max]
instant: true

- query: max(max_over_time(container_memory_rss{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-master.+",container="northd"}[{{.elapsed}}:]))
metricName: max-memory-ovnkube-master-northd
instant: true

- query: irate(container_cpu_usage_seconds_total{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-master.+",container="ovnkube-master"}[2m])
- query: avg(avg_over_time(irate(container_cpu_usage_seconds_total{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-master.+",container="ovnkube-master"}[2m])[{{.elapsed}}:]))
metricName: cpu-ovnkube-master-ovnkube-master
aggregations: [avg]
instant: true

- query: container_memory_rss{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-master.+",container="ovnkube-master"}
- query: avg(avg_over_time(container_memory_rss{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-master.+",container="ovnkube-master"}[{{.elapsed}}:]))
metricName: memory-ovnkube-master-ovnkube-master
aggregations: [avg, max]
instant: true

- query: irate(container_cpu_usage_seconds_total{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-master.+",container="ovn-dbchecker"}[2m])
- query: max(max_over_time(container_memory_rss{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-master.+",container="ovnkube-master"}[{{.elapsed}}:]))
metricName: max-memory-ovnkube-master-ovnkube-master
instant: true

- query: avg(avg_over_time(irate(container_cpu_usage_seconds_total{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-master.+",container="ovn-dbchecker"}[2m])[{{.elapsed}}:]))
metricName: cpu-ovnkube-master-ovn-dbchecker
aggregations: [avg]
instant: true

- query: container_memory_rss{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-master.+",container="ovn-dbchecker"}
- query: avg(avg_over_time(container_memory_rss{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-master.+",container="ovn-dbchecker"}[{{.elapsed}}:]))
metricName: memory-ovnkube-master-ovn-dbchecker
aggregations: [avg, max]
instant: true

- query: max(max_over_time(container_memory_rss{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-master.+",container="ovn-dbchecker"}[{{.elapsed}}:]))
metricName: max-memory-ovnkube-master-ovn-dbchecker
instant: true

- query: irate(container_cpu_usage_seconds_total{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-node.+",container="ovnkube-node"}[2m])
- query: avg(avg_over_time(irate(container_cpu_usage_seconds_total{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-node.+",container="ovnkube-node"}[2m])[{{.elapsed}}:]))
metricName: cpu-ovnkube-node-ovnkube-node
aggregations: [avg]
instant: true

- query: container_memory_rss{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-node.+",container="ovnkube-node"}
- query: avg(avg_over_time(container_memory_rss{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-node.+",container="ovnkube-node"}[{{.elapsed}}:]))
metricName: memory-ovnkube-node-ovnkube-node
aggregations: [avg, max]
instant: true

- query: max(max_over_time(container_memory_rss{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-node.+",container="ovnkube-node"}[{{.elapsed}}:]))
metricName: max-memory-ovnkube-node-ovnkube-node
instant: true

- query: irate(container_cpu_usage_seconds_total{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-node.+",container="ovn-controller"}[2m])
- query: avg(avg_over_time(irate(container_cpu_usage_seconds_total{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-node.+",container="ovn-controller"}[2m])[{{.elapsed}}:]))
metricName: cpu-ovnkube-node-ovn-controller
aggregations: [avg]
instant: true

- query: container_memory_rss{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-node.+",container="ovn-controller"}
- query: avg(avg_over_time(container_memory_rss{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-node.+",container="ovn-controller"}[{{.elapsed}}:]))
metricName: memory-ovnkube-node-ovn-controller
aggregations: [avg, max]
instant: true

- query: max(max_over_time(container_memory_rss{namespace="openshift-ovn-kubernetes", pod=~"ovnkube-node.+",container="ovn-controller"}[{{.elapsed}}:]))
metricName: max-memory-ovnkube-node-ovn-controller
instant: true

# Nodes

- query: sum(irate(node_cpu_seconds_total{mode!="idle", mode!="steal"}[2m]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) by (instance)
- query: avg(avg_over_time(sum(irate(node_cpu_seconds_total{mode!="idle", mode!="steal"}[2m]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) by (instance)[{{.elapsed}}:]))
metricName: cpu-masters
aggregations: [avg]
instant: true

- query: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
- query: avg(avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[{{.elapsed}}:]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)"))
metricName: memory-masters
aggregations: [avg, max]
instant: true

- query: sum(irate(node_cpu_seconds_total{mode!="idle", mode!="steal"}[2m]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) by (instance)
metricName: cpu-masters
aggregations: [avg]
- query: max(max_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[{{.elapsed}}:]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)"))
metricName: max-memory-masters
instant: true

- query: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")
- query: avg(avg_over_time(sum(irate(node_cpu_seconds_total{mode!="idle", mode!="steal"}[2m]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) by (instance)[{{.elapsed}}:]))
metricName: cpu-workers
instant: true

- query: avg(avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[{{.elapsed}}:]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))
metricName: memory-workers
aggregations: [avg, max]
instant: true

- query: sum(irate(node_cpu_seconds_total{mode!="idle", mode!="steal"}[2m]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) by (instance)
metricName: cpu-workers
aggregations: [avg]
- query: max(max_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[{{.elapsed}}:]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))
metricName: max-memory-workers
instant: true

- query: sum(irate(node_cpu_seconds_total{mode!="idle", mode!="steal"}[2m]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) by (instance)
- query: avg(avg_over_time(sum(irate(node_cpu_seconds_total{mode!="idle", mode!="steal"}[2m]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) by (instance)[{{.elapsed}}:]))
metricName: cpu-infra
aggregations: [avg]
instant: true

- query: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")
- query: avg(avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[{{.elapsed}}:]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))
metricName: memory-infra
aggregations: [avg, max]
instant: true

- query: max(max_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[{{.elapsed}}:]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))
metricName: max-memory-infra
instant: true

# Cluster

- query: cluster:memory_usage:ratio
- query: avg_over_time(cluster:memory_usage:ratio[{{.elapsed}}:])
metricName: memory-cluster-usage-ratio
aggregations: [max, avg]
instant: true

- query: cluster:node_cpu:ratio
- query: max_over_time(cluster:memory_usage:ratio[{{.elapsed}}:])
metricName: max-memory-cluster-usage-ratio
instant: true

- query: avg_over_time(cluster:node_cpu:ratio[{{.elapsed}}:])
metricName: cpu-cluster-usage-ratio
aggregations: [max, avg]
instant: true

- query: max_over_time(cluster:node_cpu:ratio[{{.elapsed}}:])
metricName: max-cpu-cluster-usage-ratio
instant: true
Loading

0 comments on commit f05d61d

Please sign in to comment.