From 2c56bfa86a99098dbcd6c9e73d30dff0c78cbd2a Mon Sep 17 00:00:00 2001 From: davidmirror-ops Date: Wed, 28 Aug 2024 16:44:29 -0500 Subject: [PATCH 01/14] Fix Accepted wf metric and successful wf Signed-off-by: davidmirror-ops --- stats/flyteuser.dashboard.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/stats/flyteuser.dashboard.py b/stats/flyteuser.dashboard.py index d554532b24..c981e6582e 100644 --- a/stats/flyteuser.dashboard.py +++ b/stats/flyteuser.dashboard.py @@ -19,11 +19,11 @@ def workflow_stats(collapse: bool) -> Row: collapse=collapse, panels=[ Graph( - title="Accepted Workflow", + title="Accepted Workflows (avg)", dataSource=DATASOURCE, targets=[ Target( - expr='sum(rate(flyte:propeller:all:workflow:accepted{project=~"$project", domain=~"$domain", wf=~"$workflow"}[5m]))', + expr='avg(flyte:propeller:all:workflow:accepted{project=~"$project", domain=~"$domain", wf=~"$workflow"})', refId='A', ), ], @@ -33,18 +33,15 @@ def workflow_stats(collapse: bool) -> Row: ), ), Graph( - title="Successful Workflow", + title="Successful Workflow execution time (ms)", dataSource=DATASOURCE, targets=[ Target( - expr='sum(rate(flyte:propeller:all:workflow:success_duration_ms_count{project=~"$project", domain=~"$domain", wf=~"$workflow"}[5m]))', + expr='sum(rate(flyte:propeller:all:workflow:event_recording:success_duration_ms_count{project=~"$project", domain=~"$domain", wf=~"$workflow"}[5m]))', refId='A', ), ], - yAxes=YAxes( - YAxis(format=OPS_FORMAT), - YAxis(format=SHORT_FORMAT), - ), + yAxes=single_y_axis(format=MILLISECONDS_FORMAT), ), Graph( title="Failed Workflow", From 4d87c7c3149feb3cfccc9b1500d53cc8df4f94cf Mon Sep 17 00:00:00 2001 From: davidmirror-ops Date: Wed, 28 Aug 2024 17:05:58 -0500 Subject: [PATCH 02/14] Change successful wf execution to avg Signed-off-by: davidmirror-ops --- stats/flyteuser.dashboard.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/stats/flyteuser.dashboard.py 
b/stats/flyteuser.dashboard.py index c981e6582e..f87cfec7a3 100644 --- a/stats/flyteuser.dashboard.py +++ b/stats/flyteuser.dashboard.py @@ -33,15 +33,15 @@ def workflow_stats(collapse: bool) -> Row: ), ), Graph( - title="Successful Workflow execution time (ms)", + title="Successful Workflow executions (avg)", dataSource=DATASOURCE, targets=[ Target( - expr='sum(rate(flyte:propeller:all:workflow:event_recording:success_duration_ms_count{project=~"$project", domain=~"$domain", wf=~"$workflow"}[5m]))', + expr='avg(flyte:propeller:all:workflow:event_recording:success_duration_ms_count{project=~"$project", domain=~"$domain", wf=~"$workflow"})', refId='A', ), ], - yAxes=single_y_axis(format=MILLISECONDS_FORMAT), + yAxes=single_y_axis(format=SHORT_FORMAT), ), Graph( title="Failed Workflow", From ca1ebc9634f3da28471b6ca1621bcad9b39aafee Mon Sep 17 00:00:00 2001 From: davidmirror-ops Date: Wed, 28 Aug 2024 17:08:37 -0500 Subject: [PATCH 03/14] Fix failed wf metric Signed-off-by: davidmirror-ops --- stats/flyteuser.dashboard.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/stats/flyteuser.dashboard.py b/stats/flyteuser.dashboard.py index f87cfec7a3..d24d5ddadb 100644 --- a/stats/flyteuser.dashboard.py +++ b/stats/flyteuser.dashboard.py @@ -48,12 +48,11 @@ def workflow_stats(collapse: bool) -> Row: dataSource=DATASOURCE, targets=[ Target( - expr='sum(rate(flyte:propeller:all:workflow:failure_duration_ms_count{project=~"$project", domain=~"$domain", wf=~"$workflow"}[5m]))', + expr='avg(flyte:propeller:all:workflow:event_recording:failure_duration_ms_count{project=~"$project", domain=~"$domain", wf=~"$workflow"})', refId='A', ), ], yAxes=YAxes( - YAxis(format=OPS_FORMAT), YAxis(format=SHORT_FORMAT), ), ), From 1890d9a49b2b6dd12482b8ea9e509d8d61ae24bd Mon Sep 17 00:00:00 2001 From: davidmirror-ops Date: Wed, 4 Sep 2024 12:21:02 -0500 Subject: [PATCH 04/14] 1st try to get kube pod label metric to work Signed-off-by: davidmirror-ops --- 
stats/flyteuser.dashboard.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/stats/flyteuser.dashboard.py b/stats/flyteuser.dashboard.py index d24d5ddadb..d91a5889d5 100644 --- a/stats/flyteuser.dashboard.py +++ b/stats/flyteuser.dashboard.py @@ -57,16 +57,15 @@ def workflow_stats(collapse: bool) -> Row: ), ), Graph( - title="Aborted Workflow", + title="Aborted Workflows (avg)", dataSource=DATASOURCE, targets=[ Target( - expr='sum(rate(flyte:propeller:all:workflow:workflow_aborted{project=~"$project", domain=~"$domain", wf=~"$workflow"}[5m]))', + expr='avg(flyte:propeller:all:workflow:workflow_aborted_unlabeled{project=~"$project", domain=~"$domain", wf=~"$workflow"})', refId='A', ), ], yAxes=YAxes( - YAxis(format=OPS_FORMAT), YAxis(format=SHORT_FORMAT), ), ), @@ -75,7 +74,7 @@ def workflow_stats(collapse: bool) -> Row: dataSource=DATASOURCE, targets=[ Target( - expr='sum(flyte:propeller:all:workflow:success_duration_ms{project=~"$project", domain=~"$domain", wf=~"$workflow"}) by (quantile)', + expr='sum(flyte:propeller:all:workflow:event_recording:success_duration_ms{project=~"$project", domain=~"$domain", wf=~"$workflow"}) by (quantile)', refId='A', ), ], @@ -86,7 +85,7 @@ def workflow_stats(collapse: bool) -> Row: dataSource=DATASOURCE, targets=[ Target( - expr='sum(flyte:propeller:all:workflow:failure_duration_ms{project=~"$project", domain=~"$domain", wf=~"$workflow"}) by (quantile)', + expr='sum(flyte:propeller:all:workflow:event_recording:failure_duration_ms{project=~"$project", domain=~"$domain", wf=~"$workflow"}) by (quantile)', refId='A', ), ], From f3225c0706a6f389d37324aed0cc7f5cd18bff7e Mon Sep 17 00:00:00 2001 From: davidmirror-ops Date: Fri, 6 Sep 2024 08:44:39 -0500 Subject: [PATCH 05/14] Fixes pending tasks query Signed-off-by: davidmirror-ops --- stats/flyteuser.dashboard.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/stats/flyteuser.dashboard.py b/stats/flyteuser.dashboard.py index 
d91a5889d5..85c67df6ab 100644 --- a/stats/flyteuser.dashboard.py +++ b/stats/flyteuser.dashboard.py @@ -96,7 +96,7 @@ def workflow_stats(collapse: bool) -> Row: dataSource=DATASOURCE, targets=[ Target( - expr='sum(flyte:propeller:all:node:queueing_latency_ms{project=~"$project", domain=~"$domain", wf=~"$workflow"}) by (quantile)', + expr='(flyte:propeller:all:node:queueing_latency_ms{project=~"$project", domain=~"$domain", wf=~"$workflow"}) by (quantile)', refId='A', ), ], @@ -163,7 +163,7 @@ def resource_stats(collapse: bool) -> Row: dataSource=DATASOURCE, targets=[ Target( - expr='sum(kube_pod_container_status_waiting * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !="",namespace=~"$project-$domain",label_workflow_name=~"$workflow"}) by (namespace, label_execution_id, label_task_name, label_node_id, label_workflow_name) > 0', + expr='sum(kube_pod_status_phase{phase="Pending"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_workflow_name=~"$workflow"}) by (namespace, label_task_name, label_node_id, label_workflow_name) > 0', refId='A', ), ], @@ -174,7 +174,7 @@ def resource_stats(collapse: bool) -> Row: dataSource=DATASOURCE, targets=[ Target( - expr='(100 * max(container_memory_rss{image!=""} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !="",namespace=~"$project-$domain",label_workflow_name=~"$workflow"} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name) / max(kube_pod_container_resource_limits_memory_bytes{container!=""} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=""} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, 
label_execution_id, label_task_name, label_node_id, label_workflow_name)) > 0', + expr='(100 * max(container_memory_rss * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !="",namespace=~"$project-$domain",label_workflow_name=~"$workflow"} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name) / max(kube_pod_container_resource_limits_memory_bytes{container!=""} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=""} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name)) > 0', refId='A', ), ], From 780fba5466818af5b55e385f026150990984f31d Mon Sep 17 00:00:00 2001 From: davidmirror-ops Date: Fri, 6 Sep 2024 11:41:58 -0500 Subject: [PATCH 06/14] Fixes memory usage query Signed-off-by: davidmirror-ops --- stats/flyteuser.dashboard.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/stats/flyteuser.dashboard.py b/stats/flyteuser.dashboard.py index 85c67df6ab..0dba8534ed 100644 --- a/stats/flyteuser.dashboard.py +++ b/stats/flyteuser.dashboard.py @@ -61,7 +61,7 @@ def workflow_stats(collapse: bool) -> Row: dataSource=DATASOURCE, targets=[ Target( - expr='avg(flyte:propeller:all:workflow:workflow_aborted_unlabeled{project=~"$project", domain=~"$domain", wf=~"$workflow"})', + expr='avg(flyte:propeller:all:workflow:workflow_aborted{project=~"$project", domain=~"$domain", wf=~"$workflow"})', refId='A', ), ], @@ -174,7 +174,7 @@ def resource_stats(collapse: bool) -> Row: dataSource=DATASOURCE, targets=[ Target( - expr='(100 * max(container_memory_rss * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id 
!="",namespace=~"$project-$domain",label_workflow_name=~"$workflow"} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name) / max(kube_pod_container_resource_limits_memory_bytes{container!=""} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=""} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name)) > 0', + expr='(100 * max(container_memory_working_set_bytes * on(pod) group_left( label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~"$project-$domain",label_workflow_name=~"$workflow"} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{container!=""} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0 ', refId='A', ), ], From e29839f87d41502984e7e67ef895ac41d95ed3fb Mon Sep 17 00:00:00 2001 From: davidmirror-ops Date: Fri, 6 Sep 2024 13:43:28 -0500 Subject: [PATCH 07/14] Fixes CPU usage query and panel type Signed-off-by: davidmirror-ops --- stats/flyteuser.dashboard.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/stats/flyteuser.dashboard.py b/stats/flyteuser.dashboard.py index 0dba8534ed..aba0c95f1f 100644 --- a/stats/flyteuser.dashboard.py +++ b/stats/flyteuser.dashboard.py @@ -1,6 +1,6 @@ import typing from grafanalib.core import ( - Alert, AlertCondition, Dashboard, Graph, + Alert, AlertCondition, Dashboard, Graph,BarChart, GreaterThan, OP_AND, OPS_FORMAT, Row, 
RTYPE_SUM, SECONDS_FORMAT, SHORT_FORMAT, single_y_axis, Target, TimeRange, YAxes, YAxis, MILLISECONDS_FORMAT, Templating, Template, DataSourceInput @@ -169,27 +169,28 @@ def resource_stats(collapse: bool) -> Row: ], yAxes=single_y_axis(format=SHORT_FORMAT), ), - Graph( - title="Memory Usage Percentage", + BarChart( + title="Task Memory Usage (%)", dataSource=DATASOURCE, targets=[ Target( expr='(100 * max(container_memory_working_set_bytes * on(pod) group_left( label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~"$project-$domain",label_workflow_name=~"$workflow"} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{container!=""} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0 ', refId='A', ), + ], - yAxes=single_y_axis(format=SHORT_FORMAT), - ), - Graph( - title="CPU Usage Percentage", + showValue='true', + ), + BarChart( + title="Task CPU Usage (%)", dataSource=DATASOURCE, targets=[ Target( - expr='(100* sum(rate(container_cpu_usage_seconds_total{image!=""}[2m]) * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !="",namespace=~"$project-$domain",label_workflow_name=~"$workflow"} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name) / sum(kube_pod_container_resource_limits_cpu_cores{container!=""} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=""} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by 
(namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name)) > 0', + expr='(100* sum(rate(container_cpu_usage_seconds_total[2m]) * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~"$project-$domain",label_workflow_name=~"$workflow"} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0', refId='A', ), ], - yAxes=single_y_axis(format=SHORT_FORMAT), + showValue='true', ), ]) From e7512cdac98be33fde4936da4f31010559cf1cf1 Mon Sep 17 00:00:00 2001 From: davidmirror-ops Date: Fri, 6 Sep 2024 14:29:50 -0500 Subject: [PATCH 08/14] Fixes CPU usage graph and remove node queuing graph Signed-off-by: davidmirror-ops --- stats/flyteuser.dashboard.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/stats/flyteuser.dashboard.py b/stats/flyteuser.dashboard.py index aba0c95f1f..eedaf16ccf 100644 --- a/stats/flyteuser.dashboard.py +++ b/stats/flyteuser.dashboard.py @@ -91,17 +91,6 @@ def workflow_stats(collapse: bool) -> Row: ], yAxes=single_y_axis(format=MILLISECONDS_FORMAT), ), - Graph( - title="Node queuing latency by Quantile", - dataSource=DATASOURCE, - targets=[ - Target( - expr='(flyte:propeller:all:node:queueing_latency_ms{project=~"$project", domain=~"$domain", wf=~"$workflow"}) by (quantile)', - refId='A', - ), - ], - yAxes=single_y_axis(format=MILLISECONDS_FORMAT), - ), ]) @staticmethod @@ -186,7 +175,7 @@ def resource_stats(collapse: bool) -> Row: dataSource=DATASOURCE, targets=[ Target( - expr='(100* sum(rate(container_cpu_usage_seconds_total[2m]) * on(pod) 
group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~"$project-$domain",label_workflow_name=~"$workflow"} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0', + expr='(max(container_cpu_usage_seconds_total * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~"$project-$domain",label_workflow_name=~"$workflow"} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0', refId='A', ), ], From fe6bb343e48d9f539e6c4cc89a978afa72fc0fca Mon Sep 17 00:00:00 2001 From: davidmirror-ops Date: Tue, 10 Sep 2024 16:13:14 -0500 Subject: [PATCH 09/14] Turn success wf exec into bargauge viz Signed-off-by: davidmirror-ops --- stats/flyteuser.dashboard.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/stats/flyteuser.dashboard.py b/stats/flyteuser.dashboard.py index eedaf16ccf..16c1977028 100644 --- a/stats/flyteuser.dashboard.py +++ b/stats/flyteuser.dashboard.py @@ -1,6 +1,6 @@ import typing from grafanalib.core import ( - Alert, AlertCondition, Dashboard, Graph,BarChart, + Alert, AlertCondition, Dashboard, Graph,BarChart,BarGauge, GreaterThan, OP_AND, OPS_FORMAT, Row, RTYPE_SUM, 
SECONDS_FORMAT, SHORT_FORMAT, single_y_axis, Target, TimeRange, YAxes, YAxis, MILLISECONDS_FORMAT, Templating, Template, DataSourceInput @@ -44,7 +44,7 @@ def workflow_stats(collapse: bool) -> Row: yAxes=single_y_axis(format=SHORT_FORMAT), ), Graph( - title="Failed Workflow", + title="Failed Workflows (avg)", dataSource=DATASOURCE, targets=[ Target( @@ -69,16 +69,17 @@ def workflow_stats(collapse: bool) -> Row: YAxis(format=SHORT_FORMAT), ), ), - Graph( - title="Successful workflow execution time by Quantile", + BarGauge( + title="Successful wf execution duration by quantile (s)", dataSource=DATASOURCE, targets=[ Target( - expr='sum(flyte:propeller:all:workflow:event_recording:success_duration_ms{project=~"$project", domain=~"$domain", wf=~"$workflow"}) by (quantile)', + expr='avg((flyte:propeller:all:workflow:success_duration_ms{project=~"$project", domain=~"$domain", wf=~"$workflow"})/1000) by(quantile)', refId='A', ), ], - yAxes=single_y_axis(format=MILLISECONDS_FORMAT), + orientation='horizontal', + format=SECONDS_FORMAT ), Graph( title="Failed workflow execution time by Quantile", From d78dacd2d978cf0fcefe9a56d2e536375b3a7aca Mon Sep 17 00:00:00 2001 From: davidmirror-ops Date: Tue, 10 Sep 2024 18:15:57 -0500 Subject: [PATCH 10/14] Turn failure duration quantile into bargauge Signed-off-by: davidmirror-ops --- stats/flyteuser.dashboard.py | 38 ++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 21 deletions(-) diff --git a/stats/flyteuser.dashboard.py b/stats/flyteuser.dashboard.py index 16c1977028..b15f4eb7ef 100644 --- a/stats/flyteuser.dashboard.py +++ b/stats/flyteuser.dashboard.py @@ -79,66 +79,62 @@ def workflow_stats(collapse: bool) -> Row: ), ], orientation='horizontal', - format=SECONDS_FORMAT + format=SECONDS_FORMAT, ), - Graph( + BarGauge( title="Failed workflow execution time by Quantile", dataSource=DATASOURCE, targets=[ Target( - expr='sum(flyte:propeller:all:workflow:event_recording:failure_duration_ms{project=~"$project", 
domain=~"$domain", wf=~"$workflow"}) by (quantile)', + expr='avg(flyte:propeller:all:workflow:failure_duration_ms{project=~"$project", domain=~"$domain", wf=~"$workflow"}) by (quantile)', refId='A', ), ], - yAxes=single_y_axis(format=MILLISECONDS_FORMAT), + orientation='horizontal', + format=SECONDS_FORMAT, ), ]) @staticmethod def quota_stats(collapse: bool) -> Row: return Row( - title="Kubernetes Quota Usage stats", + title="Kubernetes Resource Quota Usage", collapse=collapse, panels=[ Graph( - title="CPU Limits vs usage", + title="CPU Limits vs requested by namespace", dataSource=DATASOURCE, targets=[ Target( expr='kube_resourcequota{resource="limits.cpu", namespace="$project-$domain", type="hard"}', refId='A', - legendFormat="max cpu", + legendFormat="CPU limit", ), Target( expr='kube_resourcequota{resource="limits.cpu", namespace="$project-$domain", type="used"}', refId='B', - legendFormat="used cpu", + legendFormat="CPU requested", ), ], yAxes=YAxes( - YAxis(format=OPS_FORMAT), YAxis(format=SHORT_FORMAT), ), ), Graph( - title="Mem Limits vs usage", + title="Memory limit vs requested by namespace (MiB)", dataSource=DATASOURCE, targets=[ Target( - expr='kube_resourcequota{resource="limits.memory", namespace="$project-$domain", type="hard"}', + expr='(kube_resourcequota{resource="limits.memory", namespace="$project-$domain", type="hard"})*9.5367e-7', refId='A', - legendFormat="max mem", + legendFormat="Memory limit (MiB)", ), Target( - expr='kube_resourcequota{resource="limits.memory", namespace="$project-$domain", type="used"}', + expr='(kube_resourcequota{resource="limits.memory", namespace="$project-$domain", type="used"})*9.5367e-7', refId='B', - legendFormat="used mem", + legendFormat="Memory requested (MiB)", ), ], - yAxes=YAxes( - YAxis(format=OPS_FORMAT), - YAxis(format=SHORT_FORMAT), - ), ), ]) @@ -149,7 +145,7 @@ def resource_stats(collapse: bool) -> Row: collapse=collapse, panels=[ Graph( - title="Pending tasks", + title="# of Pending Tasks", 
dataSource=DATASOURCE, targets=[ Target( @@ -160,7 +156,7 @@ def resource_stats(collapse: bool) -> Row: yAxes=single_y_axis(format=SHORT_FORMAT), ), BarChart( - title="Task Memory Usage (%)", + title="Memory Usage per Task(%)", dataSource=DATASOURCE, targets=[ Target( @@ -172,7 +168,7 @@ def resource_stats(collapse: bool) -> Row: showValue='true', ), BarChart( - title="Task CPU Usage (%)", + title="CPU Usage per Task(%)", dataSource=DATASOURCE, targets=[ Target( From 6ef3a755bbbda12536aeb65066aa1ecc1526014c Mon Sep 17 00:00:00 2001 From: davidmirror-ops Date: Thu, 12 Sep 2024 15:49:15 -0500 Subject: [PATCH 11/14] Minor fixes Signed-off-by: davidmirror-ops --- .../stats/prometheus/flyteuser.dashboard.json | 1797 +++++++++++++++++ stats/flyteuser.dashboard.py | 8 +- 2 files changed, 1801 insertions(+), 4 deletions(-) create mode 100644 deployment/stats/prometheus/flyteuser.dashboard.json diff --git a/deployment/stats/prometheus/flyteuser.dashboard.json b/deployment/stats/prometheus/flyteuser.dashboard.json new file mode 100644 index 0000000000..c38f12edab --- /dev/null +++ b/deployment/stats/prometheus/flyteuser.dashboard.json @@ -0,0 +1,1797 @@ +{ + "__inputs": [ + { + "description": "Prometheus server that connects to Flyte", + "label": "Prometheus", + "name": "DS_PROM", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "annotations": { + "list": [] + }, + "description": "Flyte User Dashboard. 
It's designed to give an overview of execution status and resource consumption.", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "panels": [], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_PROM}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + }, + "unit": "" + } + }, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": null, + "height": null, + "hideTimeOverride": false, + "id": 1, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "sort": null, + "sortDesc": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "connected", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": 2, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": null, + "expr": "avg(flyte:propeller:all:workflow:accepted{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "metric": "", + "query": "avg(flyte:propeller:all:workflow:accepted{project=~\"$project\", domain=~\"$domain\", 
wf=~\"$workflow\"})", + "refId": "A", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Accepted Workflows (avg)", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_PROM}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + }, + "unit": "" + } + }, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": null, + "height": null, + "hideTimeOverride": false, + "id": 2, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "sort": null, + "sortDesc": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "connected", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": 2, + "stack": 
false, + "steppedLine": false, + "targets": [ + { + "datasource": null, + "expr": "avg(flyte:propeller:all:workflow:event_recording:success_duration_ms_count{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "metric": "", + "query": "avg(flyte:propeller:all:workflow:event_recording:success_duration_ms_count{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})", + "refId": "A", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Successful Workflow executions (avg)", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_PROM}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + }, + "unit": "" + } + }, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": null, + "height": null, + "hideTimeOverride": false, + "id": 3, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": 
false, + "show": true, + "sideWidth": null, + "sort": null, + "sortDesc": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "connected", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": 2, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": null, + "expr": "avg(flyte:propeller:all:workflow:event_recording:failure_duration_ms_count{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "metric": "", + "query": "avg(flyte:propeller:all:workflow:event_recording:failure_duration_ms_count{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})", + "refId": "A", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Failed Workflows (avg)", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_PROM}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", 
+ "steps": [] + }, + "unit": "" + } + }, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": null, + "height": null, + "hideTimeOverride": false, + "id": 4, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "sort": null, + "sortDesc": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "connected", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": 2, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": null, + "expr": "avg(flyte:propeller:all:workflow:workflow_aborted{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "metric": "", + "query": "avg(flyte:propeller:all:workflow:workflow_aborted{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})", + "refId": "A", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Aborted Workflows (avg)", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 
null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "cacheTimeout": null, + "datasource": "${DS_PROM}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "index": 0, + "line": true, + "op": "gt", + "value": "null", + "yaxis": "left" + }, + { + "color": "red", + "index": 1, + "line": true, + "op": "gt", + "value": 80.0, + "yaxis": "left" + } + ] + } + } + }, + "gridPos": null, + "height": null, + "hideTimeOverride": false, + "id": 5, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "options": { + "displayMode": "lcd", + "fieldOptions": { + "calcs": [ + "mean" + ], + "defaults": { + "decimals": null, + "links": [], + "max": 100, + "min": 0, + "title": null, + "unit": "s" + }, + "limit": null, + "mappings": [], + "override": {}, + "thresholds": [ + { + "color": "green", + "index": 0, + "line": true, + "op": "gt", + "value": "null", + "yaxis": "left" + }, + { + "color": "red", + "index": 1, + "line": true, + "op": "gt", + "value": 80.0, + "yaxis": "left" + } + ], + "values": false + }, + "orientation": "horizontal", + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "repeat": null, + "repeatDirection": null, + "span": 2, + "targets": [ + { + "datasource": null, + "expr": "avg((flyte:propeller:all:workflow:success_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})/1000) by(quantile)", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "metric": "", + "query": "avg((flyte:propeller:all:workflow:success_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})/1000) by(quantile)", + "refId": 
"A", + "step": 10, + "target": "" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Successful wf execution duration by quantile (s)", + "transformations": [], + "transparent": false, + "type": "bargauge" + }, + { + "cacheTimeout": null, + "datasource": "${DS_PROM}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "index": 0, + "line": true, + "op": "gt", + "value": "null", + "yaxis": "left" + }, + { + "color": "red", + "index": 1, + "line": true, + "op": "gt", + "value": 80.0, + "yaxis": "left" + } + ] + } + } + }, + "gridPos": null, + "height": null, + "hideTimeOverride": false, + "id": 6, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "options": { + "displayMode": "lcd", + "fieldOptions": { + "calcs": [ + "mean" + ], + "defaults": { + "decimals": null, + "links": [], + "max": 100, + "min": 0, + "title": null, + "unit": "s" + }, + "limit": null, + "mappings": [], + "override": {}, + "thresholds": [ + { + "color": "green", + "index": 0, + "line": true, + "op": "gt", + "value": "null", + "yaxis": "left" + }, + { + "color": "red", + "index": 1, + "line": true, + "op": "gt", + "value": 80.0, + "yaxis": "left" + } + ], + "values": false + }, + "orientation": "horizontal", + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "repeat": null, + "repeatDirection": null, + "span": 2, + "targets": [ + { + "datasource": null, + "expr": "avg(flyte:propeller:all:workflow:failure_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by (quantile)", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "metric": "", + "query": "avg(flyte:propeller:all:workflow:failure_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by (quantile)", + "refId": "A", + 
"step": 10, + "target": "" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Failed workflow execution time by Quantile", + "transformations": [], + "transparent": false, + "type": "bargauge" + } + ], + "repeat": null, + "showTitle": true, + "title": "Workflow Stats" + }, + { + "collapse": true, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_PROM}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + }, + "unit": "" + } + }, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": null, + "height": null, + "hideTimeOverride": false, + "id": 7, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "sort": null, + "sortDesc": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "connected", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": null, + "expr": "kube_resourcequota{resource=\"limits.cpu\", namespace=\"$project-$domain\", type=\"hard\"}", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "CPU limit", + "metric": "", + "query": "kube_resourcequota{resource=\"limits.cpu\", 
namespace=\"$project-$domain\", type=\"hard\"}", + "refId": "A", + "step": 10, + "target": "" + }, + { + "datasource": null, + "expr": "kube_resourcequota{resource=\"limits.cpu\", namespace=\"$project-$domain\", type=\"used\"}", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "CPU requested", + "metric": "", + "query": "kube_resourcequota{resource=\"limits.cpu\", namespace=\"$project-$domain\", type=\"used\"}", + "refId": "B", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "CPU Limit vs requested by namespace", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_PROM}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + }, + "unit": "" + } + }, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": null, + "height": null, + "hideTimeOverride": false, + "id": 8, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + 
"sideWidth": null, + "sort": null, + "sortDesc": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "connected", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": null, + "expr": "(kube_resourcequota{resource=\"limits.memory\", namespace=\"$project-$domain\", type=\"hard\"})*9.5367e-7", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Memory limit (MiB)", + "metric": "", + "query": "(kube_resourcequota{resource=\"limits.memory\", namespace=\"$project-$domain\", type=\"hard\"})*9.5367e-7", + "refId": "A", + "step": 10, + "target": "" + }, + { + "datasource": null, + "expr": "(kube_resourcequota{resource=\"limits.memory\", namespace=\"$project-$domain\", type=\"used\"})*9.5367e-7", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "Memory requested (MiB)", + "metric": "", + "query": "(kube_resourcequota{resource=\"limits.memory\", namespace=\"$project-$domain\", type=\"used\"})*9.5367e-7", + "refId": "B", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory limit vs requested by namespace (MiB)", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": 
true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + } + ], + "repeat": null, + "showTitle": true, + "title": "Kubernetes Resource Quota Usage" + }, + { + "collapse": true, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_PROM}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + }, + "unit": "" + } + }, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": null, + "height": null, + "hideTimeOverride": false, + "id": 9, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "sort": null, + "sortDesc": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "connected", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": null, + "expr": "sum(kube_pod_status_phase{phase=\"Pending\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_workflow_name=~\"$workflow\"}) by (namespace, label_task_name, label_node_id, label_workflow_name) > 0", + "format": "time_series", + "hide": false, + "instant": false, 
+ "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "metric": "", + "query": "sum(kube_pod_status_phase{phase=\"Pending\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_workflow_name=~\"$workflow\"}) by (namespace, label_task_name, label_node_id, label_workflow_name) > 0", + "refId": "A", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Pending Tasks", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "cacheTimeout": null, + "datasource": "${DS_PROM}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "none", + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 1, + "scaleDistribution": { + "type": "linear" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": null, + "height": null, + "hideTimeOverride": false, + "id": 10, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "options": { + "barRadius": 
0.0, + "barWidth": 0.97, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "orientation": "auto", + "showValue": "true", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "repeat": null, + "repeatDirection": null, + "span": 4, + "targets": [ + { + "datasource": null, + "expr": "(100 * max(container_memory_working_set_bytes * on(pod) group_left( label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0 ", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "metric": "", + "query": "(100 * max(container_memory_working_set_bytes * on(pod) group_left( label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0 ", + "refId": "A", + "step": 10, + 
"target": "" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Usage per Task(%)", + "transformations": [], + "transparent": false, + "type": "barchart" + }, + { + "cacheTimeout": null, + "datasource": "${DS_PROM}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "none", + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 1, + "scaleDistribution": { + "type": "linear" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": null, + "height": null, + "hideTimeOverride": false, + "id": 11, + "interval": null, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "options": { + "barRadius": 0.0, + "barWidth": 0.97, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "orientation": "auto", + "showValue": "true", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "repeat": null, + "repeatDirection": null, + "span": 4, + "targets": [ + { + "datasource": null, + "expr": "(max(container_cpu_usage_seconds_total * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits * on(pod) 
group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "metric": "", + "query": "(max(container_cpu_usage_seconds_total * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0", + "refId": "A", + "step": 10, + "target": "" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage per Task(%)", + "transformations": [], + "transparent": false, + "type": "barchart" + } + ], + "repeat": null, + "showTitle": true, + "title": "Task stats" + }, + { + "collapse": true, + "editable": true, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_PROM}", + "description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + }, + "unit": "" + } + }, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": null, + "height": null, + "hideTimeOverride": false, + "id": 12, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": false, + 
"avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "sort": null, + "sortDesc": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "connected", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": null, + "expr": "sum(rate(flyte:propeller:all:node:user_error_duration_ms_count{project=~\"$project\",domain=~\"$domain\",wf=~\"$project:$domain:$workflow\"}[5m]))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "metric": "", + "query": "sum(rate(flyte:propeller:all:node:user_error_duration_ms_count{project=~\"$project\",domain=~\"$domain\",wf=~\"$project:$domain:$workflow\"}[5m]))", + "refId": "A", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "User errors", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "transformations": [], + "transparent": false, + "type": "graph", + "xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + }, + { + "aliasColors": {}, + "bars": false, + "cacheTimeout": null, + "datasource": "${DS_PROM}", + 
"description": null, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "thresholds": { + "mode": "absolute", + "steps": [] + }, + "unit": "" + } + }, + "fill": 1, + "grid": { + "threshold1": null, + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2": null, + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "gridPos": null, + "height": null, + "hideTimeOverride": false, + "id": 13, + "interval": null, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "sort": null, + "sortDesc": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "maxDataPoints": 100, + "maxPerRow": null, + "minSpan": null, + "nullPointMode": "connected", + "options": { + "alertThreshold": true, + "dataLinks": [] + }, + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "repeatDirection": null, + "seriesOverrides": [], + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": null, + "expr": "sum(rate(flyte:propeller:all:node:system_error_duration_ms_count{project=~\"$project\",domain=~\"$domain\",wf=~\"$project:$domain:$workflow\"}[5m]))", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "", + "metric": "", + "query": "sum(rate(flyte:propeller:all:node:system_error_duration_ms_count{project=~\"$project\",domain=~\"$domain\",wf=~\"$project:$domain:$workflow\"}[5m]))", + "refId": "A", + "step": 10, + "target": "" + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "System errors", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "transformations": [], + "transparent": false, + "type": "graph", + 
"xaxis": { + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "decimals": null, + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": 0 + } + } + ], + "repeat": null, + "showTitle": true, + "title": "Error (System vs User)" + } + ], + "schemaVersion": 12, + "sharedCrosshair": false, + "style": "dark", + "tags": [ + "flyte", + "prometheus", + "flyteuser", + "flyte-user" + ], + "templating": { + "list": [ + { + "allValue": null, + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "selected": false, + "tags": [], + "text": null, + "value": null + }, + "datasource": "${DS_PROM}", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "project", + "options": [], + "query": "label_values(flyte:propeller:all:collector:flyteworkflow, project)", + "refresh": 1, + "regex": null, + "sort": true, + "tagValuesQuery": null, + "tagsQuery": null, + "type": "query", + "useTags": false + }, + { + "allValue": null, + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "selected": false, + "tags": [], + "text": null, + "value": null + }, + "datasource": "${DS_PROM}", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "domain", + "options": [], + "query": "label_values(flyte:propeller:all:collector:flyteworkflow, domain)", + "refresh": 1, + "regex": null, + "sort": true, + "tagValuesQuery": null, + "tagsQuery": null, + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "selected": false, + "tags": [], + "text": null, + "value": null + }, + "datasource": "${DS_PROM}", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + 
"name": "workflow", + "options": [], + "query": "label_values(flyte:propeller:all:collector:flyteworkflow, wf)", + "refresh": 1, + "regex": null, + "sort": true, + "tagValuesQuery": null, + "tagsQuery": null, + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "hidden": false, + "nowDelay": null, + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "Flyte User Dashboard (via Prometheus)", + "uid": null, + "version": 0 +} diff --git a/stats/flyteuser.dashboard.py b/stats/flyteuser.dashboard.py index b15f4eb7ef..b0d3e84fea 100644 --- a/stats/flyteuser.dashboard.py +++ b/stats/flyteuser.dashboard.py @@ -102,7 +102,7 @@ def quota_stats(collapse: bool) -> Row: collapse=collapse, panels=[ Graph( - title="CPU Limits vs requested by namespace", + title="CPU Limit vs requested by namespace", dataSource=DATASOURCE, targets=[ Target( @@ -145,7 +145,7 @@ def resource_stats(collapse: bool) -> Row: collapse=collapse, panels=[ Graph( - title="# of Pending Tasks", + title="Pending Tasks", dataSource=DATASOURCE, targets=[ Target( @@ -183,7 +183,7 @@ def resource_stats(collapse: bool) -> Row: @staticmethod def errors(collapse: bool) -> Row: return Row( - title="Error (System vs user)", + title="Error (System vs User)", collapse=collapse, panels=[ Graph( @@ -262,7 +262,7 @@ def create_all_rows(interval: int) -> typing.List[Row]: domain_template, wf_template, ]), - description="Flyte User Dashboard. This is great to get a birds-eye and drill down view of executions in your Flyte cluster. Useful for the user.", + description="Flyte User Dashboard. 
It's designed to give an overview of execution status and resource consumption.", ).auto_panel_ids() if __name__ == "__main__": From ef07139bb4fe2387c43a3137d652125f88cff3bc Mon Sep 17 00:00:00 2001 From: davidmirror-ops Date: Mon, 16 Sep 2024 18:10:54 -0500 Subject: [PATCH 12/14] Change to rates from counters Signed-off-by: davidmirror-ops --- .../stats/prometheus/flyteuser-dashboard.json | 777 +++++++----------- stats/flyteuser.dashboard.py | 26 +- 2 files changed, 297 insertions(+), 506 deletions(-) diff --git a/deployment/stats/prometheus/flyteuser-dashboard.json b/deployment/stats/prometheus/flyteuser-dashboard.json index 55c7ad5851..210fef8369 100644 --- a/deployment/stats/prometheus/flyteuser-dashboard.json +++ b/deployment/stats/prometheus/flyteuser-dashboard.json @@ -12,7 +12,7 @@ "annotations": { "list": [] }, - "description": "Flyte User Dashboard. This is great to get a birds-eye and drill down view of executions in your Flyte cluster. Useful for the user.", + "description": "Flyte User Dashboard. 
It's designed to give an overview of execution status and resource consumption.", "editable": false, "gnetId": null, "graphTooltip": 0, @@ -40,7 +40,8 @@ "thresholds": { "mode": "absolute", "steps": [] - } + }, + "unit": "" } }, "fill": 1, @@ -96,7 +97,7 @@ "targets": [ { "datasource": null, - "expr": "sum(rate(flyte:propeller:all:workflow:accepted{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}[5m]))", + "expr": "avg(flyte:propeller:all:workflow:accepted{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})", "format": "time_series", "hide": false, "instant": false, @@ -104,7 +105,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(rate(flyte:propeller:all:workflow:accepted{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}[5m]))", + "query": "avg(flyte:propeller:all:workflow:accepted{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})", "refId": "A", "step": 10, "target": "" @@ -113,7 +114,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Accepted Workflow", + "title": "Accepted Workflows (avg)", "tooltip": { "msResolution": true, "shared": true, @@ -167,7 +168,8 @@ "thresholds": { "mode": "absolute", "steps": [] - } + }, + "unit": "" } }, "fill": 1, @@ -240,7 +242,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Successful Workflow", + "title": "Workflow success rate", "tooltip": { "msResolution": true, "shared": true, @@ -294,7 +296,8 @@ "thresholds": { "mode": "absolute", "steps": [] - } + }, + "unit": "" } }, "fill": 1, @@ -367,7 +370,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Failed Workflow", + "title": "Workflow failure rate", "tooltip": { "msResolution": true, "shared": true, @@ -421,7 +424,8 @@ "thresholds": { "mode": "absolute", "steps": [] - } + }, + "unit": "" } }, "fill": 1, @@ -477,7 +481,7 @@ "targets": [ { "datasource": null, - "expr": 
"sum(rate(flyte:propeller:all:workflow:workflow_aborted{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}[5m]))", + "expr": "avg_over_time(flyte:propeller:all:workflow:workflow_aborted{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}[5m])", "format": "time_series", "hide": false, "instant": false, @@ -485,7 +489,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(rate(flyte:propeller:all:workflow:workflow_aborted{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}[5m]))", + "query": "avg_over_time(flyte:propeller:all:workflow:workflow_aborted{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}[5m])", "refId": "A", "step": 10, "target": "" @@ -494,7 +498,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Aborted Workflow", + "title": "Aborted Workflows (avg)", "tooltip": { "msResolution": true, "shared": true, @@ -513,7 +517,7 @@ "yaxes": [ { "decimals": null, - "format": "ops", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -536,8 +540,6 @@ } }, { - "aliasColors": {}, - "bars": false, "cacheTimeout": null, "datasource": "${DS_PROM}", "description": null, @@ -547,64 +549,84 @@ "defaults": { "thresholds": { "mode": "absolute", - "steps": [] + "steps": [ + { + "color": "green", + "index": 0, + "line": true, + "op": "gt", + "value": "null", + "yaxis": "left" + }, + { + "color": "red", + "index": 1, + "line": true, + "op": "gt", + "value": 80.0, + "yaxis": "left" + } + ] } } }, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, "gridPos": null, "height": null, "hideTimeOverride": false, "id": 5, "interval": null, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - 
"sort": null, - "sortDesc": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, "links": [], "maxDataPoints": 100, "maxPerRow": null, "minSpan": null, - "nullPointMode": "connected", "options": { - "alertThreshold": true, - "dataLinks": [] + "displayMode": "lcd", + "fieldOptions": { + "calcs": [ + "mean" + ], + "defaults": { + "decimals": null, + "links": [], + "max": 100, + "min": 0, + "title": null, + "unit": "s" + }, + "limit": null, + "mappings": [], + "override": {}, + "thresholds": [ + { + "color": "green", + "index": 0, + "line": true, + "op": "gt", + "value": "null", + "yaxis": "left" + }, + { + "color": "red", + "index": 1, + "line": true, + "op": "gt", + "value": 80.0, + "yaxis": "left" + } + ], + "values": false + }, + "orientation": "horizontal", + "showThresholdLabels": false, + "showThresholdMarkers": true }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", "repeat": null, "repeatDirection": null, - "seriesOverrides": [], "span": 2, - "stack": false, - "steppedLine": false, "targets": [ { "datasource": null, - "expr": "sum(flyte:propeller:all:workflow:success_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by (quantile)", + "expr": "sum(flyte:propeller:all:workflow:success_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by(quantile)", "format": "time_series", "hide": false, "instant": false, @@ -612,59 +634,20 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(flyte:propeller:all:workflow:success_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by (quantile)", + "query": "sum(flyte:propeller:all:workflow:success_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by(quantile)", "refId": "A", "step": 10, "target": "" } ], - "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Successful workflow execution time by Quantile", - "tooltip": { - 
"msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, + "title": "Successful wf execution duration by quantile", "transformations": [], "transparent": false, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": 0 - } + "type": "bargauge" }, { - "aliasColors": {}, - "bars": false, "cacheTimeout": null, "datasource": "${DS_PROM}", "description": null, @@ -674,191 +657,84 @@ "defaults": { "thresholds": { "mode": "absolute", - "steps": [] + "steps": [ + { + "color": "green", + "index": 0, + "line": true, + "op": "gt", + "value": "null", + "yaxis": "left" + }, + { + "color": "red", + "index": 1, + "line": true, + "op": "gt", + "value": 80.0, + "yaxis": "left" + } + ] } } }, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, "gridPos": null, "height": null, "hideTimeOverride": false, "id": 6, "interval": null, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "sort": null, - "sortDesc": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, "links": [], "maxDataPoints": 100, "maxPerRow": null, "minSpan": null, - "nullPointMode": "connected", "options": { - "alertThreshold": true, - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "repeatDirection": null, - "seriesOverrides": [], - 
"span": 2, - "stack": false, - "steppedLine": false, - "targets": [ - { - "datasource": null, - "expr": "sum(flyte:propeller:all:workflow:failure_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by (quantile)", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "", - "metric": "", - "query": "sum(flyte:propeller:all:workflow:failure_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by (quantile)", - "refId": "A", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Failed workflow execution time by Quantile", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "transformations": [], - "transparent": false, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "displayMode": "lcd", + "fieldOptions": { + "calcs": [ + "mean" + ], + "defaults": { + "decimals": null, + "links": [], + "max": 100, + "min": 0, + "title": null, + "unit": "s" + }, + "limit": null, + "mappings": [], + "override": {}, + "thresholds": [ + { + "color": "green", + "index": 0, + "line": true, + "op": "gt", + "value": "null", + "yaxis": "left" + }, + { + "color": "red", + "index": 1, + "line": true, + "op": "gt", + "value": 80.0, + "yaxis": "left" + } + ], + "values": false }, - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": 0 - } - }, - { - "aliasColors": {}, - "bars": false, - "cacheTimeout": null, - "datasource": "${DS_PROM}", - "description": null, - "editable": true, - "error": false, - "fieldConfig": { - "defaults": { - "thresholds": { - "mode": 
"absolute", - "steps": [] - } - } - }, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" + "orientation": "horizontal", + "showThresholdLabels": false, + "showThresholdMarkers": true }, - "gridPos": null, - "height": null, - "hideTimeOverride": false, - "id": 7, - "interval": null, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "sort": null, - "sortDesc": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "maxDataPoints": 100, - "maxPerRow": null, - "minSpan": null, - "nullPointMode": "connected", - "options": { - "alertThreshold": true, - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", "repeat": null, "repeatDirection": null, - "seriesOverrides": [], "span": 2, - "stack": false, - "steppedLine": false, "targets": [ { "datasource": null, - "expr": "sum(flyte:propeller:all:node:queueing_latency_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by (quantile)", + "expr": "avg((flyte:propeller:all:workflow:failure_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})/1000) by(quantile)", "format": "time_series", "hide": false, "instant": false, @@ -866,55 +742,18 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(flyte:propeller:all:node:queueing_latency_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by (quantile)", + "query": "avg((flyte:propeller:all:workflow:failure_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})/1000) by(quantile)", "refId": "A", "step": 10, "target": "" } ], - "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Node queuing 
latency by Quantile", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, + "title": "Failed wf execution duration by quantile", "transformations": [], "transparent": false, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": 0 - } + "type": "bargauge" } ], "repeat": null, @@ -939,7 +778,8 @@ "thresholds": { "mode": "absolute", "steps": [] - } + }, + "unit": "" } }, "fill": 1, @@ -952,7 +792,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 8, + "id": 7, "interval": null, "isNew": true, "legend": { @@ -1001,7 +841,7 @@ "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "max cpu", + "legendFormat": "CPU limit", "metric": "", "query": "kube_resourcequota{resource=\"limits.cpu\", namespace=\"$project-$domain\", type=\"hard\"}", "refId": "A", @@ -1016,7 +856,7 @@ "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "used cpu", + "legendFormat": "CPU requested", "metric": "", "query": "kube_resourcequota{resource=\"limits.cpu\", namespace=\"$project-$domain\", type=\"used\"}", "refId": "B", @@ -1027,7 +867,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "CPU Limits vs usage", + "title": "CPU Limit vs requested by namespace", "tooltip": { "msResolution": true, "shared": true, @@ -1046,7 +886,7 @@ "yaxes": [ { "decimals": null, - "format": "ops", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -1081,7 +921,8 @@ "thresholds": { "mode": "absolute", "steps": [] - } + }, + "unit": "" } }, "fill": 1, @@ -1094,7 +935,7 @@ "gridPos": null, "height": null, 
"hideTimeOverride": false, - "id": 9, + "id": 8, "interval": null, "isNew": true, "legend": { @@ -1137,30 +978,30 @@ "targets": [ { "datasource": null, - "expr": "kube_resourcequota{resource=\"limits.memory\", namespace=\"$project-$domain\", type=\"hard\"}", + "expr": "(kube_resourcequota{resource=\"limits.memory\", namespace=\"$project-$domain\", type=\"hard\"})*9.5367e-7", "format": "time_series", "hide": false, "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "max mem", + "legendFormat": "Memory limit (MiB)", "metric": "", - "query": "kube_resourcequota{resource=\"limits.memory\", namespace=\"$project-$domain\", type=\"hard\"}", + "query": "(kube_resourcequota{resource=\"limits.memory\", namespace=\"$project-$domain\", type=\"hard\"})*9.5367e-7", "refId": "A", "step": 10, "target": "" }, { "datasource": null, - "expr": "kube_resourcequota{resource=\"limits.memory\", namespace=\"$project-$domain\", type=\"used\"}", + "expr": "(kube_resourcequota{resource=\"limits.memory\", namespace=\"$project-$domain\", type=\"used\"})*9.5367e-7", "format": "time_series", "hide": false, "instant": false, "interval": "", "intervalFactor": 2, - "legendFormat": "used mem", + "legendFormat": "Memory requested (MiB)", "metric": "", - "query": "kube_resourcequota{resource=\"limits.memory\", namespace=\"$project-$domain\", type=\"used\"}", + "query": "(kube_resourcequota{resource=\"limits.memory\", namespace=\"$project-$domain\", type=\"used\"})*9.5367e-7", "refId": "B", "step": 10, "target": "" @@ -1169,7 +1010,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Mem Limits vs usage", + "title": "Memory limit vs requested by namespace (MiB)", "tooltip": { "msResolution": true, "shared": true, @@ -1188,7 +1029,7 @@ "yaxes": [ { "decimals": null, - "format": "ops", + "format": "short", "label": null, "logBase": 1, "max": null, @@ -1213,7 +1054,7 @@ ], "repeat": null, "showTitle": true, - "title": "Kubernetes Quota Usage stats" + "title": 
"Kubernetes Resource Quota Usage" }, { "collapse": true, @@ -1233,7 +1074,8 @@ "thresholds": { "mode": "absolute", "steps": [] - } + }, + "unit": "" } }, "fill": 1, @@ -1246,7 +1088,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 10, + "id": 9, "interval": null, "isNew": true, "legend": { @@ -1289,7 +1131,7 @@ "targets": [ { "datasource": null, - "expr": "sum(kube_pod_container_status_waiting * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=\"\",namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"}) by (namespace, label_execution_id, label_task_name, label_node_id, label_workflow_name) > 0", + "expr": "sum(kube_pod_status_phase{phase=\"Pending\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_workflow_name=~\"$workflow\"}) by (namespace, label_task_name, label_node_id, label_workflow_name) > 0", "format": "time_series", "hide": false, "instant": false, @@ -1297,7 +1139,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(kube_pod_container_status_waiting * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=\"\",namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"}) by (namespace, label_execution_id, label_task_name, label_node_id, label_workflow_name) > 0", + "query": "sum(kube_pod_status_phase{phase=\"Pending\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_workflow_name=~\"$workflow\"}) by (namespace, label_task_name, label_node_id, label_workflow_name) > 0", "refId": "A", "step": 10, "target": "" @@ -1306,7 +1148,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Pending tasks", + "title": "Pending Tasks", "tooltip": { "msResolution": true, "shared": true, @@ -1348,8 +1190,6 @@ } }, { - "aliasColors": {}, - 
"bars": false, "cacheTimeout": null, "datasource": "${DS_PROM}", "description": null, @@ -1357,66 +1197,80 @@ "error": false, "fieldConfig": { "defaults": { + "color": { + "fixedColor": "none", + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 1, + "scaleDistribution": { + "type": "linear" + } + }, + "mappings": [], "thresholds": { "mode": "absolute", - "steps": [] + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] } - } - }, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "overrides": [] }, "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 11, + "id": 10, "interval": null, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "sort": null, - "sortDesc": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, "links": [], "maxDataPoints": 100, "maxPerRow": null, "minSpan": null, - "nullPointMode": "connected", "options": { - "alertThreshold": true, - "dataLinks": [] + "barRadius": 0.0, + "barWidth": 0.97, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "orientation": "auto", + "showValue": "true", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", "repeat": null, "repeatDirection": null, - 
"seriesOverrides": [], "span": 4, - "stack": false, - "steppedLine": false, "targets": [ { "datasource": null, - "expr": "(100 * max(container_memory_rss{image!=\"\"} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=\"\",namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name) / max(kube_pod_container_resource_limits_memory_bytes{container!=\"\"} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=\"\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name)) > 0", + "expr": "(100 * max(container_memory_working_set_bytes * on(pod) group_left( label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0 ", "format": "time_series", "hide": false, "instant": false, @@ -1424,59 +1278,20 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "(100 * max(container_memory_rss{image!=\"\"} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id 
!=\"\",namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name) / max(kube_pod_container_resource_limits_memory_bytes{container!=\"\"} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=\"\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name)) > 0", + "query": "(100 * max(container_memory_working_set_bytes * on(pod) group_left( label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0 ", "refId": "A", "step": 10, "target": "" } ], - "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Memory Usage Percentage", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, + "title": "Memory Usage per Task(%)", "transformations": [], "transparent": false, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": 
true - } - ], - "yaxis": { - "align": false, - "alignLevel": 0 - } + "type": "barchart" }, { - "aliasColors": {}, - "bars": false, "cacheTimeout": null, "datasource": "${DS_PROM}", "description": null, @@ -1484,66 +1299,80 @@ "error": false, "fieldConfig": { "defaults": { + "color": { + "fixedColor": "none", + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 1, + "scaleDistribution": { + "type": "linear" + } + }, + "mappings": [], "thresholds": { "mode": "absolute", - "steps": [] + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] } - } - }, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "overrides": [] }, "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 12, + "id": 11, "interval": null, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "sort": null, - "sortDesc": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, "links": [], "maxDataPoints": 100, "maxPerRow": null, "minSpan": null, - "nullPointMode": "connected", "options": { - "alertThreshold": true, - "dataLinks": [] + "barRadius": 0.0, + "barWidth": 0.97, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "orientation": "auto", + "showValue": "true", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 }, - "percentage": false, - 
"pointradius": 5, - "points": false, - "renderer": "flot", "repeat": null, "repeatDirection": null, - "seriesOverrides": [], "span": 4, - "stack": false, - "steppedLine": false, "targets": [ { "datasource": null, - "expr": "(100* sum(rate(container_cpu_usage_seconds_total{image!=\"\"}[2m]) * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=\"\",namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name) / sum(kube_pod_container_resource_limits_cpu_cores{container!=\"\"} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=\"\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name)) > 0", + "expr": "(max(container_cpu_usage_seconds_total * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0", "format": "time_series", "hide": false, "instant": false, @@ -1551,55 +1380,18 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "(100* sum(rate(container_cpu_usage_seconds_total{image!=\"\"}[2m]) * on(pod) group_left(label_execution_id, label_task_name, label_node_id, 
label_workflow_name) kube_pod_labels{label_execution_id !=\"\",namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name) / sum(kube_pod_container_resource_limits_cpu_cores{container!=\"\"} * on(pod) group_left(label_execution_id, label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_execution_id !=\"\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_execution_id, label_task_name, label_node_id, label_workflow_name)) > 0", + "query": "(max(container_cpu_usage_seconds_total * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0", "refId": "A", "step": 10, "target": "" } ], - "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "CPU Usage Percentage", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, + "title": "CPU Usage per Task(%)", "transformations": [], "transparent": false, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - 
"min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": 0 - } + "type": "barchart" } ], "repeat": null, @@ -1624,7 +1416,8 @@ "thresholds": { "mode": "absolute", "steps": [] - } + }, + "unit": "" } }, "fill": 1, @@ -1637,7 +1430,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 13, + "id": 12, "interval": null, "isNew": true, "legend": { @@ -1751,7 +1544,8 @@ "thresholds": { "mode": "absolute", "steps": [] - } + }, + "unit": "" } }, "fill": 1, @@ -1764,7 +1558,7 @@ "gridPos": null, "height": null, "hideTimeOverride": false, - "id": 14, + "id": 13, "interval": null, "isNew": true, "legend": { @@ -1868,7 +1662,7 @@ ], "repeat": null, "showTitle": true, - "title": "Error (System vs user)" + "title": "Error (System vs User)" } ], "schemaVersion": 12, @@ -1971,6 +1765,7 @@ }, "timepicker": { "hidden": false, + "nowDelay": null, "refresh_intervals": [ "5s", "10s", diff --git a/stats/flyteuser.dashboard.py b/stats/flyteuser.dashboard.py index b0d3e84fea..665006b8cf 100644 --- a/stats/flyteuser.dashboard.py +++ b/stats/flyteuser.dashboard.py @@ -33,44 +33,40 @@ def workflow_stats(collapse: bool) -> Row: ), ), Graph( - title="Successful Workflow executions (avg)", + title="Workflow success rate", dataSource=DATASOURCE, targets=[ Target( - expr='avg(flyte:propeller:all:workflow:event_recording:success_duration_ms_count{project=~"$project", domain=~"$domain", wf=~"$workflow"})', + expr='sum(rate(flyte:propeller:all:workflow:success_duration_ms_count{project=~"$project", domain=~"$domain", wf=~"$workflow"}[5m]))', refId='A', ), ], - yAxes=single_y_axis(format=SHORT_FORMAT), + yAxes=single_y_axis(format=OPS_FORMAT), ), Graph( - title="Failed Workflows (avg)", + title="Workflow failure rate", dataSource=DATASOURCE, targets=[ Target( - expr='avg(flyte:propeller:all:workflow:event_recording:failure_duration_ms_count{project=~"$project", domain=~"$domain", wf=~"$workflow"})', + 
expr='sum(rate(flyte:propeller:all:workflow:failure_duration_ms_count{project=~"$project", domain=~"$domain", wf=~"$workflow"}[5m]))', refId='A', ), ], - yAxes=YAxes( - YAxis(format=SHORT_FORMAT), - ), + yAxes=single_y_axis(format=OPS_FORMAT), ), Graph( title="Aborted Workflows (avg)", dataSource=DATASOURCE, targets=[ Target( - expr='avg(flyte:propeller:all:workflow:workflow_aborted{project=~"$project", domain=~"$domain", wf=~"$workflow"})', + expr='avg_over_time(flyte:propeller:all:workflow:workflow_aborted{project=~"$project", domain=~"$domain", wf=~"$workflow"}[5m])', refId='A', ), ], - yAxes=YAxes( - YAxis(format=SHORT_FORMAT), - ), + yAxes=single_y_axis(format=SHORT_FORMAT), ), BarGauge( - title="Successful wf execution duration by quantile (s)", + title="Successful wf execution duration by quantile", dataSource=DATASOURCE, targets=[ Target( @@ -82,11 +78,11 @@ def workflow_stats(collapse: bool) -> Row: format=SECONDS_FORMAT, ), BarGauge( - title="Failed workflow execution time by Quantile", + title="Failed wf execution duration by quantile", dataSource=DATASOURCE, targets=[ Target( - expr='avg(flyte:propeller:all:workflow:failure_duration_ms{project=~"$project", domain=~"$domain", wf=~"$workflow"}) by (quantile)', + expr='avg((flyte:propeller:all:workflow:failure_duration_ms{project=~"$project", domain=~"$domain", wf=~"$workflow"})/1000) by(quantile)', refId='A', ), ], From 12ec36f6152a290ba89b08aa272d753f7dfebbf3 Mon Sep 17 00:00:00 2001 From: davidmirror-ops Date: Fri, 20 Sep 2024 16:03:28 -0500 Subject: [PATCH 13/14] Remove stale file Signed-off-by: davidmirror-ops --- .../stats/prometheus/flyteuser-dashboard.json | 8 +- .../stats/prometheus/flyteuser.dashboard.json | 1797 ----------------- stats/flyteuser.dashboard.py | 2 +- 3 files changed, 5 insertions(+), 1802 deletions(-) delete mode 100644 deployment/stats/prometheus/flyteuser.dashboard.json diff --git a/deployment/stats/prometheus/flyteuser-dashboard.json 
b/deployment/stats/prometheus/flyteuser-dashboard.json index 210fef8369..bf51c25418 100644 --- a/deployment/stats/prometheus/flyteuser-dashboard.json +++ b/deployment/stats/prometheus/flyteuser-dashboard.json @@ -626,7 +626,7 @@ "targets": [ { "datasource": null, - "expr": "sum(flyte:propeller:all:workflow:success_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by(quantile)", + "expr": "avg((flyte:propeller:all:workflow:success_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})/1000) by(quantile)", "format": "time_series", "hide": false, "instant": false, @@ -634,7 +634,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "sum(flyte:propeller:all:workflow:success_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by(quantile)", + "query": "avg((flyte:propeller:all:workflow:success_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})/1000) by(quantile)", "refId": "A", "step": 10, "target": "" @@ -1270,7 +1270,7 @@ "targets": [ { "datasource": null, - "expr": "(100 * max(container_memory_working_set_bytes * on(pod) group_left( label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0 ", + "expr": "(100 * max(container_memory_working_set_bytes * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=\"$project-$domain\",label_workflow_name=\"$workflow\"} * on(pod) 
group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0 ", "format": "time_series", "hide": false, "instant": false, @@ -1278,7 +1278,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "(100 * max(container_memory_working_set_bytes * on(pod) group_left( label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0 ", + "query": "(100 * max(container_memory_working_set_bytes * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=\"$project-$domain\",label_workflow_name=\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0 ", "refId": 
"A", "step": 10, "target": "" diff --git a/deployment/stats/prometheus/flyteuser.dashboard.json b/deployment/stats/prometheus/flyteuser.dashboard.json deleted file mode 100644 index c38f12edab..0000000000 --- a/deployment/stats/prometheus/flyteuser.dashboard.json +++ /dev/null @@ -1,1797 +0,0 @@ -{ - "__inputs": [ - { - "description": "Prometheus server that connects to Flyte", - "label": "Prometheus", - "name": "DS_PROM", - "pluginId": "prometheus", - "pluginName": "Prometheus", - "type": "datasource" - } - ], - "annotations": { - "list": [] - }, - "description": "Flyte User Dashboard. It's designed to give an overview of execution status and resource consumption.", - "editable": false, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [], - "panels": [], - "refresh": "10s", - "rows": [ - { - "collapse": false, - "editable": true, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "cacheTimeout": null, - "datasource": "${DS_PROM}", - "description": null, - "editable": true, - "error": false, - "fieldConfig": { - "defaults": { - "thresholds": { - "mode": "absolute", - "steps": [] - }, - "unit": "" - } - }, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "gridPos": null, - "height": null, - "hideTimeOverride": false, - "id": 1, - "interval": null, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "sort": null, - "sortDesc": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "maxDataPoints": 100, - "maxPerRow": null, - "minSpan": null, - "nullPointMode": "connected", - "options": { - "alertThreshold": true, - "dataLinks": [] - }, - "percentage": false, - 
"pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "repeatDirection": null, - "seriesOverrides": [], - "span": 2, - "stack": false, - "steppedLine": false, - "targets": [ - { - "datasource": null, - "expr": "avg(flyte:propeller:all:workflow:accepted{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "", - "metric": "", - "query": "avg(flyte:propeller:all:workflow:accepted{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})", - "refId": "A", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Accepted Workflows (avg)", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "transformations": [], - "transparent": false, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": 0 - } - }, - { - "aliasColors": {}, - "bars": false, - "cacheTimeout": null, - "datasource": "${DS_PROM}", - "description": null, - "editable": true, - "error": false, - "fieldConfig": { - "defaults": { - "thresholds": { - "mode": "absolute", - "steps": [] - }, - "unit": "" - } - }, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "gridPos": null, - "height": null, - "hideTimeOverride": false, - "id": 2, - "interval": null, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": 
false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "sort": null, - "sortDesc": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "maxDataPoints": 100, - "maxPerRow": null, - "minSpan": null, - "nullPointMode": "connected", - "options": { - "alertThreshold": true, - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "repeatDirection": null, - "seriesOverrides": [], - "span": 2, - "stack": false, - "steppedLine": false, - "targets": [ - { - "datasource": null, - "expr": "avg(flyte:propeller:all:workflow:event_recording:success_duration_ms_count{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "", - "metric": "", - "query": "avg(flyte:propeller:all:workflow:event_recording:success_duration_ms_count{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})", - "refId": "A", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Successful Workflow executions (avg)", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "transformations": [], - "transparent": false, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": 0 - } - }, - { - "aliasColors": {}, - "bars": false, - "cacheTimeout": null, - "datasource": "${DS_PROM}", - "description": null, - "editable": true, - 
"error": false, - "fieldConfig": { - "defaults": { - "thresholds": { - "mode": "absolute", - "steps": [] - }, - "unit": "" - } - }, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "gridPos": null, - "height": null, - "hideTimeOverride": false, - "id": 3, - "interval": null, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "sort": null, - "sortDesc": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "maxDataPoints": 100, - "maxPerRow": null, - "minSpan": null, - "nullPointMode": "connected", - "options": { - "alertThreshold": true, - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "repeatDirection": null, - "seriesOverrides": [], - "span": 2, - "stack": false, - "steppedLine": false, - "targets": [ - { - "datasource": null, - "expr": "avg(flyte:propeller:all:workflow:event_recording:failure_duration_ms_count{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "", - "metric": "", - "query": "avg(flyte:propeller:all:workflow:event_recording:failure_duration_ms_count{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})", - "refId": "A", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Failed Workflows (avg)", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "transformations": [], - "transparent": false, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, 
- "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": 0 - } - }, - { - "aliasColors": {}, - "bars": false, - "cacheTimeout": null, - "datasource": "${DS_PROM}", - "description": null, - "editable": true, - "error": false, - "fieldConfig": { - "defaults": { - "thresholds": { - "mode": "absolute", - "steps": [] - }, - "unit": "" - } - }, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "gridPos": null, - "height": null, - "hideTimeOverride": false, - "id": 4, - "interval": null, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "sort": null, - "sortDesc": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "maxDataPoints": 100, - "maxPerRow": null, - "minSpan": null, - "nullPointMode": "connected", - "options": { - "alertThreshold": true, - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "repeatDirection": null, - "seriesOverrides": [], - "span": 2, - "stack": false, - "steppedLine": false, - "targets": [ - { - "datasource": null, - "expr": "avg(flyte:propeller:all:workflow:workflow_aborted{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "", - "metric": "", - "query": 
"avg(flyte:propeller:all:workflow:workflow_aborted{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})", - "refId": "A", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Aborted Workflows (avg)", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "transformations": [], - "transparent": false, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": 0 - } - }, - { - "cacheTimeout": null, - "datasource": "${DS_PROM}", - "description": null, - "editable": true, - "error": false, - "fieldConfig": { - "defaults": { - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "index": 0, - "line": true, - "op": "gt", - "value": "null", - "yaxis": "left" - }, - { - "color": "red", - "index": 1, - "line": true, - "op": "gt", - "value": 80.0, - "yaxis": "left" - } - ] - } - } - }, - "gridPos": null, - "height": null, - "hideTimeOverride": false, - "id": 5, - "interval": null, - "links": [], - "maxDataPoints": 100, - "maxPerRow": null, - "minSpan": null, - "options": { - "displayMode": "lcd", - "fieldOptions": { - "calcs": [ - "mean" - ], - "defaults": { - "decimals": null, - "links": [], - "max": 100, - "min": 0, - "title": null, - "unit": "s" - }, - "limit": null, - "mappings": [], - "override": {}, - "thresholds": [ - { - "color": "green", - "index": 0, - "line": true, - "op": "gt", - "value": "null", - "yaxis": "left" - }, - { - "color": "red", - "index": 1, - "line": true, - "op": "gt", - "value": 80.0, - "yaxis": "left" - } - ], - "values": false - }, - 
"orientation": "horizontal", - "showThresholdLabels": false, - "showThresholdMarkers": true - }, - "repeat": null, - "repeatDirection": null, - "span": 2, - "targets": [ - { - "datasource": null, - "expr": "avg((flyte:propeller:all:workflow:success_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})/1000) by(quantile)", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "", - "metric": "", - "query": "avg((flyte:propeller:all:workflow:success_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})/1000) by(quantile)", - "refId": "A", - "step": 10, - "target": "" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Successful wf execution duration by quantile (s)", - "transformations": [], - "transparent": false, - "type": "bargauge" - }, - { - "cacheTimeout": null, - "datasource": "${DS_PROM}", - "description": null, - "editable": true, - "error": false, - "fieldConfig": { - "defaults": { - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "index": 0, - "line": true, - "op": "gt", - "value": "null", - "yaxis": "left" - }, - { - "color": "red", - "index": 1, - "line": true, - "op": "gt", - "value": 80.0, - "yaxis": "left" - } - ] - } - } - }, - "gridPos": null, - "height": null, - "hideTimeOverride": false, - "id": 6, - "interval": null, - "links": [], - "maxDataPoints": 100, - "maxPerRow": null, - "minSpan": null, - "options": { - "displayMode": "lcd", - "fieldOptions": { - "calcs": [ - "mean" - ], - "defaults": { - "decimals": null, - "links": [], - "max": 100, - "min": 0, - "title": null, - "unit": "s" - }, - "limit": null, - "mappings": [], - "override": {}, - "thresholds": [ - { - "color": "green", - "index": 0, - "line": true, - "op": "gt", - "value": "null", - "yaxis": "left" - }, - { - "color": "red", - "index": 1, - "line": true, - "op": "gt", - "value": 80.0, - "yaxis": "left" - } - ], - "values": false - 
}, - "orientation": "horizontal", - "showThresholdLabels": false, - "showThresholdMarkers": true - }, - "repeat": null, - "repeatDirection": null, - "span": 2, - "targets": [ - { - "datasource": null, - "expr": "avg(flyte:propeller:all:workflow:failure_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by (quantile)", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "", - "metric": "", - "query": "avg(flyte:propeller:all:workflow:failure_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by (quantile)", - "refId": "A", - "step": 10, - "target": "" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Failed workflow execution time by Quantile", - "transformations": [], - "transparent": false, - "type": "bargauge" - } - ], - "repeat": null, - "showTitle": true, - "title": "Workflow Stats" - }, - { - "collapse": true, - "editable": true, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "cacheTimeout": null, - "datasource": "${DS_PROM}", - "description": null, - "editable": true, - "error": false, - "fieldConfig": { - "defaults": { - "thresholds": { - "mode": "absolute", - "steps": [] - }, - "unit": "" - } - }, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "gridPos": null, - "height": null, - "hideTimeOverride": false, - "id": 7, - "interval": null, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "sort": null, - "sortDesc": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "maxDataPoints": 100, - "maxPerRow": null, - "minSpan": null, - "nullPointMode": 
"connected", - "options": { - "alertThreshold": true, - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "repeatDirection": null, - "seriesOverrides": [], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "datasource": null, - "expr": "kube_resourcequota{resource=\"limits.cpu\", namespace=\"$project-$domain\", type=\"hard\"}", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "CPU limit", - "metric": "", - "query": "kube_resourcequota{resource=\"limits.cpu\", namespace=\"$project-$domain\", type=\"hard\"}", - "refId": "A", - "step": 10, - "target": "" - }, - { - "datasource": null, - "expr": "kube_resourcequota{resource=\"limits.cpu\", namespace=\"$project-$domain\", type=\"used\"}", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "CPU requested", - "metric": "", - "query": "kube_resourcequota{resource=\"limits.cpu\", namespace=\"$project-$domain\", type=\"used\"}", - "refId": "B", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "CPU Limit vs requested by namespace", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "transformations": [], - "transparent": false, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": 0 - } - }, - { - "aliasColors": {}, - "bars": false, - "cacheTimeout": null, - "datasource": "${DS_PROM}", - "description": null, 
- "editable": true, - "error": false, - "fieldConfig": { - "defaults": { - "thresholds": { - "mode": "absolute", - "steps": [] - }, - "unit": "" - } - }, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "gridPos": null, - "height": null, - "hideTimeOverride": false, - "id": 8, - "interval": null, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "sort": null, - "sortDesc": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "maxDataPoints": 100, - "maxPerRow": null, - "minSpan": null, - "nullPointMode": "connected", - "options": { - "alertThreshold": true, - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "repeatDirection": null, - "seriesOverrides": [], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "datasource": null, - "expr": "(kube_resourcequota{resource=\"limits.memory\", namespace=\"$project-$domain\", type=\"hard\"})*9.5367e-7", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "Memory limit (MiB)", - "metric": "", - "query": "(kube_resourcequota{resource=\"limits.memory\", namespace=\"$project-$domain\", type=\"hard\"})*9.5367e-7", - "refId": "A", - "step": 10, - "target": "" - }, - { - "datasource": null, - "expr": "(kube_resourcequota{resource=\"limits.memory\", namespace=\"$project-$domain\", type=\"used\"})*9.5367e-7", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "Memory requested (MiB)", - "metric": "", - "query": 
"(kube_resourcequota{resource=\"limits.memory\", namespace=\"$project-$domain\", type=\"used\"})*9.5367e-7", - "refId": "B", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Memory limit vs requested by namespace (MiB)", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "transformations": [], - "transparent": false, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": 0 - } - } - ], - "repeat": null, - "showTitle": true, - "title": "Kubernetes Resource Quota Usage" - }, - { - "collapse": true, - "editable": true, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "cacheTimeout": null, - "datasource": "${DS_PROM}", - "description": null, - "editable": true, - "error": false, - "fieldConfig": { - "defaults": { - "thresholds": { - "mode": "absolute", - "steps": [] - }, - "unit": "" - } - }, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "gridPos": null, - "height": null, - "hideTimeOverride": false, - "id": 9, - "interval": null, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "sort": null, - "sortDesc": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "maxDataPoints": 100, - "maxPerRow": null, - "minSpan": 
null, - "nullPointMode": "connected", - "options": { - "alertThreshold": true, - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "repeatDirection": null, - "seriesOverrides": [], - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "datasource": null, - "expr": "sum(kube_pod_status_phase{phase=\"Pending\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_workflow_name=~\"$workflow\"}) by (namespace, label_task_name, label_node_id, label_workflow_name) > 0", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "", - "metric": "", - "query": "sum(kube_pod_status_phase{phase=\"Pending\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{label_workflow_name=~\"$workflow\"}) by (namespace, label_task_name, label_node_id, label_workflow_name) > 0", - "refId": "A", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Pending Tasks", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "transformations": [], - "transparent": false, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": 0 - } - }, - { - "cacheTimeout": null, - "datasource": "${DS_PROM}", - "description": null, - "editable": true, - "error": false, - "fieldConfig": { - "defaults": { - "color": { - "fixedColor": "none", - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": 
false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "fillOpacity": 80, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineWidth": 1, - "scaleDistribution": { - "type": "linear" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": null, - "height": null, - "hideTimeOverride": false, - "id": 10, - "interval": null, - "links": [], - "maxDataPoints": 100, - "maxPerRow": null, - "minSpan": null, - "options": { - "barRadius": 0.0, - "barWidth": 0.97, - "groupWidth": 0.7, - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "orientation": "auto", - "showValue": "true", - "stacking": "none", - "tooltip": { - "mode": "single", - "sort": "none" - }, - "xTickLabelRotation": 0, - "xTickLabelSpacing": 0 - }, - "repeat": null, - "repeatDirection": null, - "span": 4, - "targets": [ - { - "datasource": null, - "expr": "(100 * max(container_memory_working_set_bytes * on(pod) group_left( label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0 ", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "", - "metric": "", - "query": "(100 * 
max(container_memory_working_set_bytes * on(pod) group_left( label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0 ", - "refId": "A", - "step": 10, - "target": "" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Usage per Task(%)", - "transformations": [], - "transparent": false, - "type": "barchart" - }, - { - "cacheTimeout": null, - "datasource": "${DS_PROM}", - "description": null, - "editable": true, - "error": false, - "fieldConfig": { - "defaults": { - "color": { - "fixedColor": "none", - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "fillOpacity": 80, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineWidth": 1, - "scaleDistribution": { - "type": "linear" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": null, - "height": null, - "hideTimeOverride": false, - "id": 11, - "interval": null, - "links": [], - "maxDataPoints": 100, - "maxPerRow": null, - "minSpan": null, - "options": { - "barRadius": 0.0, - "barWidth": 0.97, - "groupWidth": 0.7, - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "orientation": "auto", - 
"showValue": "true", - "stacking": "none", - "tooltip": { - "mode": "single", - "sort": "none" - }, - "xTickLabelRotation": 0, - "xTickLabelSpacing": 0 - }, - "repeat": null, - "repeatDirection": null, - "span": 4, - "targets": [ - { - "datasource": null, - "expr": "(max(container_cpu_usage_seconds_total * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "", - "metric": "", - "query": "(max(container_cpu_usage_seconds_total * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0", - "refId": "A", - "step": 10, - "target": "" - } - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Usage per Task(%)", - "transformations": [], - "transparent": false, - "type": "barchart" - } - ], - "repeat": null, - "showTitle": true, - "title": "Task 
stats" - }, - { - "collapse": true, - "editable": true, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "cacheTimeout": null, - "datasource": "${DS_PROM}", - "description": null, - "editable": true, - "error": false, - "fieldConfig": { - "defaults": { - "thresholds": { - "mode": "absolute", - "steps": [] - }, - "unit": "" - } - }, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "gridPos": null, - "height": null, - "hideTimeOverride": false, - "id": 12, - "interval": null, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "sort": null, - "sortDesc": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "maxDataPoints": 100, - "maxPerRow": null, - "minSpan": null, - "nullPointMode": "connected", - "options": { - "alertThreshold": true, - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "repeatDirection": null, - "seriesOverrides": [], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "datasource": null, - "expr": "sum(rate(flyte:propeller:all:node:user_error_duration_ms_count{project=~\"$project\",domain=~\"$domain\",wf=~\"$project:$domain:$workflow\"}[5m]))", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "", - "metric": "", - "query": "sum(rate(flyte:propeller:all:node:user_error_duration_ms_count{project=~\"$project\",domain=~\"$domain\",wf=~\"$project:$domain:$workflow\"}[5m]))", - "refId": "A", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "User 
errors", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "transformations": [], - "transparent": false, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": 0 - } - }, - { - "aliasColors": {}, - "bars": false, - "cacheTimeout": null, - "datasource": "${DS_PROM}", - "description": null, - "editable": true, - "error": false, - "fieldConfig": { - "defaults": { - "thresholds": { - "mode": "absolute", - "steps": [] - }, - "unit": "" - } - }, - "fill": 1, - "grid": { - "threshold1": null, - "threshold1Color": "rgba(216, 200, 27, 0.27)", - "threshold2": null, - "threshold2Color": "rgba(234, 112, 112, 0.22)" - }, - "gridPos": null, - "height": null, - "hideTimeOverride": false, - "id": 13, - "interval": null, - "isNew": true, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "hideZero": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": null, - "sort": null, - "sortDesc": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "maxDataPoints": 100, - "maxPerRow": null, - "minSpan": null, - "nullPointMode": "connected", - "options": { - "alertThreshold": true, - "dataLinks": [] - }, - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "repeatDirection": null, - "seriesOverrides": [], - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "datasource": null, - "expr": 
"sum(rate(flyte:propeller:all:node:system_error_duration_ms_count{project=~\"$project\",domain=~\"$domain\",wf=~\"$project:$domain:$workflow\"}[5m]))", - "format": "time_series", - "hide": false, - "instant": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "", - "metric": "", - "query": "sum(rate(flyte:propeller:all:node:system_error_duration_ms_count{project=~\"$project\",domain=~\"$domain\",wf=~\"$project:$domain:$workflow\"}[5m]))", - "refId": "A", - "step": 10, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "System errors", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "transformations": [], - "transparent": false, - "type": "graph", - "xaxis": { - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "decimals": null, - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": 0 - } - } - ], - "repeat": null, - "showTitle": true, - "title": "Error (System vs User)" - } - ], - "schemaVersion": 12, - "sharedCrosshair": false, - "style": "dark", - "tags": [ - "flyte", - "prometheus", - "flyteuser", - "flyte-user" - ], - "templating": { - "list": [ - { - "allValue": null, - "auto": false, - "auto_count": 30, - "auto_min": "10s", - "current": { - "selected": false, - "tags": [], - "text": null, - "value": null - }, - "datasource": "${DS_PROM}", - "hide": 0, - "includeAll": false, - "label": null, - "multi": false, - "name": "project", - "options": [], - "query": "label_values(flyte:propeller:all:collector:flyteworkflow, project)", - "refresh": 1, - "regex": null, - "sort": true, - "tagValuesQuery": null, - "tagsQuery": null, - "type": "query", - "useTags": false - }, - { - "allValue": 
null, - "auto": false, - "auto_count": 30, - "auto_min": "10s", - "current": { - "selected": false, - "tags": [], - "text": null, - "value": null - }, - "datasource": "${DS_PROM}", - "hide": 0, - "includeAll": false, - "label": null, - "multi": false, - "name": "domain", - "options": [], - "query": "label_values(flyte:propeller:all:collector:flyteworkflow, domain)", - "refresh": 1, - "regex": null, - "sort": true, - "tagValuesQuery": null, - "tagsQuery": null, - "type": "query", - "useTags": false - }, - { - "allValue": ".*", - "auto": false, - "auto_count": 30, - "auto_min": "10s", - "current": { - "selected": false, - "tags": [], - "text": null, - "value": null - }, - "datasource": "${DS_PROM}", - "hide": 0, - "includeAll": true, - "label": null, - "multi": false, - "name": "workflow", - "options": [], - "query": "label_values(flyte:propeller:all:collector:flyteworkflow, wf)", - "refresh": 1, - "regex": null, - "sort": true, - "tagValuesQuery": null, - "tagsQuery": null, - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-1h", - "to": "now" - }, - "timepicker": { - "hidden": false, - "nowDelay": null, - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "utc", - "title": "Flyte User Dashboard (via Prometheus)", - "uid": null, - "version": 0 -} diff --git a/stats/flyteuser.dashboard.py b/stats/flyteuser.dashboard.py index 665006b8cf..a4f23ddda6 100644 --- a/stats/flyteuser.dashboard.py +++ b/stats/flyteuser.dashboard.py @@ -156,7 +156,7 @@ def resource_stats(collapse: bool) -> Row: dataSource=DATASOURCE, targets=[ Target( - expr='(100 * max(container_memory_working_set_bytes * on(pod) group_left( label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~"$project-$domain",label_workflow_name=~"$workflow"} * on(pod) group_left(phase) 
kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{container!=""} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0 ', + expr='(100 * max(container_memory_working_set_bytes * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace="$project-$domain",label_workflow_name="$workflow"} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{container!=""} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0 ', refId='A', ), From 81ef23ce3bf24f1642eef7a322b0e52b8bdc0b06 Mon Sep 17 00:00:00 2001 From: davidmirror-ops Date: Tue, 24 Sep 2024 12:22:44 -0500 Subject: [PATCH 14/14] Adapt CPU/mem metrics Signed-off-by: davidmirror-ops --- .../stats/prometheus/flyteuser-dashboard.json | 20 +++++++++---------- stats/flyteuser.dashboard.py | 17 ++++++++-------- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/deployment/stats/prometheus/flyteuser-dashboard.json b/deployment/stats/prometheus/flyteuser-dashboard.json index bf51c25418..36eb2bb7bf 100644 --- a/deployment/stats/prometheus/flyteuser-dashboard.json +++ b/deployment/stats/prometheus/flyteuser-dashboard.json @@ -626,7 +626,7 @@ "targets": [ { "datasource": null, - "expr": "avg((flyte:propeller:all:workflow:success_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})/1000) by(quantile)", 
+ "expr": "(avg(flyte:propeller:all:workflow:success_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by(quantile))/1000", "format": "time_series", "hide": false, "instant": false, @@ -634,7 +634,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "avg((flyte:propeller:all:workflow:success_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})/1000) by(quantile)", + "query": "(avg(flyte:propeller:all:workflow:success_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by(quantile))/1000", "refId": "A", "step": 10, "target": "" @@ -734,7 +734,7 @@ "targets": [ { "datasource": null, - "expr": "avg((flyte:propeller:all:workflow:failure_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})/1000) by(quantile)", + "expr": "(avg(flyte:propeller:all:workflow:failure_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by(quantile))/1000", "format": "time_series", "hide": false, "instant": false, @@ -742,7 +742,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "avg((flyte:propeller:all:workflow:failure_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"})/1000) by(quantile)", + "query": "(avg(flyte:propeller:all:workflow:failure_duration_ms{project=~\"$project\", domain=~\"$domain\", wf=~\"$workflow\"}) by(quantile))/1000", "refId": "A", "step": 10, "target": "" @@ -1270,7 +1270,7 @@ "targets": [ { "datasource": null, - "expr": "(100 * max(container_memory_working_set_bytes * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=\"$project-$domain\",label_workflow_name=\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{container!=\"\"} * on(pod) group_left(label_task_name, 
label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0 ", + "expr": "(100 * (max(container_memory_working_set_bytes{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name))) > 0", "format": "time_series", "hide": false, "instant": false, @@ -1278,7 +1278,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "(100 * max(container_memory_working_set_bytes * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=\"$project-$domain\",label_workflow_name=\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0 ", + "query": "(100 * (max(container_memory_working_set_bytes{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) 
group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name))) > 0", "refId": "A", "step": 10, "target": "" @@ -1372,7 +1372,7 @@ "targets": [ { "datasource": null, - "expr": "(max(container_cpu_usage_seconds_total * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0", + "expr": "(100 * (sum(rate(container_cpu_usage_seconds_total{image!=\"\"}[2m]) * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name))) > 0", "format": "time_series", "hide": false, "instant": false, 
@@ -1380,7 +1380,7 @@ "intervalFactor": 2, "legendFormat": "", "metric": "", - "query": "(max(container_cpu_usage_seconds_total * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0", + "query": "(100 * (sum(rate(container_cpu_usage_seconds_total{image!=\"\"}[2m]) * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~\"$project-$domain\",label_workflow_name=~\"$workflow\"} * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{container!=\"\"} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase=\"Running\"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name))) > 0", "refId": "A", "step": 10, "target": "" @@ -1490,7 +1490,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "User errors", + "title": "User error rate", "tooltip": { "msResolution": true, "shared": true, @@ -1618,7 +1618,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "System errors", + "title": "System error rate", "tooltip": { "msResolution": true, "shared": true, diff --git a/stats/flyteuser.dashboard.py b/stats/flyteuser.dashboard.py index a4f23ddda6..763362f004 100644 --- 
a/stats/flyteuser.dashboard.py +++ b/stats/flyteuser.dashboard.py @@ -70,7 +70,7 @@ def workflow_stats(collapse: bool) -> Row: dataSource=DATASOURCE, targets=[ Target( - expr='avg((flyte:propeller:all:workflow:success_duration_ms{project=~"$project", domain=~"$domain", wf=~"$workflow"})/1000) by(quantile)', + expr='(avg(flyte:propeller:all:workflow:success_duration_ms{project=~"$project", domain=~"$domain", wf=~"$workflow"}) by(quantile))/1000', refId='A', ), ], @@ -82,7 +82,7 @@ def workflow_stats(collapse: bool) -> Row: dataSource=DATASOURCE, targets=[ Target( - expr='avg((flyte:propeller:all:workflow:failure_duration_ms{project=~"$project", domain=~"$domain", wf=~"$workflow"})/1000) by(quantile)', + expr='(avg(flyte:propeller:all:workflow:failure_duration_ms{project=~"$project", domain=~"$domain", wf=~"$workflow"}) by(quantile))/1000', refId='A', ), ], @@ -156,19 +156,18 @@ def resource_stats(collapse: bool) -> Row: dataSource=DATASOURCE, targets=[ Target( - expr='(100 * max(container_memory_working_set_bytes * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace="$project-$domain",label_workflow_name="$workflow"} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{container!=""} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0 ', + expr='(100 * (max(container_memory_working_set_bytes{container!=""} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~"$project-$domain",label_workflow_name=~"$workflow"} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, 
label_workflow_name) / max(cluster:namespace:pod_memory:active:kube_pod_container_resource_limits{container!=""} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name))) > 0', refId='A', ), - ], - showValue='true', - ), + showValue='true', + ), BarChart( title="CPU Usage per Task(%)", dataSource=DATASOURCE, targets=[ Target( - expr='(max(container_cpu_usage_seconds_total * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~"$project-$domain",label_workflow_name=~"$workflow"} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / max(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name)) > 0', + expr='(100 * (sum(rate(container_cpu_usage_seconds_total{image!=""}[2m]) * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels{namespace=~"$project-$domain",label_workflow_name=~"$workflow"} * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name) / sum(cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits{container!=""} * on(pod) group_left(label_task_name, label_node_id, label_workflow_name) kube_pod_labels * on(pod) group_left(phase) kube_pod_status_phase{phase="Running"}) by (namespace, pod, label_task_name, label_node_id, label_workflow_name))) > 0', refId='A', ), ], @@ -183,7 +182,7 @@ def errors(collapse: bool) -> Row: collapse=collapse, panels=[ Graph( - title="User errors", + title="User 
error rate", dataSource=DATASOURCE, targets=[ Target( @@ -194,7 +193,7 @@ def errors(collapse: bool) -> Row: yAxes=single_y_axis(format=SHORT_FORMAT), ), Graph( - title="System errors", + title="System error rate", dataSource=DATASOURCE, targets=[ Target(