From 3ff0b1d8c4a4f43432c90e0e38269bb44e10dc2c Mon Sep 17 00:00:00 2001 From: John Slivka Date: Mon, 12 Feb 2024 15:12:25 -0600 Subject: [PATCH] add datadog dashboards for monitoring Fairwinds insights-agent (#871) --- .../cronjob_duration_15m_monitor.json | 21 ++ .../cronjob_duration_1h_monitor.json | 21 ++ .../cronjob_oom_events_monitor.json | 21 ++ .../datadog/insights-agent/dashboard.json | 322 ++++++++++++++++++ 4 files changed, 385 insertions(+) create mode 100644 monitoring/dashboards/datadog/insights-agent/cronjob_duration_15m_monitor.json create mode 100644 monitoring/dashboards/datadog/insights-agent/cronjob_duration_1h_monitor.json create mode 100644 monitoring/dashboards/datadog/insights-agent/cronjob_oom_events_monitor.json create mode 100644 monitoring/dashboards/datadog/insights-agent/dashboard.json diff --git a/monitoring/dashboards/datadog/insights-agent/cronjob_duration_15m_monitor.json b/monitoring/dashboards/datadog/insights-agent/cronjob_duration_15m_monitor.json new file mode 100644 index 000000000..0c512c7ef --- /dev/null +++ b/monitoring/dashboards/datadog/insights-agent/cronjob_duration_15m_monitor.json @@ -0,0 +1,21 @@ +{ + "id": 138653336, + "name": "[fairwinds/insights-agent] CronJob duration since last schedule exceeded (15m)", + "type": "query alert", + "query": "max(last_5m):avg:kubernetes_state.cronjob.duration_since_last_schedule{kube_namespace:insights-agent AND kube_cronjob IN (prometheus-metrics, workloads)} by {cluster_name,kube_namespace,kube_cronjob} > 1200", + "message": "{{#is_alert}} CronJob has exceeded (> {{threshold}} ). CronJob name: {{kube_cronjob.name}} {{/is_alert}}\n\n{{#is_warning}} CronJob has exceeded (> {{threshold}} ). CronJob name: {{kube_cronjob.name}} {{/is_warning}}", + "tags": [], + "options": { + "thresholds": { + "critical": 1200, + "warning": 960 + }, + "notify_audit": false, + "include_tags": true, + "new_group_delay": 60, + "notify_no_data": false, + "silenced": {} + }, + "priority": null, + "restricted_roles": null +} \ No newline at end of file diff --git a/monitoring/dashboards/datadog/insights-agent/cronjob_duration_1h_monitor.json b/monitoring/dashboards/datadog/insights-agent/cronjob_duration_1h_monitor.json new file mode 100644 index 000000000..6759c5cba --- /dev/null +++ b/monitoring/dashboards/datadog/insights-agent/cronjob_duration_1h_monitor.json @@ -0,0 +1,21 @@ +{ + "id": 138652989, + "name": "[fairwinds/insights-agent] CronJob duration since last schedule exceeded (1h)", + "type": "query alert", + "query": "max(last_5m):avg:kubernetes_state.cronjob.duration_since_last_schedule{kube_namespace:insights-agent AND kube_cronjob NOT IN (prometheus-metrics, workloads)} by {cluster_name,kube_namespace,kube_cronjob} > 3900", + "message": "{{#is_alert}} CronJob has exceeded (> {{threshold}} ). CronJob name: {{kube_cronjob.name}} {{/is_alert}} \n\n{{#is_warning}} CronJob has exceeded (> {{threshold}} ). CronJob name: {{kube_cronjob.name}} {{/is_warning}}", + "tags": [], + "options": { + "thresholds": { + "critical": 3900, + "warning": 3660 + }, + "notify_audit": false, + "include_tags": true, + "new_group_delay": 60, + "notify_no_data": false, + "silenced": {} + }, + "priority": null, + "restricted_roles": null +} \ No newline at end of file diff --git a/monitoring/dashboards/datadog/insights-agent/cronjob_oom_events_monitor.json b/monitoring/dashboards/datadog/insights-agent/cronjob_oom_events_monitor.json new file mode 100644 index 000000000..a037cc92e --- /dev/null +++ b/monitoring/dashboards/datadog/insights-agent/cronjob_oom_events_monitor.json @@ -0,0 +1,21 @@ +{ + "id": 138653777, + "name": "[fairwinds/insights-agent] CronJob OOM events", + "type": "query alert", + "query": "max(last_5m):container.memory.oom_events{cluster_name:production-eks AND kube_namespace:insights-agent} by {kube_cronjob} > 4", + "message": "{{#is_alert}} CronJob has OOM events. CronJob name: {{kube_cronjob.name}} {{/is_alert}}\n\n{{#is_warning}} CronJob has OOM events. CronJob name: {{kube_cronjob.name}} {{/is_warning}}", + "tags": [], + "options": { + "thresholds": { + "critical": 4, + "warning": 3 + }, + "notify_audit": false, + "include_tags": true, + "new_group_delay": 60, + "notify_no_data": false, + "silenced": {} + }, + "priority": null, + "restricted_roles": null +} \ No newline at end of file diff --git a/monitoring/dashboards/datadog/insights-agent/dashboard.json b/monitoring/dashboards/datadog/insights-agent/dashboard.json new file mode 100644 index 000000000..5aa79cc91 --- /dev/null +++ b/monitoring/dashboards/datadog/insights-agent/dashboard.json @@ -0,0 +1,322 @@ +{ + "title": "[fairwinds/insights-agent] Kubernetes CronJob Dashboard", + "description": null, + "widgets": [ + { + "id": 1732392467968311, + "definition": { + "type": "free_text", + "text": "Fairwinds Insights Agent", + "color": "#6915F0", + "font_size": "56", + "text_align": "center" + }, + "layout": { + "x": 0, + "y": 0, + "width": 12, + "height": 1 + } + }, + { + "id": 1360739785147291, + "definition": { + "title": "Resource Consumption", + "background_color": "blue", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 4774855640317495, + "definition": { + "title": "CPU usage by CronJob", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "exclude_null(query1)" + } + ], + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:kubernetes.cpu.usage.total{$kube_namespace} by {kube_cluster_name,kube_namespace,kube_cronjob}" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 0, + "y": 0, + "width": 6, + "height": 3 + } + }, + { + "id": 2692162135133172, + "definition": { + "title": "Memory usage by CronJob", + "title_size": "16", + "title_align": "left", + "show_legend": true, + "legend_layout": "auto", + "legend_columns": [ + "avg", + "min", + "max", + "value", + "sum" + ], + "type": "timeseries", + "requests": [ + { + "formulas": [ + { + "formula": "exclude_null(query1)" + } + ], + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "sum:kubernetes.memory.usage{$kube_namespace} by {kube_namespace,kube_cronjob,kube_cluster_name}" + } + ], + "response_format": "timeseries", + "style": { + "palette": "dog_classic", + "line_type": "solid", + "line_width": "normal" + }, + "display_type": "line" + } + ] + }, + "layout": { + "x": 6, + "y": 0, + "width": 6, + "height": 3 + } + }, + { + "id": 4883798007459927, + "definition": { + "title": "CronJob OOM Events", + "title_size": "16", + "title_align": "left", + "type": "alert_graph", + "alert_id": "138653777", + "viz_type": "timeseries" + }, + "layout": { + "x": 0, + "y": 3, + "width": 12, + "height": 3 + } + } + ] + }, + "layout": { + "x": 0, + "y": 1, + "width": 12, + "height": 7 + } + }, + { + "id": 5187018217533442, + "definition": { + "title": "Scheduling", + "background_color": "vivid_green", + "show_title": true, + "type": "group", + "layout_type": "ordered", + "widgets": [ + { + "id": 425294088513441, + "definition": { + "title": "Successful Job Pods (avg)", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "avg:kubernetes_state.job.succeeded{$kube_namespace}", + "aggregator": "avg" + } + ], + "conditional_formats": [ + { + "comparator": ">", + "value": 1, + "palette": "white_on_green" + } + ], + "formulas": [ + { + "formula": "query1" + } + ] + } + ], + "autoscale": true, + "custom_unit": "jobs", + "precision": 2, + "timeseries_background": { + "type": "area", + "yaxis": { + "include_zero": true + } + } + }, + "layout": { + "x": 0, + "y": 0, + "width": 6, + "height": 2 + } + }, + { + "id": 8751120051550263, + "definition": { + "title": "Failed Job Pods (avg)", + "title_size": "16", + "title_align": "left", + "type": "query_value", + "requests": [ + { + "response_format": "scalar", + "queries": [ + { + "name": "query1", + "data_source": "metrics", + "query": "avg:kubernetes_state.job.completion.failed{$kube_namespace}", + "aggregator": "avg" + } + ], + "conditional_formats": [ + { + "comparator": ">", + "value": 1, + "palette": "white_on_red" + } + ], + "formulas": [ + { + "formula": "query1" + } + ] + } + ], + "autoscale": true, + "custom_unit": "jobs", + "precision": 2, + "timeseries_background": { + "type": "area" + } + }, + "layout": { + "x": 6, + "y": 0, + "width": 6, + "height": 2 + } + }, + { + "id": 5670253520315896, + "definition": { + "title": "CronJob duration since last schedule (15m)", + "title_size": "16", + "title_align": "left", + "type": "alert_graph", + "alert_id": "138653336", + "viz_type": "timeseries" + }, + "layout": { + "x": 0, + "y": 2, + "width": 12, + "height": 4 + } + }, + { + "id": 923140578038269, + "definition": { + "title": "CronJob duration since last schedule (1h)", + "title_size": "16", + "title_align": "left", + "type": "alert_graph", + "alert_id": "138652989", + "viz_type": "timeseries" + }, + "layout": { + "x": 0, + "y": 6, + "width": 12, + "height": 4 + } + } + ] + }, + "layout": { + "x": 0, + "y": 8, + "width": 12, + "height": 11, + "is_column_break": true + } + } + ], + "template_variables": [ + { + "name": "kube_cluster_name", + "prefix": "kube_cluster_name", + "available_values": [], + "default": "*" + }, + { + "name": "kube_namespace", + "prefix": "kube_namespace", + "available_values": [], + "default": "insights-agent" + }, + { + "name": "kube_cronjob", + "prefix": "kube_cronjob", + "available_values": [], + "default": "*" + } + ], + "layout_type": "ordered", + "notify_list": [], + "reflow_type": "fixed" +} \ No newline at end of file