-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
add datadog dashboards for monitoring Fairwinds insights-agent (#871)
- Loading branch information
Showing
4 changed files
with
385 additions
and
0 deletions.
There are no files selected for viewing
21 changes: 21 additions & 0 deletions
21
monitoring/dashboards/datadog/insights-agent/cronjob_duration_15m_monitor.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
{ | ||
"id": 138653336, | ||
"name": "[fairwinds/insights-agent] CronJob duration since last schedule exceeded (15m)", | ||
"type": "query alert", | ||
"query": "max(last_5m):avg:kubernetes_state.cronjob.duration_since_last_schedule{kube_namespace:insights-agent AND kube_cronjob IN (prometheus-metrics, workloads)} by {cluster_name,kube_namespace,kube_cronjob} > 1200", | ||
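"message": "{{#is_alert}} CronJob duration since last schedule has exceeded the critical threshold (> {{threshold}} seconds). CronJob name: {{kube_cronjob.name}} {{/is_alert}}\n\n{{#is_warning}} CronJob duration since last schedule has exceeded the warning threshold (> {{warn_threshold}} seconds). CronJob name: {{kube_cronjob.name}} {{/is_warning}}", | ||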
"tags": [], | ||
"options": { | ||
"thresholds": { | ||
"critical": 1200, | ||
"warning": 960 | ||
}, | ||
"notify_audit": false, | ||
"include_tags": true, | ||
"new_group_delay": 60, | ||
"notify_no_data": false, | ||
"silenced": {} | ||
}, | ||
"priority": null, | ||
"restricted_roles": null | ||
} |
21 changes: 21 additions & 0 deletions
21
monitoring/dashboards/datadog/insights-agent/cronjob_duration_1h_monitor.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
{ | ||
"id": 138652989, | ||
"name": "[fairwinds/insights-agent] CronJob duration since last schedule exceeded (1h)", | ||
"type": "query alert", | ||
"query": "max(last_5m):avg:kubernetes_state.cronjob.duration_since_last_schedule{kube_namespace:insights-agent AND kube_cronjob NOT IN (prometheus-metrics, workloads)} by {cluster_name,kube_namespace,kube_cronjob} > 3900", | ||
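"message": "{{#is_alert}} CronJob duration since last schedule has exceeded the critical threshold (> {{threshold}} seconds). CronJob name: {{kube_cronjob.name}} {{/is_alert}} \n\n{{#is_warning}} CronJob duration since last schedule has exceeded the warning threshold (> {{warn_threshold}} seconds). CronJob name: {{kube_cronjob.name}} {{/is_warning}}", | ||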
"tags": [], | ||
"options": { | ||
"thresholds": { | ||
"critical": 3900, | ||
"warning": 3660 | ||
}, | ||
"notify_audit": false, | ||
"include_tags": true, | ||
"new_group_delay": 60, | ||
"notify_no_data": false, | ||
"silenced": {} | ||
}, | ||
"priority": null, | ||
"restricted_roles": null | ||
} |
21 changes: 21 additions & 0 deletions
21
monitoring/dashboards/datadog/insights-agent/cronjob_oom_events_monitor.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
{ | ||
"id": 138653777, | ||
"name": "[fairwinds/insights-agent] CronJob OOM events", | ||
"type": "query alert", | ||
"query": "max(last_5m):container.memory.oom_events{cluster_name:production-eks AND kube_namespace:insights-agent} by {kube_cronjob} > 4", | ||
"message": "{{#is_alert}} CronJob has OOM events. CronJob name: {{kube_cronjob.name}} {{/is_alert}}\n\n{{#is_warning}} CronJob has OOM events. CronJob name: {{kube_cronjob.name}} {{/is_warning}}", | ||
"tags": [], | ||
"options": { | ||
"thresholds": { | ||
"critical": 4, | ||
"warning": 3 | ||
}, | ||
"notify_audit": false, | ||
"include_tags": true, | ||
"new_group_delay": 60, | ||
"notify_no_data": false, | ||
"silenced": {} | ||
}, | ||
"priority": null, | ||
"restricted_roles": null | ||
} |
322 changes: 322 additions & 0 deletions
322
monitoring/dashboards/datadog/insights-agent/dashboard.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,322 @@ | ||
{ | ||
"title": "[fairwinds/insights-agent] Kubernetes CronJob Dashboard", | ||
"description": null, | ||
"widgets": [ | ||
{ | ||
"id": 1732392467968311, | ||
"definition": { | ||
"type": "free_text", | ||
"text": "Fairwinds Insights Agent", | ||
"color": "#6915F0", | ||
"font_size": "56", | ||
"text_align": "center" | ||
}, | ||
"layout": { | ||
"x": 0, | ||
"y": 0, | ||
"width": 12, | ||
"height": 1 | ||
} | ||
}, | ||
{ | ||
"id": 1360739785147291, | ||
"definition": { | ||
"title": "Resource Consumption", | ||
"background_color": "blue", | ||
"show_title": true, | ||
"type": "group", | ||
"layout_type": "ordered", | ||
"widgets": [ | ||
{ | ||
"id": 4774855640317495, | ||
"definition": { | ||
"title": "CPU usage by CronJob", | ||
"title_size": "16", | ||
"title_align": "left", | ||
"show_legend": true, | ||
"legend_layout": "auto", | ||
"legend_columns": [ | ||
"avg", | ||
"min", | ||
"max", | ||
"value", | ||
"sum" | ||
], | ||
"type": "timeseries", | ||
"requests": [ | ||
{ | ||
"formulas": [ | ||
{ | ||
"formula": "exclude_null(query1)" | ||
} | ||
], | ||
"queries": [ | ||
{ | ||
"name": "query1", | ||
"data_source": "metrics", | ||
"query": "sum:kubernetes.cpu.usage.total{$kube_namespace} by {kube_cluster_name,kube_namespace,kube_cronjob}" | ||
} | ||
], | ||
"response_format": "timeseries", | ||
"style": { | ||
"palette": "dog_classic", | ||
"line_type": "solid", | ||
"line_width": "normal" | ||
}, | ||
"display_type": "line" | ||
} | ||
] | ||
}, | ||
"layout": { | ||
"x": 0, | ||
"y": 0, | ||
"width": 6, | ||
"height": 3 | ||
} | ||
}, | ||
{ | ||
"id": 2692162135133172, | ||
"definition": { | ||
"title": "Memory usage by CronJob", | ||
"title_size": "16", | ||
"title_align": "left", | ||
"show_legend": true, | ||
"legend_layout": "auto", | ||
"legend_columns": [ | ||
"avg", | ||
"min", | ||
"max", | ||
"value", | ||
"sum" | ||
], | ||
"type": "timeseries", | ||
"requests": [ | ||
{ | ||
"formulas": [ | ||
{ | ||
"formula": "exclude_null(query1)" | ||
} | ||
], | ||
"queries": [ | ||
{ | ||
"name": "query1", | ||
"data_source": "metrics", | ||
"query": "sum:kubernetes.memory.usage{$kube_namespace} by {kube_namespace,kube_cronjob,kube_cluster_name}" | ||
} | ||
], | ||
"response_format": "timeseries", | ||
"style": { | ||
"palette": "dog_classic", | ||
"line_type": "solid", | ||
"line_width": "normal" | ||
}, | ||
"display_type": "line" | ||
} | ||
] | ||
}, | ||
"layout": { | ||
"x": 6, | ||
"y": 0, | ||
"width": 6, | ||
"height": 3 | ||
} | ||
}, | ||
{ | ||
"id": 4883798007459927, | ||
"definition": { | ||
"title": "CronJob OOM Events", | ||
"title_size": "16", | ||
"title_align": "left", | ||
"type": "alert_graph", | ||
"alert_id": "138653777", | ||
"viz_type": "timeseries" | ||
}, | ||
"layout": { | ||
"x": 0, | ||
"y": 3, | ||
"width": 12, | ||
"height": 3 | ||
} | ||
} | ||
] | ||
}, | ||
"layout": { | ||
"x": 0, | ||
"y": 1, | ||
"width": 12, | ||
"height": 7 | ||
} | ||
}, | ||
{ | ||
"id": 5187018217533442, | ||
"definition": { | ||
"title": "Scheduling", | ||
"background_color": "vivid_green", | ||
"show_title": true, | ||
"type": "group", | ||
"layout_type": "ordered", | ||
"widgets": [ | ||
{ | ||
"id": 425294088513441, | ||
"definition": { | ||
"title": "Successful Job Pods (avg)", | ||
"title_size": "16", | ||
"title_align": "left", | ||
"type": "query_value", | ||
"requests": [ | ||
{ | ||
"response_format": "scalar", | ||
"queries": [ | ||
{ | ||
"name": "query1", | ||
"data_source": "metrics", | ||
"query": "avg:kubernetes_state.job.succeeded{$kube_namespace}", | ||
"aggregator": "avg" | ||
} | ||
], | ||
"conditional_formats": [ | ||
{ | ||
"comparator": ">", | ||
"value": 1, | ||
"palette": "white_on_green" | ||
} | ||
], | ||
"formulas": [ | ||
{ | ||
"formula": "query1" | ||
} | ||
] | ||
} | ||
], | ||
"autoscale": true, | ||
"custom_unit": "jobs", | ||
"precision": 2, | ||
"timeseries_background": { | ||
"type": "area", | ||
"yaxis": { | ||
"include_zero": true | ||
} | ||
} | ||
}, | ||
"layout": { | ||
"x": 0, | ||
"y": 0, | ||
"width": 6, | ||
"height": 2 | ||
} | ||
}, | ||
{ | ||
"id": 8751120051550263, | ||
"definition": { | ||
"title": "Failed Job Pods (avg)", | ||
"title_size": "16", | ||
"title_align": "left", | ||
"type": "query_value", | ||
"requests": [ | ||
{ | ||
"response_format": "scalar", | ||
"queries": [ | ||
{ | ||
"name": "query1", | ||
"data_source": "metrics", | ||
"query": "avg:kubernetes_state.job.completion.failed{$kube_namespace}", | ||
"aggregator": "avg" | ||
} | ||
], | ||
"conditional_formats": [ | ||
{ | ||
"comparator": ">", | ||
"value": 1, | ||
"palette": "white_on_red" | ||
} | ||
], | ||
"formulas": [ | ||
{ | ||
"formula": "query1" | ||
} | ||
] | ||
} | ||
], | ||
"autoscale": true, | ||
"custom_unit": "jobs", | ||
"precision": 2, | ||
"timeseries_background": { | ||
"type": "area" | ||
} | ||
}, | ||
"layout": { | ||
"x": 6, | ||
"y": 0, | ||
"width": 6, | ||
"height": 2 | ||
} | ||
}, | ||
{ | ||
"id": 5670253520315896, | ||
"definition": { | ||
"title": "CronJob duration since last schedule (15m)", | ||
"title_size": "16", | ||
"title_align": "left", | ||
"type": "alert_graph", | ||
"alert_id": "138653336", | ||
"viz_type": "timeseries" | ||
}, | ||
"layout": { | ||
"x": 0, | ||
"y": 2, | ||
"width": 12, | ||
"height": 4 | ||
} | ||
}, | ||
{ | ||
"id": 923140578038269, | ||
"definition": { | ||
"title": "CronJob duration since last schedule (1h)", | ||
"title_size": "16", | ||
"title_align": "left", | ||
"type": "alert_graph", | ||
"alert_id": "138652989", | ||
"viz_type": "timeseries" | ||
}, | ||
"layout": { | ||
"x": 0, | ||
"y": 6, | ||
"width": 12, | ||
"height": 4 | ||
} | ||
} | ||
] | ||
}, | ||
"layout": { | ||
"x": 0, | ||
"y": 8, | ||
"width": 12, | ||
"height": 11, | ||
"is_column_break": true | ||
} | ||
} | ||
], | ||
"template_variables": [ | ||
{ | ||
"name": "kube_cluster_name", | ||
"prefix": "kube_cluster_name", | ||
"available_values": [], | ||
"default": "*" | ||
}, | ||
{ | ||
"name": "kube_namespace", | ||
"prefix": "kube_namespace", | ||
"available_values": [], | ||
"default": "insights-agent" | ||
}, | ||
{ | ||
"name": "kube_cronjob", | ||
"prefix": "kube_cronjob", | ||
"available_values": [], | ||
"default": "*" | ||
} | ||
], | ||
"layout_type": "ordered", | ||
"notify_list": [], | ||
"reflow_type": "fixed" | ||
} |