Skip to content

Commit

Permalink
add datadog dashboards for monitoring Fairwinds insights-agent (#871)
Browse files Browse the repository at this point in the history
  • Loading branch information
jslivka authored Feb 12, 2024
1 parent 76fb32f commit 3ff0b1d
Show file tree
Hide file tree
Showing 4 changed files with 385 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
  "id": 138653336,
  "name": "[fairwinds/insights-agent] CronJob duration since last schedule exceeded (15m)",
  "type": "query alert",
  "query": "max(last_5m):avg:kubernetes_state.cronjob.duration_since_last_schedule{kube_namespace:insights-agent AND kube_cronjob IN (prometheus-metrics, workloads)} by {cluster_name,kube_namespace,kube_cronjob} > 1200",
  "message": "{{#is_alert}} CronJob duration since last schedule has exceeded the critical threshold (> {{threshold}}s). CronJob name: {{kube_cronjob.name}}. Cluster: {{cluster_name.name}} {{/is_alert}}\n\n{{#is_warning}} CronJob duration since last schedule has exceeded the warning threshold (> {{warn_threshold}}s). CronJob name: {{kube_cronjob.name}}. Cluster: {{cluster_name.name}} {{/is_warning}}",
  "tags": [],
  "options": {
    "thresholds": {
      "critical": 1200,
      "warning": 960
    },
    "notify_audit": false,
    "include_tags": true,
    "new_group_delay": 60,
    "notify_no_data": false,
    "silenced": {}
  },
  "priority": null,
  "restricted_roles": null
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
  "id": 138652989,
  "name": "[fairwinds/insights-agent] CronJob duration since last schedule exceeded (1h)",
  "type": "query alert",
  "query": "max(last_5m):avg:kubernetes_state.cronjob.duration_since_last_schedule{kube_namespace:insights-agent AND kube_cronjob NOT IN (prometheus-metrics, workloads)} by {cluster_name,kube_namespace,kube_cronjob} > 3900",
  "message": "{{#is_alert}} CronJob duration since last schedule has exceeded the critical threshold (> {{threshold}}s). CronJob name: {{kube_cronjob.name}}. Cluster: {{cluster_name.name}} {{/is_alert}}\n\n{{#is_warning}} CronJob duration since last schedule has exceeded the warning threshold (> {{warn_threshold}}s). CronJob name: {{kube_cronjob.name}}. Cluster: {{cluster_name.name}} {{/is_warning}}",
  "tags": [],
  "options": {
    "thresholds": {
      "critical": 3900,
      "warning": 3660
    },
    "notify_audit": false,
    "include_tags": true,
    "new_group_delay": 60,
    "notify_no_data": false,
    "silenced": {}
  },
  "priority": null,
  "restricted_roles": null
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
  "id": 138653777,
  "name": "[fairwinds/insights-agent] CronJob OOM events",
  "type": "query alert",
  "query": "max(last_5m):container.memory.oom_events{kube_namespace:insights-agent} by {cluster_name,kube_namespace,kube_cronjob} > 4",
  "message": "{{#is_alert}} CronJob has OOM events. CronJob name: {{kube_cronjob.name}}. Cluster: {{cluster_name.name}} {{/is_alert}}\n\n{{#is_warning}} CronJob has OOM events. CronJob name: {{kube_cronjob.name}}. Cluster: {{cluster_name.name}} {{/is_warning}}",
  "tags": [],
  "options": {
    "thresholds": {
      "critical": 4,
      "warning": 3
    },
    "notify_audit": false,
    "include_tags": true,
    "new_group_delay": 60,
    "notify_no_data": false,
    "silenced": {}
  },
  "priority": null,
  "restricted_roles": null
}
322 changes: 322 additions & 0 deletions monitoring/dashboards/datadog/insights-agent/dashboard.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,322 @@
{
  "title": "[fairwinds/insights-agent] Kubernetes CronJob Dashboard",
  "description": "Resource consumption (CPU, memory, OOM events) and scheduling health (job pod outcomes, duration since last schedule) of the Fairwinds insights-agent CronJobs. Use the template variables to filter by cluster, namespace and CronJob.",
  "widgets": [
    {
      "id": 1732392467968311,
      "definition": {
        "type": "free_text",
        "text": "Fairwinds Insights Agent",
        "color": "#6915F0",
        "font_size": "56",
        "text_align": "center"
      },
      "layout": {
        "x": 0,
        "y": 0,
        "width": 12,
        "height": 1
      }
    },
    {
      "id": 1360739785147291,
      "definition": {
        "title": "Resource Consumption",
        "background_color": "blue",
        "show_title": true,
        "type": "group",
        "layout_type": "ordered",
        "widgets": [
          {
            "id": 4774855640317495,
            "definition": {
              "title": "CPU usage by CronJob",
              "title_size": "16",
              "title_align": "left",
              "show_legend": true,
              "legend_layout": "auto",
              "legend_columns": [
                "avg",
                "min",
                "max",
                "value",
                "sum"
              ],
              "type": "timeseries",
              "requests": [
                {
                  "formulas": [
                    {
                      "formula": "exclude_null(query1)"
                    }
                  ],
                  "queries": [
                    {
                      "name": "query1",
                      "data_source": "metrics",
                      "query": "sum:kubernetes.cpu.usage.total{$kube_cluster_name,$kube_namespace,$kube_cronjob} by {kube_cluster_name,kube_namespace,kube_cronjob}"
                    }
                  ],
                  "response_format": "timeseries",
                  "style": {
                    "palette": "dog_classic",
                    "line_type": "solid",
                    "line_width": "normal"
                  },
                  "display_type": "line"
                }
              ]
            },
            "layout": {
              "x": 0,
              "y": 0,
              "width": 6,
              "height": 3
            }
          },
          {
            "id": 2692162135133172,
            "definition": {
              "title": "Memory usage by CronJob",
              "title_size": "16",
              "title_align": "left",
              "show_legend": true,
              "legend_layout": "auto",
              "legend_columns": [
                "avg",
                "min",
                "max",
                "value",
                "sum"
              ],
              "type": "timeseries",
              "requests": [
                {
                  "formulas": [
                    {
                      "formula": "exclude_null(query1)"
                    }
                  ],
                  "queries": [
                    {
                      "name": "query1",
                      "data_source": "metrics",
                      "query": "sum:kubernetes.memory.usage{$kube_cluster_name,$kube_namespace,$kube_cronjob} by {kube_namespace,kube_cronjob,kube_cluster_name}"
                    }
                  ],
                  "response_format": "timeseries",
                  "style": {
                    "palette": "dog_classic",
                    "line_type": "solid",
                    "line_width": "normal"
                  },
                  "display_type": "line"
                }
              ]
            },
            "layout": {
              "x": 6,
              "y": 0,
              "width": 6,
              "height": 3
            }
          },
          {
            "id": 4883798007459927,
            "definition": {
              "title": "CronJob OOM Events",
              "title_size": "16",
              "title_align": "left",
              "type": "alert_graph",
              "alert_id": "138653777",
              "viz_type": "timeseries"
            },
            "layout": {
              "x": 0,
              "y": 3,
              "width": 12,
              "height": 3
            }
          }
        ]
      },
      "layout": {
        "x": 0,
        "y": 1,
        "width": 12,
        "height": 7
      }
    },
    {
      "id": 5187018217533442,
      "definition": {
        "title": "Scheduling",
        "background_color": "vivid_green",
        "show_title": true,
        "type": "group",
        "layout_type": "ordered",
        "widgets": [
          {
            "id": 425294088513441,
            "definition": {
              "title": "Successful Job Pods (avg)",
              "title_size": "16",
              "title_align": "left",
              "type": "query_value",
              "requests": [
                {
                  "response_format": "scalar",
                  "queries": [
                    {
                      "name": "query1",
                      "data_source": "metrics",
                      "query": "avg:kubernetes_state.job.succeeded{$kube_cluster_name,$kube_namespace,$kube_cronjob}",
                      "aggregator": "avg"
                    }
                  ],
                  "conditional_formats": [
                    {
                      "comparator": ">",
                      "value": 1,
                      "palette": "white_on_green"
                    }
                  ],
                  "formulas": [
                    {
                      "formula": "query1"
                    }
                  ]
                }
              ],
              "autoscale": true,
              "custom_unit": "jobs",
              "precision": 2,
              "timeseries_background": {
                "type": "area",
                "yaxis": {
                  "include_zero": true
                }
              }
            },
            "layout": {
              "x": 0,
              "y": 0,
              "width": 6,
              "height": 2
            }
          },
          {
            "id": 8751120051550263,
            "definition": {
              "title": "Failed Job Pods (avg)",
              "title_size": "16",
              "title_align": "left",
              "type": "query_value",
              "requests": [
                {
                  "response_format": "scalar",
                  "queries": [
                    {
                      "name": "query1",
                      "data_source": "metrics",
                      "query": "avg:kubernetes_state.job.failed{$kube_cluster_name,$kube_namespace,$kube_cronjob}",
                      "aggregator": "avg"
                    }
                  ],
                  "conditional_formats": [
                    {
                      "comparator": ">",
                      "value": 0,
                      "palette": "white_on_red"
                    }
                  ],
                  "formulas": [
                    {
                      "formula": "query1"
                    }
                  ]
                }
              ],
              "autoscale": true,
              "custom_unit": "jobs",
              "precision": 2,
              "timeseries_background": {
                "type": "area",
                "yaxis": {
                  "include_zero": true
                }
              }
            },
            "layout": {
              "x": 6,
              "y": 0,
              "width": 6,
              "height": 2
            }
          },
          {
            "id": 5670253520315896,
            "definition": {
              "title": "CronJob duration since last schedule (15m)",
              "title_size": "16",
              "title_align": "left",
              "type": "alert_graph",
              "alert_id": "138653336",
              "viz_type": "timeseries"
            },
            "layout": {
              "x": 0,
              "y": 2,
              "width": 12,
              "height": 4
            }
          },
          {
            "id": 923140578038269,
            "definition": {
              "title": "CronJob duration since last schedule (1h)",
              "title_size": "16",
              "title_align": "left",
              "type": "alert_graph",
              "alert_id": "138652989",
              "viz_type": "timeseries"
            },
            "layout": {
              "x": 0,
              "y": 6,
              "width": 12,
              "height": 4
            }
          }
        ]
      },
      "layout": {
        "x": 0,
        "y": 8,
        "width": 12,
        "height": 11,
        "is_column_break": true
      }
    }
  ],
  "template_variables": [
    {
      "name": "kube_cluster_name",
      "prefix": "kube_cluster_name",
      "available_values": [],
      "default": "*"
    },
    {
      "name": "kube_namespace",
      "prefix": "kube_namespace",
      "available_values": [],
      "default": "insights-agent"
    },
    {
      "name": "kube_cronjob",
      "prefix": "kube_cronjob",
      "available_values": [],
      "default": "*"
    }
  ],
  "layout_type": "ordered",
  "notify_list": [],
  "reflow_type": "fixed"
}

0 comments on commit 3ff0b1d

Please sign in to comment.