diff --git a/resources/grafana/generated/dashboards/rhacs-central.yaml b/resources/grafana/generated/dashboards/rhacs-central.yaml index a83bb13f..e7a9e261 100644 --- a/resources/grafana/generated/dashboards/rhacs-central.yaml +++ b/resources/grafana/generated/dashboards/rhacs-central.yaml @@ -522,6 +522,7 @@ spec: "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", + "axisBorderShow": false, "fillOpacity": 10, "gradientMode": "none", "hideFrom": { @@ -531,6 +532,7 @@ spec: }, "lineInterpolation": "stepAfter", "lineWidth": 1, + "insertNulls": false, "pointSize": 5, "scaleDistribution": { "type": "linear" @@ -583,7 +585,6 @@ spec: "y": 7 }, "id": 110, - "links": [], "options": { "legend": { "calcs": [], @@ -622,6 +623,62 @@ spec: "legendFormat": "scanner", "range": true, "refId": "B" + }, + { + "datasource": { + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(kube_pod_container_status_restarts_total{namespace=\"rhacs-$instance_id\", container=\"db\", pod=~\"scanner-db-.*\", job=~\"kube-state-metrics\"}[5m]))", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "scanner-db", + "range": true, + "refId": "F" + }, + { + "datasource": { + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(kube_pod_container_status_restarts_total{namespace=\"rhacs-$instance_id\", container=\"matcher\", job=~\"kube-state-metrics\"}[5m]))", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "scanner-v4-matcher", + "range": true, + "refId": "C" + }, + { + "datasource": { + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(kube_pod_container_status_restarts_total{namespace=\"rhacs-$instance_id\", container=\"indexer\", job=~\"kube-state-metrics\"}[5m]))", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "scanner-v4-indexer", + "range": true, + "refId": "D" + }, + { + "datasource": { + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(kube_pod_container_status_restarts_total{namespace=\"rhacs-$instance_id\", container=\"db\", pod=~\"scanner-v4-db-.*\", job=~\"kube-state-metrics\"}[5m]))", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "scanner-v4-db", + "range": true, + "refId": "E" } ], "title": "Container Restarts", diff --git a/resources/grafana/generated/dashboards/rhacs-emailsender.yaml b/resources/grafana/generated/dashboards/rhacs-emailsender.yaml new file mode 100644 index 00000000..9b4e07de --- /dev/null +++ b/resources/grafana/generated/dashboards/rhacs-emailsender.yaml @@ -0,0 +1,611 @@ +apiVersion: integreatly.org/v1alpha1 +kind: GrafanaDashboard +metadata: + labels: + app: rhacs + monitoring-key: middleware + name: rhacs-emailsender +spec: + name: rhacs-emailsender.json + json: | + { + "__inputs": [], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "7.0.3" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + 
"target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "enable": true, + "expr": "count (count by (git_version) (label_replace(count_over_time(kubernetes_build_info{job!~\"kube-dns|coredns\"}[${__interval}]), \"git_version\", \"$1\", \"git_version\", \"(v[0-9]*.[0-9]*).*\"))) > 1", + "iconColor": "purple", + "name": "Kubernetes Upgrade", + "textFormat": "Kubernetes Upgrade" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "enable": true, + "expr": "count (count by (gitVersion) (count_over_time (openshift_apiserver_build_info[${__interval}]))) > 1", + "iconColor": "red", + "name": "OpenShift Upgrade", + "textFormat": "OpenShift Upgrade" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 25, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "sum(rate(acs_emailsender_send_email_total{job=~\"emailsender\"}[5m])) by (tenant_id)", + "interval": "", + "legendFormat": "{{tenant_id}}", + "range": true, + "refId": "A" + } + ], + "title": "Rate of sent emails by tenant", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + 
"overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 147, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "sum(acs_emailsender_send_email_total{job=\"emailsender\"}) by (tenant_id)", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Total emails sent by tenant", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "noValue": "No registered throttled email send events", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 148, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "sum(rate(acs_emailsender_throttled_send_email_total{job=~\"emailsender\"}[5m])) by (tenant_id)", + "interval": "", + "legendFormat": "{{tenant_id}}", + "range": true, + "refId": "A" + } + ], + "title": "Rate of throttled email sends by tenant", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "noValue": "No registered failed email send events", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 149, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + 
"placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "sum(rate(acs_emailsender_failed_send_email_total{job=~\"emailsender\"}[5m])) by (tenant_id)", + "interval": "", + "legendFormat": "{{tenant_id}}", + "range": true, + "refId": "A" + } + ], + "title": "Rate of failed email sends by tenant", + "type": "timeseries" + } + ], + "refresh": "", + "revision": 1, + "schemaVersion": 39, + "tags": ["rhacs"], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": ["All"], + "value": ["$__all"] + }, + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "definition": "label_values(process_cpu_seconds_total{job=\"central\", rhacs_org_id!=\"16536854\",cluster_id=~\"$cluster_id\"}, rhacs_org_name)", + "description": "Red Hat SSO Organisation Name", + "hide": 0, + "includeAll": true, + "label": "Organisation", + "multi": true, + "name": "org_name", + "options": [], + "query": { + "query": "label_values(process_cpu_seconds_total{job=\"central\", rhacs_org_id!=\"16536854\",cluster_id=~\"$cluster_id\"}, rhacs_org_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": true, + "text": ["All"], + "value": ["$__all"] + }, + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "definition": "label_values(process_cpu_seconds_total{job=\"central\", rhacs_org_name=~\"$org_name\", rhacs_org_id!=\"16536854\",cluster_id=~\"$cluster_id\"}, rhacs_org_id)", + "description": "Red Hat SSO Organisation ID", + "hide": 0, + "includeAll": true, + "label": "Organisation ID", + "multi": true, + "name": "org_id", + "options": [], + "query": { + "query": "label_values(process_cpu_seconds_total{job=\"central\", rhacs_org_name=~\"$org_name\", rhacs_org_id!=\"16536854\",cluster_id=~\"$cluster_id\"}, rhacs_org_id)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": true, + "text": ["All"], + "value": ["$__all"] + }, + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "definition": "label_values(process_cpu_seconds_total{job=\"central\", rhacs_org_name=~\"$org_name\", rhacs_org_id=~\"$org_id\",cluster_id=~\"$cluster_id\"}, rhacs_instance_id)", + "description": "RHACS Central Instance ID", + "hide": 0, + "includeAll": true, + "label": "Central", + "multi": true, + "name": "instance_id", + "options": [], + "query": { + "query": "label_values(process_cpu_seconds_total{job=\"central\", rhacs_org_name=~\"$org_name\", rhacs_org_id=~\"$org_id\",cluster_id=~\"$cluster_id\"}, rhacs_instance_id)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "definition": "label_values(process_cpu_seconds_total{job=\"central\"}, cluster_id)", + "description": "RHACS Cluster ID", + "hide": 0, + "includeAll": true, + "label": "Cluster ID", + "multi": true, + "name": "cluster_id", + "options": [], + "query": { + "query": "label_values(process_cpu_seconds_total{job=\"central\"}, cluster_id)", + 
"refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "RHACS Dataplane - Email Sender", + "uid": "aba19019-8d3f-4672-b1e0-1ff4a3d0ee0a", + "version": 1, + "weekStart": "" + } diff --git a/resources/grafana/sources/rhacs-central.json b/resources/grafana/sources/rhacs-central.json index aac1ed20..fc03343c 100644 --- a/resources/grafana/sources/rhacs-central.json +++ b/resources/grafana/sources/rhacs-central.json @@ -512,6 +512,7 @@ "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", + "axisBorderShow": false, "fillOpacity": 10, "gradientMode": "none", "hideFrom": { @@ -521,6 +522,7 @@ }, "lineInterpolation": "stepAfter", "lineWidth": 1, + "insertNulls": false, "pointSize": 5, "scaleDistribution": { "type": "linear" @@ -573,7 +575,6 @@ "y": 7 }, "id": 110, - "links": [], "options": { "legend": { "calcs": [], @@ -612,6 +613,62 @@ "legendFormat": "scanner", "range": true, "refId": "B" + }, + { + "datasource": { + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(kube_pod_container_status_restarts_total{namespace=\"rhacs-$instance_id\", container=\"db\", pod=~\"scanner-db-.*\", job=~\"kube-state-metrics\"}[5m]))", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "scanner-db", + "range": true, + "refId": "F" + }, + { + "datasource": { + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(kube_pod_container_status_restarts_total{namespace=\"rhacs-$instance_id\", container=\"matcher\", job=~\"kube-state-metrics\"}[5m]))", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "scanner-v4-matcher", + "range": true, + "refId": "C" + }, + { + "datasource": { + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(kube_pod_container_status_restarts_total{namespace=\"rhacs-$instance_id\", container=\"indexer\", job=~\"kube-state-metrics\"}[5m]))", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "scanner-v4-indexer", + "range": true, + "refId": "D" + }, + { + "datasource": { + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(increase(kube_pod_container_status_restarts_total{namespace=\"rhacs-$instance_id\", container=\"db\", pod=~\"scanner-v4-db-.*\", job=~\"kube-state-metrics\"}[5m]))", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "scanner-v4-db", + "range": true, + "refId": "E" } ], "title": "Container Restarts", diff --git a/resources/grafana/sources/rhacs-emailsender.json b/resources/grafana/sources/rhacs-emailsender.json new file mode 100644 index 00000000..61a20b54 --- /dev/null +++ b/resources/grafana/sources/rhacs-emailsender.json @@ -0,0 +1,601 @@ +{ + "__inputs": [], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "7.0.3" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", 
+ "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "enable": true, + "expr": "count (count by (git_version) (label_replace(count_over_time(kubernetes_build_info{job!~\"kube-dns|coredns\"}[${__interval}]), \"git_version\", \"$1\", \"git_version\", \"(v[0-9]*.[0-9]*).*\"))) > 1", + "iconColor": "purple", + "name": "Kubernetes Upgrade", + "textFormat": "Kubernetes Upgrade" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "enable": true, + "expr": "count (count by (gitVersion) (count_over_time (openshift_apiserver_build_info[${__interval}]))) > 1", + "iconColor": "red", + "name": "OpenShift Upgrade", + "textFormat": "OpenShift Upgrade" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 25, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "sum(rate(acs_emailsender_send_email_total{job=~\"emailsender\"}[5m])) by (tenant_id)", + "interval": "", + "legendFormat": "{{tenant_id}}", + "range": true, + "refId": "A" + } + ], + "title": "Rate of sent emails by tenant", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } 
+ ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 147, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "sum(acs_emailsender_send_email_total{job=\"emailsender\"}) by (tenant_id)", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Total emails sent by tenant", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "noValue": "No registered throttled email send events", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 148, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "sum(rate(acs_emailsender_throttled_send_email_total{job=~\"emailsender\"}[5m])) by (tenant_id)", + "interval": "", + "legendFormat": "{{tenant_id}}", + "range": true, + "refId": "A" + } + ], + "title": "Rate of throttled email sends by tenant", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 3, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "noValue": "No registered failed email send events", + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 149, + "options": { + "legend": { + 
"calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.1.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "editorMode": "code", + "expr": "sum(rate(acs_emailsender_failed_send_email_total{job=~\"emailsender\"}[5m])) by (tenant_id)", + "interval": "", + "legendFormat": "{{tenant_id}}", + "range": true, + "refId": "A" + } + ], + "title": "Rate of failed email sends by tenant", + "type": "timeseries" + } + ], + "refresh": "", + "revision": 1, + "schemaVersion": 39, + "tags": ["rhacs"], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": ["All"], + "value": ["$__all"] + }, + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "definition": "label_values(process_cpu_seconds_total{job=\"central\", rhacs_org_id!=\"16536854\",cluster_id=~\"$cluster_id\"}, rhacs_org_name)", + "description": "Red Hat SSO Organisation Name", + "hide": 0, + "includeAll": true, + "label": "Organisation", + "multi": true, + "name": "org_name", + "options": [], + "query": { + "query": "label_values(process_cpu_seconds_total{job=\"central\", rhacs_org_id!=\"16536854\",cluster_id=~\"$cluster_id\"}, rhacs_org_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": true, + "text": ["All"], + "value": ["$__all"] + }, + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "definition": "label_values(process_cpu_seconds_total{job=\"central\", rhacs_org_name=~\"$org_name\", rhacs_org_id!=\"16536854\",cluster_id=~\"$cluster_id\"}, rhacs_org_id)", + "description": "Red Hat SSO Organisation ID", + "hide": 0, + "includeAll": true, + "label": "Organisation ID", + "multi": true, + "name": "org_id", + "options": [], + "query": { + "query": "label_values(process_cpu_seconds_total{job=\"central\", rhacs_org_name=~\"$org_name\", rhacs_org_id!=\"16536854\",cluster_id=~\"$cluster_id\"}, rhacs_org_id)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": true, + "text": ["All"], + "value": ["$__all"] + }, + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "definition": "label_values(process_cpu_seconds_total{job=\"central\", rhacs_org_name=~\"$org_name\", rhacs_org_id=~\"$org_id\",cluster_id=~\"$cluster_id\"}, rhacs_instance_id)", + "description": "RHACS Central Instance ID", + "hide": 0, + "includeAll": true, + "label": "Central", + "multi": true, + "name": "instance_id", + "options": [], + "query": { + "query": "label_values(process_cpu_seconds_total{job=\"central\", rhacs_org_name=~\"$org_name\", rhacs_org_id=~\"$org_id\",cluster_id=~\"$cluster_id\"}, rhacs_instance_id)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "definition": "label_values(process_cpu_seconds_total{job=\"central\"}, cluster_id)", + "description": "RHACS Cluster ID", + "hide": 0, + "includeAll": true, + "label": "Cluster ID", + "multi": true, + "name": "cluster_id", + "options": [], + "query": { + "query": 
"label_values(process_cpu_seconds_total{job=\"central\"}, cluster_id)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-24h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "RHACS Dataplane - Email Sender", + "uid": "aba19019-8d3f-4672-b1e0-1ff4a3d0ee0a", + "version": 1, + "weekStart": "" +} diff --git a/resources/grafana/templates/dashboards/rhacs-emailsender.yaml b/resources/grafana/templates/dashboards/rhacs-emailsender.yaml new file mode 100644 index 00000000..4bda1633 --- /dev/null +++ b/resources/grafana/templates/dashboards/rhacs-emailsender.yaml @@ -0,0 +1,10 @@ +apiVersion: integreatly.org/v1alpha1 +kind: GrafanaDashboard +metadata: + labels: + app: rhacs + monitoring-key: middleware + name: rhacs-emailsender +spec: + name: rhacs-emailsender.json + json: | diff --git a/resources/index.json b/resources/index.json index f4f56526..e36dc3ae 100644 --- a/resources/index.json +++ b/resources/index.json @@ -6,7 +6,8 @@ "prometheus/pod_monitors/prometheus-self-metrics.yaml", "prometheus/pod_monitors/rhacs-cloudwatch-exporter.yaml", "prometheus/pod_monitors/rhacs-fleetshard-sync-metrics.yaml", - "prometheus/pod_monitors/rhacs-tenant-metrics.yaml" + "prometheus/pod_monitors/rhacs-tenant-metrics.yaml", + "prometheus/pod_monitors/rhacs-emailsender-metrics.yaml" ], "rules": [ "prometheus/billing-rules.yaml", @@ -47,7 +48,8 @@ "grafana/generated/dashboards/rhacs-central-release.yaml", "grafana/generated/dashboards/rhacs-central-slo.yaml", "grafana/generated/dashboards/rhacs-cluster-overview.yaml", - "grafana/generated/dashboards/rhacs-cluster-resource-adjustment.yaml" + "grafana/generated/dashboards/rhacs-cluster-resource-adjustment.yaml", + "grafana/generated/dashboards/rhacs-emailsender.yaml" ], "grafanaVersion": "11.1.0" }, diff --git a/resources/prometheus/pod_monitors/rhacs-emailsender-metrics.yaml b/resources/prometheus/pod_monitors/rhacs-emailsender-metrics.yaml new file mode 100644 index 00000000..71c99f7a --- /dev/null +++ b/resources/prometheus/pod_monitors/rhacs-emailsender-metrics.yaml @@ -0,0 +1,30 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: rhacs-emailsender-metrics + labels: + app: rhacs +spec: + selector: + matchLabels: + app: "emailsender" + namespaceSelector: + any: true + podMetricsEndpoints: + - path: /metrics + port: monitoring + relabelings: + - action: labeldrop + regex: endpoint + + - sourceLabels: [container] + action: replace + targetLabel: job + + - action: labelmap + regex: __meta_kubernetes_pod_annotation_rhacs_redhat_com_(.+) + replacement: rhacs_${1} + + - action: labelmap + regex: __meta_kubernetes_pod_label_rhacs_redhat_com_(.+) + replacement: rhacs_${1} diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index 7a5930c9..b2f33f50 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -159,6 +159,59 @@ spec: description: "Fleetshard synchronizer manages `{{ $value }}` centrals. The number of Centrals should always be larger than zero in a working system. If it drops to or below zero, fleetshard synchronizer is assumed to be in a failed state." 
sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-007-fleetshard-sync-reconciliation-error.md" + - name: rhacs-emailsender + rules: + - alert: RHACSEmailsenderScrapeFailed + expr: | + (avg_over_time(up{pod=~"emailsender-.*"}[10m]) < 0.5 and ON(pod) kube_pod_container_status_ready{pod=~"emailsender-.*"} == 1) or absent(up{pod=~"emailsender-.*"}) + for: 20m + labels: + severity: warning + annotations: + summary: "Prometheus unable to scrape metrics from target `{{ $labels.pod }}` in namespace `{{ $labels.namespace }}`." + description: "During the last 10 minutes, only `{{ $value | humanizePercentage }}` of scrapes of target `{{ $labels.pod }}` in namespace `{{ $labels.namespace }}` were successful. This alert is raised when less than 50% of scrapes are successful." + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-046-emailsender-unavailable.md" + - alert: RHACSEmailsenderContainerDown + expr: | + avg_over_time(kube_pod_container_status_ready{pod=~"emailsender-.*"}[10m]) < 0.5 + for: 20m + labels: + severity: warning + annotations: + summary: "Email Sender container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` is down or in a CrashLoopBackOff status." + description: "Email Sender container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` has been down or in a CrashLoopBackOff status for at least 10 minutes." + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-046-emailsender-unavailable.md" + - alert: RHACSEmailsenderContainerFrequentlyRestarting + expr: increase(kube_pod_container_status_restarts_total{pod=~"emailsender-.*"}[30m]) > 3 + labels: + severity: warning + annotations: + summary: "Email Sender container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` restarted more than 3 times." + description: "Email Sender container `{{ $labels.pod }}/{{ $labels.container }}` in namespace `{{ $labels.namespace }}` has restarted more than 3 times during the last 30 minutes." + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-046-emailsender-unavailable.md" + - alert: RHACSEmailsenderSendErrors + expr: |2 + (rate(acs_emailsender_failed_send_email_total[10m]) + / + rate(acs_emailsender_send_email_total[10m])) > 0.10 + for: 5m + labels: + severity: warning + annotations: + summary: "Email Sender container failing sending emails" + description: "Email Sender has a send email error rate of {{ $value | humanizePercentage }} over the last 10 minutes." + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-047-emailsender-ses-send-error.md" + - alert: RHACSEmailsenderThrottledSend + expr: | + rate(acs_emailsender_throttled_send_email_total[10m]) * 60 > 0 + for: 10m + labels: + severity: warning + annotations: + summary: "Email Sender throttled sending for `{{ $labels.tenant_id }}` Central instance" + description: "Email Sender throttled `{{ $labels.tenant_id }}` Central {{ $value | humanize }} time(s) per minute over the last 10 minutes." 
+ sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-048-emailsender-ses-send-throttled.md" + - name: tenant-resources rules: - expr: | diff --git a/resources/prometheus/unit_tests/RHACSEmailsenderContainerDown.yaml b/resources/prometheus/unit_tests/RHACSEmailsenderContainerDown.yaml new file mode 100644 index 00000000..42f72a82 --- /dev/null +++ b/resources/prometheus/unit_tests/RHACSEmailsenderContainerDown.yaml @@ -0,0 +1,27 @@ +rule_files: + - /tmp/prometheus-rules-test.yaml + +evaluation_interval: 1m + +tests: + - interval: 1m + input_series: + - series: kube_pod_container_status_ready{namespace="rhacs", pod="emailsender-123", container="emailsender"} + values: "1+0x10 0+0x50" + alert_rule_test: + - eval_time: 15m + alertname: RHACSEmailsenderContainerDown + exp_alerts: [] + - eval_time: 40m + alertname: RHACSEmailsenderContainerDown + exp_alerts: + - exp_labels: + alertname: RHACSEmailsenderContainerDown + container: emailsender + namespace: rhacs + pod: emailsender-123 + severity: warning + exp_annotations: + summary: "Email Sender container `emailsender-123/emailsender` in namespace `rhacs` is down or in a CrashLoopBackOff status." + description: "Email Sender container `emailsender-123/emailsender` in namespace `rhacs` has been down or in a CrashLoopBackOff status for at least 10 minutes." + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-046-emailsender-unavailable.md" diff --git a/resources/prometheus/unit_tests/RHACSEmailsenderContainerFrequentlyRestarting.yaml b/resources/prometheus/unit_tests/RHACSEmailsenderContainerFrequentlyRestarting.yaml new file mode 100644 index 00000000..07c36bf1 --- /dev/null +++ b/resources/prometheus/unit_tests/RHACSEmailsenderContainerFrequentlyRestarting.yaml @@ -0,0 +1,27 @@ +rule_files: + - /tmp/prometheus-rules-test.yaml + +evaluation_interval: 1m + +tests: + - interval: 1m + input_series: + - series: kube_pod_container_status_restarts_total{namespace="rhacs", pod="emailsender-123", container="emailsender"} + values: "0+0x30 1+1x10 4+1x20" + alert_rule_test: + - eval_time: 30m + alertname: RHACSEmailsenderContainerFrequentlyRestarting + exp_alerts: [] + - eval_time: 60m + alertname: RHACSEmailsenderContainerFrequentlyRestarting + exp_alerts: + - exp_labels: + alertname: RHACSEmailsenderContainerFrequentlyRestarting + container: emailsender + namespace: rhacs + pod: emailsender-123 + severity: warning + exp_annotations: + summary: "Email Sender container `emailsender-123/emailsender` in namespace `rhacs` restarted more than 3 times." + description: "Email Sender container `emailsender-123/emailsender` in namespace `rhacs` has restarted more than 3 times during the last 30 minutes." 
+ sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-046-emailsender-unavailable.md" diff --git a/resources/prometheus/unit_tests/RHACSEmailsenderScrapeFailed.yaml.yaml b/resources/prometheus/unit_tests/RHACSEmailsenderScrapeFailed.yaml.yaml new file mode 100644 index 00000000..351bb80e --- /dev/null +++ b/resources/prometheus/unit_tests/RHACSEmailsenderScrapeFailed.yaml.yaml @@ -0,0 +1,29 @@ +rule_files: + - /tmp/prometheus-rules-test.yaml + +evaluation_interval: 1m + +tests: + - interval: 1m + input_series: + - series: up{namespace="rhacs", pod="emailsender-123", instance="1.2.3.4:9090"} + values: "0+0x20 1+0x20" + - series: kube_pod_container_status_ready{namespace="rhacs", pod="emailsender-123"} + values: "1+0x40" + alert_rule_test: + - eval_time: 10m + alertname: RHACSEmailsenderScrapeFailed + exp_alerts: [] + - eval_time: 25m + alertname: RHACSEmailsenderScrapeFailed + exp_alerts: + - exp_labels: + alertname: RHACSEmailsenderScrapeFailed + instance: 1.2.3.4:9090 + namespace: rhacs + pod: emailsender-123 + severity: warning + exp_annotations: + summary: "Prometheus unable to scrape metrics from target `emailsender-123` in namespace `rhacs`." + description: "During the last 10 minutes, only `45.45%` of scrapes of target `emailsender-123` in namespace `rhacs` were successful. This alert is raised when less than 50% of scrapes are successful." + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-046-emailsender-unavailable.md" diff --git a/resources/prometheus/unit_tests/RHACSEmailsenderSendErrors.yaml b/resources/prometheus/unit_tests/RHACSEmailsenderSendErrors.yaml new file mode 100644 index 00000000..11f5ef39 --- /dev/null +++ b/resources/prometheus/unit_tests/RHACSEmailsenderSendErrors.yaml @@ -0,0 +1,29 @@ +rule_files: + - /tmp/prometheus-rules-test.yaml + +evaluation_interval: 1m + +tests: + - interval: 1m + input_series: + - series: acs_emailsender_failed_send_email_total{namespace="rhacs", pod="emailsender-123", container="emailsender"} + values: "0+0x10 1+1x50" + - series: acs_emailsender_send_email_total{namespace="rhacs", pod="emailsender-123", container="emailsender"} + values: "1+1x10 1+2x50" + alert_rule_test: + - eval_time: 15m + alertname: RHACSEmailsenderSendErrors + exp_alerts: [] + - eval_time: 40m + alertname: RHACSEmailsenderSendErrors + exp_alerts: + - exp_labels: + alertname: RHACSEmailsenderSendErrors + container: emailsender + namespace: rhacs + pod: emailsender-123 + severity: warning + exp_annotations: + summary: "Email Sender container failing sending emails" + description: "Email Sender has a send email error rate of 50% over the last 10 minutes." 
+ sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-047-emailsender-ses-send-error.md" diff --git a/resources/prometheus/unit_tests/RHACSEmailsenderThrottledSend.yaml b/resources/prometheus/unit_tests/RHACSEmailsenderThrottledSend.yaml new file mode 100644 index 00000000..a270dc6f --- /dev/null +++ b/resources/prometheus/unit_tests/RHACSEmailsenderThrottledSend.yaml @@ -0,0 +1,28 @@ +rule_files: + - /tmp/prometheus-rules-test.yaml + +evaluation_interval: 1m + +tests: + - interval: 1m + input_series: + - series: acs_emailsender_throttled_send_email_total{namespace="rhacs", pod="emailsender-123", container="emailsender", tenant_id="centralid"} + values: "0+0x10 2+1x60" + alert_rule_test: + - eval_time: 15m + alertname: RHACSEmailsenderThrottledSend + exp_alerts: [] + - eval_time: 40m + alertname: RHACSEmailsenderThrottledSend + exp_alerts: + - exp_labels: + alertname: RHACSEmailsenderThrottledSend + container: emailsender + namespace: rhacs + pod: emailsender-123 + tenant_id: centralid + severity: warning + exp_annotations: + summary: "Email Sender throttled sending for `centralid` Central instance" + description: "Email Sender throttled `centralid` Central 1 time(s) per minute over the last 10 minutes." + sop_url: "https://gitlab.cee.redhat.com/stackrox/acs-managed-service-runbooks/blob/master/sops/dp-048-emailsender-ses-send-throttled.md"