From 2973d15b93a3f6d07887a17c3af5382961a413c0 Mon Sep 17 00:00:00 2001 From: Ashish Jaiswal Date: Sun, 18 Aug 2024 11:54:13 +0530 Subject: [PATCH 1/3] added support for monitoring out-of-sync apps of argocd, for now only kubeaid apps --- build/kube-prometheus/common-template.jsonnet | 103 +++++++++++++++++- 1 file changed, 101 insertions(+), 2 deletions(-) diff --git a/build/kube-prometheus/common-template.jsonnet b/build/kube-prometheus/common-template.jsonnet index 5f0c3eae7..e2cf15ec1 100644 --- a/build/kube-prometheus/common-template.jsonnet +++ b/build/kube-prometheus/common-template.jsonnet @@ -16,7 +16,77 @@ local default_vars = { 'cert-manager', 'traefik', ], - + kubeaid_users_apps+: [], + kubeaid_apps+: [ + 'argo-cd', + 'argocd-image-updater', + 'aws-ebs-csi-driver', + 'aws-efs-csi-driver', + 'capi-cluster', + 'ccm-hetzner', + 'cerebro', + 'cert-manager', + 'cilium', + 'circleci-runner', + 'cloudnative-pg', + 'cluster-api', + 'cluster-autoscaler', + 'crossplane', + 'dokuwiki', + 'errbot', + 'external-dns', + 'filebeat', + 'fluent-bit', + 'gatekeeper', + 'gitea-runner', + '.gitignore', + 'gitlab-runner', + 'grafana-operator', + 'graylog', + 'haproxy', + 'harbor', + 'k8id-custom-azure', + 'k8s-event-logger', + 'keda', + 'keycloakx', + 'kube2iam', + 'kubernetes-dashboard', + 'mail', + 'mariadb-operator', + 'matomo', + 'mattermost-team-edition', + 'metallb', + 'metrics-server', + 'mongodb-operator', + 'obmondo-k8s-agent', + 'oncall', + 'opencost', + 'opensearch', + 'opensearch-dashboards', + 'postgres-operator', + 'prometheus-adapter', + 'prometheus-linuxaid', + 'puppetserver', + 'rabbitmq-operator', + 'redis-operator', + 'redmine', + 'relate', + 'reloader', + 'rook-ceph', + 'sealed-secrets', + 'snapshot-controller', + 'sonarqube', + 'strimzi-kafka-operator', + 'teleport-cluster', + 'teleport-kube-agent', + 'tigera-operator', + 'traefik', + 'traefik-forward-auth', + 'velero', + 'whoami', + 'yetibot', + 'zfs-localpv', + ], prometheus_operator_resources: { limits: { 
memory: '80Mi' }, requests: { cpu: '20m', memory: '80Mi' }, @@ -207,7 +277,35 @@ local kp = import 'kube-prometheus/addons/custom-metrics.libsonnet' ) else {} ) + - + { + argocdApplications+: { + prometheusRuleExample+: { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'PrometheusRule', + metadata: { + name: 'kubeaidManagedApps', + namespace: $.values.common.namespace, + }, + spec: { + groups: [ + { + name: 'kubeaidManagedApps', + rules: [ + { + record: 'kubeaidManagedApps', + expr: 0, + labels: { + name: argocdApps, + }, + } + for argocdApps in vars.kubeaid_apps + vars.kubeaid_users_apps + ], + }, + ], + }, + }, + }, + } + { grafana+: { networkPolicy+: { @@ -679,6 +777,7 @@ local kp = } } + { ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + { ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['argocd-application-prometheus-rules-' + name]: kp.argocdApplications[name] for name in std.objectFields(kp.argocdApplications) } + ( // Need to figure out elseif // if vars != 'gke' || vars != 'azure' didnt worked From f1e1a7d69919078d1fdf96df3ab34ffd24fb589f Mon Sep 17 00:00:00 2001 From: Ashish Jaiswal Date: Sun, 18 Aug 2024 22:17:40 +0530 Subject: [PATCH 2/3] removed .gitignore from the kubeaid list and fixed metadata.name for promrule and added a default project --- build/kube-prometheus/common-template.jsonnet | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/build/kube-prometheus/common-template.jsonnet b/build/kube-prometheus/common-template.jsonnet index e2cf15ec1..b7f7e2f24 100644 --- a/build/kube-prometheus/common-template.jsonnet +++ b/build/kube-prometheus/common-template.jsonnet @@ -39,7 +39,6 @@ local default_vars = { 'fluent-bit', 'gatekeeper', 'gitea-runner', - '.gitignore', 'gitlab-runner', 'grafana-operator', 'graylog', @@ -283,7 +282,7 @@ local kp = apiVersion: 'monitoring.coreos.com/v1', kind: 'PrometheusRule', metadata: { - name: 
'kubeaidManagedApps', + name: 'kubeaid-managed-apps', namespace: $.values.common.namespace, }, spec: { @@ -296,6 +295,8 @@ local kp = expr: 0, labels: { name: argocdApps, + // TODO: maybe add support for other projects + project: 'default', }, } for argocdApps in vars.kubeaid_apps + vars.kubeaid_users_apps From 730161c0ed3893c5e8d9f99ab4a0abf33bd4963c Mon Sep 17 00:00:00 2001 From: Ashish Jaiswal Date: Sun, 18 Aug 2024 22:41:31 +0530 Subject: [PATCH 3/3] changed the alert expr for argocd apps which are unhealthy and out of sync --- build/kube-prometheus/mixins/argo-cd/mixin.libsonnet | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/build/kube-prometheus/mixins/argo-cd/mixin.libsonnet b/build/kube-prometheus/mixins/argo-cd/mixin.libsonnet index ddd857057..c3ba22b20 100644 --- a/build/kube-prometheus/mixins/argo-cd/mixin.libsonnet +++ b/build/kube-prometheus/mixins/argo-cd/mixin.libsonnet @@ -37,11 +37,11 @@ // Inspiration from here https://github.com/adinhodovic/argo-cd-mixin/blob/main/alerts/alerts.libsonnet { alert: 'ArgoCdAppOutOfSync', - expr: 'sum by (job, dest_server, project, sync_status) (argocd_app_info{job=~".*",sync_status!="Synced"}) >= 1', + expr: 'count by (project, sync_status) ((sum by (name, job, dest_server, project, sync_status) (argocd_app_info{job=~".*",sync_status!="Synced"}) >= 1) + on (name) group_left kubeaidManagedApps)', labels: { severity: 'warning', }, - 'for': '15m', + 'for': '2h', annotations: { summary: 'ArgoCD Application is Out Of Sync.', description: 'Multiple application under {{ .Labels.project }} is out of sync with the sync status {{ .Labels.sync_status }} for the past 15m', @@ -49,11 +49,11 @@ }, { alert: 'ArgoCdAppUnhealthy', - expr: 'sum by (job, project, dest_server, health_status) (argocd_app_info{health_status!~"Healthy|Progressing"}) >= 1', + expr: 'count by (health_status,project) ((sum by (name, job, dest_server, project, health_status) (argocd_app_info{health_status!~"Healthy|Progressing"}) >= 1) 
+ on (name) group_left kubeaidManagedApps)', labels: { severity: 'warning', }, - 'for': '15m', + 'for': '2h', annotations: { summary: 'ArgoCD Application is not healthy.', description: 'Multiple application under {{ .Labels.project }} is not healthy with the health status {{ .Labels.health_status }} for the past 15m',