From a5275ae81a44da3e944cf86ca62bfd3dc2e98729 Mon Sep 17 00:00:00 2001 From: Michal Hucko Date: Fri, 11 Oct 2024 12:53:46 +0200 Subject: [PATCH] Add alert rules to pvcviewer-operator based on the KF093 spec (#55) (#62) * Add alert rules to pvcviewer-operator based on the KF093 spec * add cos integration tests for alert rules Co-authored-by: Robert Gildein --- .../KubeflowPvcviewerOperatorServices.rules | 24 +++++++++++++++++++ tests/integration/test_charm.py | 10 ++++++++ 2 files changed, 34 insertions(+) create mode 100644 src/prometheus_alert_rules/KubeflowPvcviewerOperatorServices.rules diff --git a/src/prometheus_alert_rules/KubeflowPvcviewerOperatorServices.rules b/src/prometheus_alert_rules/KubeflowPvcviewerOperatorServices.rules new file mode 100644 index 0000000..c7d675d --- /dev/null +++ b/src/prometheus_alert_rules/KubeflowPvcviewerOperatorServices.rules @@ -0,0 +1,24 @@ +groups: +- name: KubeflowPvcviewerOperatorServices + rules: + - alert: KubeflowServiceDown + expr: up{} < 1 + for: 5m + labels: + severity: critical + annotations: + summary: "{{ $labels.juju_charm }} service is Down ({{ $labels.juju_model }}/{{ $labels.juju_unit }})" + description: | + One or more targets of {{ $labels.juju_charm }} charm are down on unit {{ $labels.juju_model }}/{{ $labels.juju_unit }}. + LABELS = {{ $labels }} + + - alert: KubeflowServiceIsNotStable + expr: avg_over_time(up{}[10m]) < 0.5 + for: 0m + labels: + severity: warning + annotations: + summary: "{{ $labels.juju_charm }} service is not stable ({{ $labels.juju_model }}/{{ $labels.juju_unit }})" + description: | + {{ $labels.juju_charm }} unit {{ $labels.juju_model }}/{{ $labels.juju_unit }} has been unreachable at least 50% of the time over the last 10 minutes. 
+ LABELS = {{ $labels }} diff --git a/tests/integration/test_charm.py b/tests/integration/test_charm.py index ef9fb20..510557c 100644 --- a/tests/integration/test_charm.py +++ b/tests/integration/test_charm.py @@ -11,9 +11,11 @@ import pytest import yaml from charmed_kubeflow_chisme.testing import ( + assert_alert_rules, assert_logging, assert_metrics_endpoint, deploy_and_assert_grafana_agent, + get_alert_rules, ) from lightkube import codecs from lightkube.generic_resource import create_namespaced_resource @@ -152,6 +154,14 @@ async def test_logging(ops_test: OpsTest): await assert_logging(app) +async def test_alert_rules(ops_test: OpsTest): + """Test the charm alert rules and the rules defined in the relation data bag.""" + app = ops_test.model.applications[CHARM_NAME] + alert_rules = get_alert_rules() + logger.info("found alert_rules: %s", alert_rules) + await assert_alert_rules(app, alert_rules) + + @retry(stop=stop_after_delay(600), wait=wait_fixed(10)) @pytest.mark.abort_on_fail async def test_pvcviewer_example(ops_test: OpsTest, lightkube_client: lightkube.Client):