diff --git a/src/prometheus_alert_rules/KubeflowPvcviewerOperatorServices.rules b/src/prometheus_alert_rules/KubeflowPvcviewerOperatorServices.rules new file mode 100644 index 0000000..c7d675d --- /dev/null +++ b/src/prometheus_alert_rules/KubeflowPvcviewerOperatorServices.rules @@ -0,0 +1,24 @@ +groups: +- name: KubeflowPvcviewerOperatorServices + rules: + - alert: KubeflowServiceDown + expr: up{} < 1 + for: 5m + labels: + severity: critical + annotations: + summary: "{{ $labels.juju_charm }} service is Down ({{ $labels.juju_model }}/{{ $labels.juju_unit }})" + description: | + One or more targets of {{ $labels.juju_charm }} charm are down on unit {{ $labels.juju_model }}/{{ $labels.juju_unit }}. + LABELS = {{ $labels }} + + - alert: KubeflowServiceIsNotStable + expr: avg_over_time(up{}[10m]) < 0.5 + for: 0m + labels: + severity: warning + annotations: + summary: "{{ $labels.juju_charm }} service is not stable ({{ $labels.juju_model }}/{{ $labels.juju_unit }})" + description: | + {{ $labels.juju_charm }} unit {{ $labels.juju_model }}/{{ $labels.juju_unit }} has been unreachable at least 50% of the time over the last 10 minutes. 
+ LABELS = {{ $labels }} diff --git a/tests/integration/test_charm.py b/tests/integration/test_charm.py index ef9fb20..510557c 100644 --- a/tests/integration/test_charm.py +++ b/tests/integration/test_charm.py @@ -11,9 +11,11 @@ import pytest import yaml from charmed_kubeflow_chisme.testing import ( + assert_alert_rules, assert_logging, assert_metrics_endpoint, deploy_and_assert_grafana_agent, + get_alert_rules, ) from lightkube import codecs from lightkube.generic_resource import create_namespaced_resource @@ -152,6 +154,14 @@ async def test_logging(ops_test: OpsTest): await assert_logging(app) +async def test_alert_rules(ops_test: OpsTest): + """Check the charm alert rules against the rules defined in the relation data bag.""" + app = ops_test.model.applications[CHARM_NAME] + alert_rules = get_alert_rules() + logger.info("found alert_rules: %s", alert_rules) + await assert_alert_rules(app, alert_rules) + + @retry(stop=stop_after_delay(600), wait=wait_fixed(10)) @pytest.mark.abort_on_fail async def test_pvcviewer_example(ops_test: OpsTest, lightkube_client: lightkube.Client):