# Patch summary (preamble text before the first "diff --git" header; not part of any hunk).
#
# resources/prometheus/prometheus-rules.yaml
#   * The recorded gRPC total rate is now computed from grpc_server_handled_total
#     instead of grpc_server_started_total, and the record is renamed from
#     central:grpc_server_started:total:rate10m to central:grpc_server_handled:total:rate10m.
#   * Adds three recording rules:
#       - central:incoming_requests:available:rate10m  (HTTP not-5xx rate + gRPC server-available-code rate)
#       - central:incoming_requests:total:rate10m      (HTTP total rate + gRPC handled total rate)
#       - central:error_rate10m                        (available / total, clamped to [0, 1];
#                                                       the "> 0" filter on the denominator drops
#                                                       series with no traffic before dividing)
#   * central:sli:error_rate now applies floor(central:error_rate10m + 0.35) instead of
#     repeating the ratio inline.
#
# resources/prometheus/unit_tests/RHACSCentralSLISLO.yaml
#   * Removes the grpc_server_started_total input series from the test fixtures.
#
# NOTE(review): central:error_rate10m is computed as available / total, i.e. it looks like a
# success ratio rather than an error ratio -- confirm the record name is intended.
diff --git a/resources/prometheus/prometheus-rules.yaml b/resources/prometheus/prometheus-rules.yaml index c5f455b5..096adc57 100644 --- a/resources/prometheus/prometheus-rules.yaml +++ b/resources/prometheus/prometheus-rules.yaml @@ -358,8 +358,8 @@ spec: - expr: | sum by (namespace, rhacs_instance_id, rhacs_org_id, rhacs_org_name, rhacs_cluster_name, rhacs_environment) - (rate(grpc_server_started_total{namespace=~"rhacs-.*", job="central", grpc_type="unary",grpc_service!="v1.PingService"}[10m])) - record: central:grpc_server_started:total:rate10m + (rate(grpc_server_handled_total{namespace=~"rhacs-.*", job="central", grpc_type="unary",grpc_service!="v1.PingService"}[10m])) + record: central:grpc_server_handled:total:rate10m # HTTP @@ -383,6 +383,25 @@ spec: ) record: central:http_incoming_requests:total:rate10m + - expr: | + central:http_incoming_requests:not_5xx:rate10m + + on (namespace) group_left(rhacs_instance_id) central:grpc_server_handled:server_available_code:rate10m + record: central:incoming_requests:available:rate10m + + - expr: | + central:http_incoming_requests:total:rate10m + + on (namespace) group_left(rhacs_instance_id) central:grpc_server_handled:total:rate10m + record: central:incoming_requests:total:rate10m + + - expr: | + clamp ( + central:incoming_requests:available:rate10m + / + (central:incoming_requests:total:rate10m > 0), + 0, 1 + ) + record: central:error_rate10m + # This is a time series of 0s (down) and 1s (up). # Success rate above 65% is floored to 1. # Success rate below 65% is floored to 0. @@ -394,15 +413,7 @@ spec: # reference in dashboards. 
- expr: | sum by (namespace, rhacs_instance_id) ( - floor( - ( - central:http_incoming_requests:not_5xx:rate10m - + on (namespace) group_left(rhacs_instance_id) central:grpc_server_handled:server_available_code:rate10m - ) / (( - central:http_incoming_requests:total:rate10m - + on (namespace) group_left(rhacs_instance_id) central:grpc_server_started:total:rate10m - ) > 0) + 0.35 - ) + floor (central:error_rate10m + 0.35) ) or on (namespace, rhacs_instance_id) central:sli:pod_ready record: central:sli:error_rate diff --git a/resources/prometheus/unit_tests/RHACSCentralSLISLO.yaml b/resources/prometheus/unit_tests/RHACSCentralSLISLO.yaml index e9199f5f..8933a82d 100644 --- a/resources/prometheus/unit_tests/RHACSCentralSLISLO.yaml +++ b/resources/prometheus/unit_tests/RHACSCentralSLISLO.yaml @@ -16,8 +16,6 @@ tests: values: "1x1000" - series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="OK", namespace="rhacs-aaaabbbbccccddddeeee", rhacs_instance_id="aaaabbbbccccddddeeee"} values: "1x1000" - - series: grpc_server_started_total{job="central", grpc_type="unary", namespace="rhacs-aaaabbbbccccddddeeee", rhacs_instance_id="aaaabbbbccccddddeeee"} - values: "1+0x1000" - series: haproxy_backend_http_responses_total{job="router_internal_default", code="2xx", exported_namespace="rhacs-aaaabbbbccccddddeeee"} values: "1x1000" - series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="Unavailable", namespace="rhacs-aaaabbbbccccddddeeee", rhacs_instance_id="aaaabbbbccccddddeeee"} @@ -35,8 +33,6 @@ tests: values: "0x1000" - series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="OK", namespace="rhacs-aaaabbbbccccddddeeee", rhacs_instance_id="aaaabbbbccccddddeeee"} values: "1x1000" - - series: grpc_server_started_total{job="central", grpc_type="unary", namespace="rhacs-aaaabbbbccccddddeeee", rhacs_instance_id="aaaabbbbccccddddeeee"} - values: "1+0x1000" - series: 
haproxy_backend_http_responses_total{job="router_internal_default", code="2xx", exported_namespace="rhacs-aaaabbbbccccddddeeee"} values: "1x1000" - series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="Unavailable", namespace="rhacs-aaaabbbbccccddddeeee", rhacs_instance_id="aaaabbbbccccddddeeee"} @@ -64,8 +60,6 @@ tests: values: "1+0x260 0+0x40 1+0x100" - series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="OK", namespace="rhacs-aaaabbbbccccddddeeee", rhacs_instance_id="aaaabbbbccccddddeeee"} values: "1+1x400" - - series: grpc_server_started_total{job="central", grpc_type="unary", namespace="rhacs-aaaabbbbccccddddeeee", rhacs_instance_id="aaaabbbbccccddddeeee"} - values: "1+1x360 362+2x40" - series: haproxy_backend_http_responses_total{job="router_internal_default", code="2xx", exported_namespace="rhacs-aaaabbbbccccddddeeee"} values: "4+4x400" # 200m downtime due Unavailable or 5xx responses. Out of 28 days, this equates to ~0.5% downtime. @@ -98,8 +92,6 @@ tests: values: "1+0x265 0+0x35 1+0x100" - series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="OK", namespace="rhacs-ffffgggghhhhiiiijjjj", rhacs_instance_id="ffffgggghhhhiiiijjjj"} values: "1+1x400" - - series: grpc_server_started_total{job="central", grpc_type="unary", namespace="rhacs-ffffgggghhhhiiiijjjj", rhacs_instance_id="ffffgggghhhhiiiijjjj"} - values: "1+1x365 367+2x35" - series: haproxy_backend_http_responses_total{job="router_internal_default", code="2xx", exported_namespace="rhacs-ffffgggghhhhiiiijjjj"} values: "4+4x400" # 175m downtime due Unavailable or 5xx responses. Out of 28 days, this equates to ~0.43% downtime. 
@@ -132,8 +124,6 @@ tests: values: "1+0x279 0+0x21 1+0x100" - series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="OK", namespace="rhacs-kkkkllllmmmmnnnnoooo", rhacs_instance_id="kkkkllllmmmmnnnnoooo"} values: "1+1x400" - - series: grpc_server_started_total{job="central", grpc_type="unary", namespace="rhacs-kkkkllllmmmmnnnnoooo", rhacs_instance_id="kkkkllllmmmmnnnnoooo"} - values: "1+1x379 381+2x21" - series: haproxy_backend_http_responses_total{job="router_internal_default", code="2xx", exported_namespace="rhacs-kkkkllllmmmmnnnnoooo"} values: "4+4x400" # 105m downtime due Unavailable or 5xx responses. Out of 28 days, this equates to ~0.25% downtime. @@ -166,8 +156,6 @@ tests: values: "1+0x6 0+0x6" - series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="OK", namespace="rhacs-ppppqqqqrrrrsssstttt"} values: "5+5x14" - - series: grpc_server_started_total{job="central", grpc_type="unary", namespace="rhacs-ppppqqqqrrrrsssstttt"} - values: "5+5x14" - series: haproxy_backend_http_responses_total{job="router_internal_default", code="2xx", exported_namespace="rhacs-ppppqqqqrrrrsssstttt"} values: "5+5x14" alert_rule_test: