Skip to content

Commit

Permalink
fix(sli): clamp error rate at 1 (#127)
Browse files Browse the repository at this point in the history
* fix(sli): clamp error rate at 1

* use grpc_server_handled_total consistently
  • Loading branch information
stehessel authored Aug 16, 2023
1 parent 9ffae66 commit 46a6e74
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 23 deletions.
33 changes: 22 additions & 11 deletions resources/prometheus/prometheus-rules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -358,8 +358,8 @@ spec:
- expr: |
sum by (namespace, rhacs_instance_id, rhacs_org_id, rhacs_org_name, rhacs_cluster_name, rhacs_environment)
(rate(grpc_server_started_total{namespace=~"rhacs-.*", job="central", grpc_type="unary",grpc_service!="v1.PingService"}[10m]))
record: central:grpc_server_started:total:rate10m
(rate(grpc_server_handled_total{namespace=~"rhacs-.*", job="central", grpc_type="unary",grpc_service!="v1.PingService"}[10m]))
record: central:grpc_server_handled:total:rate10m
# HTTP

Expand All @@ -383,6 +383,25 @@ spec:
)
record: central:http_incoming_requests:total:rate10m
- expr: |
central:http_incoming_requests:not_5xx:rate10m
+ on (namespace) group_left(rhacs_instance_id) central:grpc_server_handled:server_available_code:rate10m
record: central:incoming_requests:available:rate10m
- expr: |
central:http_incoming_requests:total:rate10m
+ on (namespace) group_left(rhacs_instance_id) central:grpc_server_handled:total:rate10m
record: central:incoming_requests:total:rate10m
- expr: |
clamp (
central:incoming_requests:available:rate10m
/
(central:incoming_requests:total:rate10m > 0),
0, 1
)
record: central:error_rate10m
# This is a time series of 0s (down) and 1s (up).
# Success rate at or above 65% is floored to 1.
# Success rate below 65% is floored to 0.
Expand All @@ -394,15 +413,7 @@ spec:
# reference in dashboards.
- expr: |
sum by (namespace, rhacs_instance_id) (
floor(
(
central:http_incoming_requests:not_5xx:rate10m
+ on (namespace) group_left(rhacs_instance_id) central:grpc_server_handled:server_available_code:rate10m
) / ((
central:http_incoming_requests:total:rate10m
+ on (namespace) group_left(rhacs_instance_id) central:grpc_server_started:total:rate10m
) > 0) + 0.35
)
floor (central:error_rate10m + 0.35)
)
or on (namespace, rhacs_instance_id) central:sli:pod_ready
record: central:sli:error_rate
Expand Down
12 changes: 0 additions & 12 deletions resources/prometheus/unit_tests/RHACSCentralSLISLO.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,6 @@ tests:
values: "1x1000"
- series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="OK", namespace="rhacs-aaaabbbbccccddddeeee", rhacs_instance_id="aaaabbbbccccddddeeee"}
values: "1x1000"
- series: grpc_server_started_total{job="central", grpc_type="unary", namespace="rhacs-aaaabbbbccccddddeeee", rhacs_instance_id="aaaabbbbccccddddeeee"}
values: "1+0x1000"
- series: haproxy_backend_http_responses_total{job="router_internal_default", code="2xx", exported_namespace="rhacs-aaaabbbbccccddddeeee"}
values: "1x1000"
- series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="Unavailable", namespace="rhacs-aaaabbbbccccddddeeee", rhacs_instance_id="aaaabbbbccccddddeeee"}
Expand All @@ -35,8 +33,6 @@ tests:
values: "0x1000"
- series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="OK", namespace="rhacs-aaaabbbbccccddddeeee", rhacs_instance_id="aaaabbbbccccddddeeee"}
values: "1x1000"
- series: grpc_server_started_total{job="central", grpc_type="unary", namespace="rhacs-aaaabbbbccccddddeeee", rhacs_instance_id="aaaabbbbccccddddeeee"}
values: "1+0x1000"
- series: haproxy_backend_http_responses_total{job="router_internal_default", code="2xx", exported_namespace="rhacs-aaaabbbbccccddddeeee"}
values: "1x1000"
- series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="Unavailable", namespace="rhacs-aaaabbbbccccddddeeee", rhacs_instance_id="aaaabbbbccccddddeeee"}
Expand Down Expand Up @@ -64,8 +60,6 @@ tests:
values: "1+0x260 0+0x40 1+0x100"
- series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="OK", namespace="rhacs-aaaabbbbccccddddeeee", rhacs_instance_id="aaaabbbbccccddddeeee"}
values: "1+1x400"
- series: grpc_server_started_total{job="central", grpc_type="unary", namespace="rhacs-aaaabbbbccccddddeeee", rhacs_instance_id="aaaabbbbccccddddeeee"}
values: "1+1x360 362+2x40"
- series: haproxy_backend_http_responses_total{job="router_internal_default", code="2xx", exported_namespace="rhacs-aaaabbbbccccddddeeee"}
values: "4+4x400"
# 200m downtime due to Unavailable or 5xx responses. Out of 28 days, this equates to ~0.5% downtime.
Expand Down Expand Up @@ -98,8 +92,6 @@ tests:
values: "1+0x265 0+0x35 1+0x100"
- series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="OK", namespace="rhacs-ffffgggghhhhiiiijjjj", rhacs_instance_id="ffffgggghhhhiiiijjjj"}
values: "1+1x400"
- series: grpc_server_started_total{job="central", grpc_type="unary", namespace="rhacs-ffffgggghhhhiiiijjjj", rhacs_instance_id="ffffgggghhhhiiiijjjj"}
values: "1+1x365 367+2x35"
- series: haproxy_backend_http_responses_total{job="router_internal_default", code="2xx", exported_namespace="rhacs-ffffgggghhhhiiiijjjj"}
values: "4+4x400"
# 175m downtime due to Unavailable or 5xx responses. Out of 28 days, this equates to ~0.43% downtime.
Expand Down Expand Up @@ -132,8 +124,6 @@ tests:
values: "1+0x279 0+0x21 1+0x100"
- series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="OK", namespace="rhacs-kkkkllllmmmmnnnnoooo", rhacs_instance_id="kkkkllllmmmmnnnnoooo"}
values: "1+1x400"
- series: grpc_server_started_total{job="central", grpc_type="unary", namespace="rhacs-kkkkllllmmmmnnnnoooo", rhacs_instance_id="kkkkllllmmmmnnnnoooo"}
values: "1+1x379 381+2x21"
- series: haproxy_backend_http_responses_total{job="router_internal_default", code="2xx", exported_namespace="rhacs-kkkkllllmmmmnnnnoooo"}
values: "4+4x400"
# 105m downtime due to Unavailable or 5xx responses. Out of 28 days, this equates to ~0.25% downtime.
Expand Down Expand Up @@ -166,8 +156,6 @@ tests:
values: "1+0x6 0+0x6"
- series: grpc_server_handled_total{job="central", grpc_type="unary", grpc_code="OK", namespace="rhacs-ppppqqqqrrrrsssstttt"}
values: "5+5x14"
- series: grpc_server_started_total{job="central", grpc_type="unary", namespace="rhacs-ppppqqqqrrrrsssstttt"}
values: "5+5x14"
- series: haproxy_backend_http_responses_total{job="router_internal_default", code="2xx", exported_namespace="rhacs-ppppqqqqrrrrsssstttt"}
values: "5+5x14"
alert_rule_test:
Expand Down

0 comments on commit 46a6e74

Please sign in to comment.