Skip to content

Commit

Permalink
Factor out failure rate query templating
Browse files Browse the repository at this point in the history
Signed-off-by: György Krajcsovits <[email protected]>
  • Loading branch information
krajorama committed Apr 4, 2024
1 parent 10013fc commit 3042051
Show file tree
Hide file tree
Showing 4 changed files with 34 additions and 89 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -9461,7 +9461,7 @@ data:
"uid": "$datasource"
},
"exemplar": false,
"expr": "(\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval])))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval])))\n < ($latency_metrics * -Inf)",
"expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval])))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(histogram_count(rate(cortex_request_duration_seconds{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval])))\n < ($latency_metrics * -Inf)",
"instant": false,
"legendFormat": "Reads",
"range": true
Expand All @@ -9471,7 +9471,7 @@ data:
"uid": "$datasource"
},
"exemplar": false,
"expr": "(\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))\n < ($latency_metrics * +Inf)",
"expr": "(\n # gRPC errors are not tracked as 5xx but \"error\".\n sum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\",status_code=~\"5.*|error\"}[$__rate_interval]))\n or\n # Handle the case no failure has been tracked yet.\n vector(0)\n)\n/\nsum(rate(cortex_request_duration_seconds_count{cluster=~\"$cluster\", job=~\"($namespace)/((query-frontend.*|cortex|mimir|mimir-read.*))\", route=~\"(prometheus|api_prom)_api_v1_.+\"}[$__rate_interval]))\n < ($latency_metrics * +Inf)",
"instant": false,
"legendFormat": "Reads",
"range": true
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

111 changes: 28 additions & 83 deletions operations/mimir-mixin/dashboards/dashboard-queries.libsonnet
Original file line number Diff line number Diff line change
@@ -1,6 +1,30 @@
local utils = import 'mixin-utils/utils.libsonnet';

{
// Builds the failure rate query (failed requests divided by all requests) for the given
// metric, once for classic histograms and once for native histograms.
// `metric` is the metric name and `selector` is the label matcher list, both passed as strings.
// Returns an object with two string fields: `classic` and `native`.
nativeClassicFailureRate(metric, selector):: {
  // Count-rate queries for failed requests only, and for all requests.
  // Failed requests are those whose status_code matches 5.* or the literal "error".
  local failedRate = utils.nativeClassicHistogramCountRate(metric, selector + ',status_code=~"5.*|error"'),
  local totalRate = utils.nativeClassicHistogramCountRate(metric, selector),

  // Shared PromQL skeleton; only the two count sub-queries differ between the flavours.
  local queryTemplate = |||
    (
      # gRPC errors are not tracked as 5xx but "error".
      sum(%(countFailQuery)s)
      or
      # Handle the case no failure has been tracked yet.
      vector(0)
    )
    /
    sum(%(countQuery)s)
  |||,

  // Fills the skeleton with the given failed/total count sub-queries.
  local render(failQuery, allQuery) = queryTemplate % {
    countFailQuery: failQuery,
    countQuery: allQuery,
  },

  classic: render(failedRate.classic, totalRate.classic),
  native: render(failedRate.native, totalRate.native),
},

// This object contains common queries used in the Mimir dashboards.
// These queries are NOT intended to be configurable or overriddeable via jsonnet,
// but they're defined in a common place just to share them between different dashboards.
Expand Down Expand Up @@ -37,50 +61,10 @@ local utils = import 'mixin-utils/utils.libsonnet';
readRequestsPerSecondSelector: '%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"' % variables,

// Write failures rate as percentage of total requests.
writeFailuresRate:: {
local template = |||
(
# gRPC errors are not tracked as 5xx but "error".
sum(%(countFailQuery)s)
or
# Handle the case no failure has been tracked yet.
vector(0)
)
/
sum(%(countQuery)s)
|||,
classic: template % {
countFailQuery: utils.nativeClassicHistogramCountRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector + ',status_code=~"5.*|error"').classic,
countQuery: utils.nativeClassicHistogramCountRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector).classic,
},
native: template % {
countFailQuery: utils.nativeHistogramCountRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector + ',status_code=~"5.*|error"').native,
countQuery: utils.nativeHistogramCountRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector).native,
},
},
writeFailuresRate: $.nativeClassicFailureRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector),

// Read failures rate as percentage of total requests.
readFailuresRate:: {
local template = |||
(
# gRPC errors are not tracked as 5xx but "error".
sum(%(countFailQuery)s)
or
# Handle the case no failure has been tracked yet.
vector(0)
)
/
sum(%(countQuery)s)
|||,
classic: template % {
countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector + ',status_code=~"5.*|error"').classic,
countQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector).classic,
},
native: template % {
countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector + ',status_code=~"5.*|error"').native,
countQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector).native,
},
},
readFailuresRate: $.nativeClassicFailureRate(p.requestsPerSecondMetric, p.readRequestsPerSecondSelector),
},

distributor: {
Expand All @@ -94,27 +78,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
exemplarsPerSecond: 'sum(%(groupPrefixJobs)s:cortex_distributor_received_exemplars:rate5m{%(distributorMatcher)s})' % variables,

// Write failures rate as percentage of total requests.
writeFailuresRate:: {
local template = |||
(
# gRPC errors are not tracked as 5xx but "error".
sum(%(countFailQuery)s)
or
# Handle the case no failure has been tracked yet.
vector(0)
)
/
sum(%(countQuery)s)
|||,
classic: template % {
countFailQuery: utils.nativeClassicHistogramCountRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector + ',status_code=~"5.*|error"').classic,
countQuery: utils.nativeClassicHistogramCountRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector).classic,
},
native: template % {
countFailQuery: utils.nativeClassicHistogramCountRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector + ',status_code=~"5.*|error"').native,
countQuery: utils.nativeClassicHistogramCountRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector).native,
},
},
writeFailuresRate: $.nativeClassicFailureRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector),
},

query_frontend: {
Expand Down Expand Up @@ -174,26 +138,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
labelValuesCardinalityQueriesPerSecond: queryPerSecond('labelValuesCardinality'),

// Read failures rate as percentage of total requests.
readFailuresRate:: {
local template = |||
(
sum(%(countFailQuery)s)
or
# Handle the case no failure has been tracked yet.
vector(0)
)
/
sum(%(countQuery)s)
|||,
classic: template % {
countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector + ',status_code=~"5.*|error"').classic,
countQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector).classic,
},
native: template % {
countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector + ',status_code=~"5.*|error"').native,
countQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector).native,
},
},
readFailuresRate: $.nativeClassicFailureRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector),
},

ruler: {
Expand Down

0 comments on commit 3042051

Please sign in to comment.