Skip to content

Commit

Permalink
dashboards: overview: qps and latency w/ cortex_request_duration_seco…
Browse files Browse the repository at this point in the history
…nds (#7674)

* dashboards: overview: use native histograms in status

Allow switching between basing status on classic or native version
of cortex_request_duration_seconds.

Related to #7154
Followup to #7627

Signed-off-by: György Krajcsovits <[email protected]>
  • Loading branch information
krajorama authored Jun 24, 2024
1 parent 1a5af4a commit 0412301
Show file tree
Hide file tree
Showing 9 changed files with 290 additions and 75 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@

### Mixin

* [ENHANCEMENT] Dashboards: allow switching between using classic or native histograms in dashboards. #7674
* Overview dashboard: status, read/write latency and queries/ingestion per sec panels, `cortex_request_duration_seconds` metric.

### Jsonnet

* [FEATURE] Add support for automatically deleting compactor, store-gateway and read-write mode backend PVCs when the corresponding StatefulSet is scaled down. #8382
Expand Down
3 changes: 3 additions & 0 deletions operations/helm/charts/mimir-distributed/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ Entries should include a reference to the Pull Request that introduced the chang

## main / unreleased

* [ENHANCEMENT] Dashboards: allow switching between using classic or native histograms in dashboards. #7674
* Overview dashboard: status, read/write latency and queries/ingestion per sec panels, `cortex_request_duration_seconds` metric.

## 5.4.0-rc.0

* [FEATURE] Add support for a dedicated query path for the ruler. This allows for the isolation of ruler and user query paths. Enable it via `ruler.remoteEvaluationDedicatedQueryPath: true`. #7964
Expand Down

Large diffs are not rendered by default.

Large diffs are not rendered by default.

91 changes: 75 additions & 16 deletions operations/mimir-mixin-compiled/dashboards/mimir-overview.json

Large diffs are not rendered by default.

23 changes: 12 additions & 11 deletions operations/mimir-mixin/dashboards/dashboard-queries.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
{
// Helper function to produce failure rate in percentage queries for native and classic histograms.
// Takes a metric name and a selector as strings and returns a dictionary with classic and native queries.
nativeClassicFailureRate(metric, selector):: {
ncHistogramFailureRate(metric, selector):: {
local template = |||
(
# gRPC errors are not tracked as 5xx but "error".
Expand All @@ -16,12 +16,12 @@ local utils = import 'mixin-utils/utils.libsonnet';
sum(%(countQuery)s)
|||,
classic: template % {
countFailQuery: utils.nativeClassicHistogramCountRate(metric, selector + ',status_code=~"5.*|error"').classic,
countQuery: utils.nativeClassicHistogramCountRate(metric, selector).classic,
countFailQuery: utils.ncHistogramCountRate(metric, selector + ',status_code=~"5.*|error"').classic,
countQuery: utils.ncHistogramCountRate(metric, selector).classic,
},
native: template % {
countFailQuery: utils.nativeClassicHistogramCountRate(metric, selector + ',status_code=~"5.*|error"').native,
countQuery: utils.nativeClassicHistogramCountRate(metric, selector).native,
countFailQuery: utils.ncHistogramCountRate(metric, selector + ',status_code=~"5.*|error"').native,
countQuery: utils.ncHistogramCountRate(metric, selector).native,
},
},

Expand Down Expand Up @@ -62,10 +62,10 @@ local utils = import 'mixin-utils/utils.libsonnet';
readRequestsPerSecondSelector: '%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"' % variables,

// Write failures rate as percentage of total requests.
writeFailuresRate: $.nativeClassicFailureRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector),
writeFailuresRate: $.ncHistogramFailureRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector),

// Read failures rate as percentage of total requests.
readFailuresRate: $.nativeClassicFailureRate(p.requestsPerSecondMetric, p.readRequestsPerSecondSelector),
readFailuresRate: $.ncHistogramFailureRate(p.requestsPerSecondMetric, p.readRequestsPerSecondSelector),
},

distributor: {
Expand All @@ -79,7 +79,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
exemplarsPerSecond: 'sum(%(groupPrefixJobs)s:cortex_distributor_received_exemplars:rate5m{%(distributorMatcher)s})' % variables,

// Write failures rate as percentage of total requests.
writeFailuresRate: $.nativeClassicFailureRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector),
writeFailuresRate: $.ncHistogramFailureRate(p.requestsPerSecondMetric, p.writeRequestsPerSecondSelector),
},

query_frontend: {
Expand Down Expand Up @@ -121,8 +121,9 @@ local utils = import 'mixin-utils/utils.libsonnet';
}
for r in overviewRoutes
],
overviewRoutesPerSecond: 'sum by (route) (rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"%(overviewRoutesRegex)s"}[$__rate_interval]))' % (variables { overviewRoutesRegex: overviewRoutesRegex }),
nonOverviewRoutesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_.*",route!~"%(overviewRoutesRegex)s"}[$__rate_interval]))' % (variables { overviewRoutesRegex: overviewRoutesRegex }),
overviewRoutesPerSecondMetric: 'cortex_request_duration_seconds',
overviewRoutesPerSecondSelector: '%(queryFrontendMatcher)s,route=~"%(overviewRoutesRegex)s"' % (variables { overviewRoutesRegex: overviewRoutesRegex }),
nonOverviewRoutesPerSecondSelector: '%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_.*",route!~"%(overviewRoutesRegex)s"' % (variables { overviewRoutesRegex: overviewRoutesRegex }),

local queryPerSecond(name) = 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)%(route)s"}[$__rate_interval]))' %
(variables { route: std.filter(function(r) r.name == name, overviewRoutes)[0].routeLabel }),
Expand All @@ -139,7 +140,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
labelValuesCardinalityQueriesPerSecond: queryPerSecond('labelValuesCardinality'),

// Read failures rate as percentage of total requests.
readFailuresRate: $.nativeClassicFailureRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector),
readFailuresRate: $.ncHistogramFailureRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector),
},

ruler: {
Expand Down
12 changes: 12 additions & 0 deletions operations/mimir-mixin/dashboards/dashboard-utils.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -1261,6 +1261,18 @@ local utils = import 'mixin-utils/utils.libsonnet';
},
},

// Builds a panel by delegating to utils.latencyRecordingRulePanelNativeHistogram and then
// layering dashboard-specific field overrides on top of the result.
// All five arguments are forwarded unchanged, in the same order, to the utils helper;
// extra_selectors and sum_by default to [] and multiplier defaults to '1e3'.
latencyRecordingRulePanelNativeHistogram(metric, selectors, extra_selectors=[], multiplier='1e3', sum_by=[])::
utils.latencyRecordingRulePanelNativeHistogram(metric, selectors, extra_selectors, multiplier, sum_by) + {
// Hide yaxes from JSON Model; it's not supported by timeseriesPanel.
yaxes:: super.yaxes,
// `+:` merges into the fieldConfig.defaults coming from the utils helper instead of replacing them.
fieldConfig+: {
defaults+: {
// NOTE(review): 'ms' presumably pairs with the default multiplier of '1e3' (seconds -> milliseconds);
// the unit is fixed here even when a caller passes a different multiplier — confirm this is intended.
unit: 'ms',
// Lower bound of the value axis is pinned to zero.
min: 0,
},
},
},

filterNodeDiskContainer(containerName)::
|||
ignoring(%(instanceLabel)s) group_right() (
Expand Down
43 changes: 31 additions & 12 deletions operations/mimir-mixin/dashboards/overview.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -128,19 +128,23 @@ local filename = 'mimir-overview.json';
)
.addPanel(
$.timeseriesPanel(std.stripChars('Write requests / sec %(gatewayEnabledPanelTitleSuffix)s' % helpers, ' ')) +
$.qpsPanel(
$.qpsPanelNativeHistogram(
if $._config.gateway_enabled then
$.queries.gateway.writeRequestsPerSecond
$.queries.gateway.requestsPerSecondMetric
else
$.queries.distributor.writeRequestsPerSecond
$.queries.distributor.requestsPerSecondMetric,
if $._config.gateway_enabled then
$.queries.gateway.writeRequestsPerSecondSelector
else
$.queries.distributor.writeRequestsPerSecondSelector
)
)
.addPanel(
$.timeseriesPanel(std.stripChars('Write latency %(gatewayEnabledPanelTitleSuffix)s' % helpers, ' ')) + (
if $._config.gateway_enabled then
$.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', $.queries.write_http_routes_regex)])
$.latencyRecordingRulePanelNativeHistogram('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', $.queries.write_http_routes_regex)])
else
$.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/distributor.Distributor/Push|/httpgrpc.*|%s' % $.queries.write_http_routes_regex)])
$.latencyRecordingRulePanelNativeHistogram('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/distributor.Distributor/Push|/httpgrpc.*|%s' % $.queries.write_http_routes_regex)])
)
)
.addPanel(
Expand Down Expand Up @@ -172,32 +176,47 @@ local filename = 'mimir-overview.json';
)
.addPanel(
$.timeseriesPanel(std.stripChars('Read requests / sec %(gatewayEnabledPanelTitleSuffix)s' % helpers, ' ')) +
$.qpsPanel(
$.qpsPanelNativeHistogram(
if $._config.gateway_enabled then
$.queries.gateway.readRequestsPerSecondMetric
else
$.queries.query_frontend.readRequestsPerSecondMetric,
if $._config.gateway_enabled then
$.queries.gateway.readRequestsPerSecond
$.queries.gateway.readRequestsPerSecondSelector
else
$.queries.query_frontend.readRequestsPerSecond
$.queries.query_frontend.readRequestsPerSecondSelector
)
)
.addPanel(
$.timeseriesPanel(std.stripChars('Read latency %(gatewayEnabledPanelTitleSuffix)s' % helpers, ' ')) + (
if $._config.gateway_enabled then
$.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', $.queries.read_http_routes_regex)])
$.latencyRecordingRulePanelNativeHistogram('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', $.queries.read_http_routes_regex)])
else
$.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', $.queries.read_http_routes_regex)])
$.latencyRecordingRulePanelNativeHistogram('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', $.queries.read_http_routes_regex)])
)
)
.addPanel(
$.timeseriesPanel('Queries / sec') +
{
targets: [
{
expr: $.queries.query_frontend.overviewRoutesPerSecond,
expr: utils.showClassicHistogramQuery(utils.ncHistogramSumBy(utils.ncHistogramCountRate($.queries.query_frontend.overviewRoutesPerSecondMetric, $.queries.query_frontend.overviewRoutesPerSecondSelector), ['route'])),
format: 'time_series',
legendLink: null,
},
{
expr: $.queries.query_frontend.nonOverviewRoutesPerSecond,
expr: utils.showNativeHistogramQuery(utils.ncHistogramSumBy(utils.ncHistogramCountRate($.queries.query_frontend.overviewRoutesPerSecondMetric, $.queries.query_frontend.overviewRoutesPerSecondSelector), ['route'])),
format: 'time_series',
legendLink: null,
},
{
expr: utils.showClassicHistogramQuery(utils.ncHistogramSumBy(utils.ncHistogramCountRate($.queries.query_frontend.overviewRoutesPerSecondMetric, $.queries.query_frontend.nonOverviewRoutesPerSecondSelector))),
format: 'time_series',
legendFormat: 'other',
legendLink: null,
},
{
expr: utils.showNativeHistogramQuery(utils.ncHistogramSumBy(utils.ncHistogramCountRate($.queries.query_frontend.overviewRoutesPerSecondMetric, $.queries.query_frontend.nonOverviewRoutesPerSecondSelector))),
format: 'time_series',
legendFormat: 'other',
legendLink: null,
Expand Down
8 changes: 4 additions & 4 deletions operations/mimir-mixin/jsonnetfile.lock.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@
"subdir": "grafana-builder"
}
},
"version": "653836fa707a591b36fe490be350b1540e4f14ce",
"sum": "+z5VY+bPBNqXcmNAV8xbJcbsRA+pro1R3IM7aIY8OlU="
"version": "bf12954197422f36f0803ee217e378ad055f3837",
"sum": "EEPwMLfUIJT9iEUI/gCW9x6PxWoTBPSJOfabTF4rp1M="
},
{
"source": {
Expand All @@ -18,8 +18,8 @@
"subdir": "mixin-utils"
}
},
"version": "653836fa707a591b36fe490be350b1540e4f14ce",
"sum": "wi1o6t5nUZVcsatqMdGLOWIv1HxNnlaE84oRE0Cl0ec="
"version": "bf12954197422f36f0803ee217e378ad055f3837",
"sum": "Qg992ZB0jkrS+YLq0Q7RV1fSHa8+hQT0jbpTyCGE2NI="
}
],
"legacyImports": false
Expand Down

0 comments on commit 0412301

Please sign in to comment.