Skip to content

Commit

Permalink
Rollout progress dashboard: allow using cortex_request_duration_secon…
Browse files Browse the repository at this point in the history
…ds native histogram

Signed-off-by: Yuri Nikolic <[email protected]>
  • Loading branch information
duricanikolic committed Jul 21, 2024
1 parent e243636 commit 2dabc48
Show file tree
Hide file tree
Showing 9 changed files with 573 additions and 105 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
* Overview dashboard: status, read/write latency and queries/ingestion per sec panels, `cortex_request_duration_seconds` metric. #7674 #8502
* Writes dashboard: `cortex_request_duration_seconds` metric. #8757
* Reads dashboard: `cortex_request_duration_seconds` metric. #8752
* Rollout progress dashboard: `cortex_request_duration_seconds` metric.
* [ENHANCEMENT] Alerts: `MimirRunningIngesterReceiveDelayTooHigh` alert has been tuned to be more reactive to high receive delay. #8538
* [ENHANCEMENT] Dashboards: improve end-to-end latency and strong read consistency panels when experimental ingest storage is enabled. #8543
* [ENHANCEMENT] Dashboards: Add panels for monitoring ingester autoscaling when not using ingest-storage. These panels are disabled by default, but can be enabled using the `autoscaling.ingester.enabled: true` config option. #8484
Expand Down
1 change: 1 addition & 0 deletions operations/helm/charts/mimir-distributed/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ Entries should include a reference to the Pull Request that introduced the chang
* Overview dashboard: status, read/write latency and queries/ingestion per sec panels, `cortex_request_duration_seconds` metric. #7674
* Writes dashboard: `cortex_request_duration_seconds` metric. #8757
* Reads dashboard: `cortex_request_duration_seconds` metric. #8752
* Rollout progress dashboard: `cortex_request_duration_seconds` metric.
* [ENHANCEMENT] Memcached: Update to Memcached 1.6.28 and memcached-exporter 0.14.4. #8557
* [ENHANCEMENT] Add missing fields in multiple topology spread constraints. #8533
* [ENHANCEMENT] Add support for setting the image pull secrets, node selectors, tolerations and topology spread constraints for the Grafana Agent pods used for metamonitoring. #8670
Expand Down

Large diffs are not rendered by default.

Large diffs are not rendered by default.

141 changes: 131 additions & 10 deletions operations/mimir-mixin-compiled/dashboards/mimir-rollout-progress.json

Large diffs are not rendered by default.

51 changes: 51 additions & 0 deletions operations/mimir-mixin/dashboards/dashboard-queries.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,57 @@ local utils = import 'mixin-utils/utils.libsonnet';
},
},

// Builds a ratio query: the summed request rate of the series matching
// `selectors + extra_selector`, divided by the summed request rate of the series
// matching `selectors` alone.
//
// Returns an object with two PromQL strings:
//   classic: uses rate() over the `<metric>_count` series.
//   native:  uses histogram_count(rate(...)) over `<metric>` itself.
//
// Parameters:
//   metric:         base metric name (without the `_count` suffix).
//   selectors:      matchers for the denominator; passed to $.toPrometheusSelector.
//   extra_selector: additional matchers, concatenated to `selectors` for the numerator.
//   rate_interval:  range placed inside the rate() brackets.
// NOTE(review): `selectors` and `extra_selector` are assumed to be arrays of matcher
// objects, since they are joined with `+` — confirm against $.toPrometheusSelector.
ncSumHistogramCountRate(metric, selectors, extra_selector, rate_interval='$__rate_interval')::
  // Both templates consume the same set of substitutions, so build it once.
  local args = {
    metric: metric,
    rateInterval: rate_interval,
    selectors: $.toPrometheusSelector(selectors),
    extendedSelectors: $.toPrometheusSelector(selectors + extra_selector),
  };
  local classicTemplate = 'sum(rate(%(metric)s_count%(extendedSelectors)s[%(rateInterval)s])) /\nsum(rate(%(metric)s_count%(selectors)s[%(rateInterval)s]))';
  local nativeTemplate = 'sum(histogram_count(rate(%(metric)s%(extendedSelectors)s[%(rateInterval)s]))) /\nsum(histogram_count(rate(%(metric)s%(selectors)s[%(rateInterval)s])))';
  {
    classic: classicTemplate % args,
    native: nativeTemplate % args,
  },

// Builds a query comparing the averaged latency quantile now against the same
// quantile `offset` ago, expressed as `1 - (past / current)`.
//
// Returns an object with two PromQL strings:
//   classic: reads the `<labels>:<metric>_bucket:sum_rate` series, aggregated by `le`.
//   native:  reads the `<labels>:<metric>:sum_rate` series.
// where `<labels>` is the `label` field of every entry of `selectors`, joined with '_'.
//
// Parameters:
//   quantile:      quantile passed to histogram_quantile(), e.g. '0.99'.
//   metric:        base metric name.
//   selectors:     array of matchers (each must expose a `label` field); also
//                  rendered through $.toPrometheusSelector.
//   offset:        PromQL duration used in the `offset` modifier, e.g. '24h'.
//   rate_interval: range over which avg_over_time() averages. A bare duration
//                  ('1h', '$__rate_interval') is turned into a subquery range by
//                  appending ':'; a value already containing ':' is used as given.
ncAvgHistogramQuantile(quantile, metric, selectors, offset, rate_interval='$__rate_interval')::
  local labels = std.join('_', [matcher.label for matcher in selectors]);
  local metricStr = '%(labels)s:%(metric)s' % { labels: labels, metric: metric };
  // avg_over_time() wraps the result of histogram_quantile(), not a plain series
  // selector, so PromQL only accepts a subquery range ("<range>:<step>") in the
  // brackets. Without this, the default value would generate a query that fails to parse.
  local hasStepSeparator = std.length(std.findSubstr(':', rate_interval)) > 0;
  local subqueryRange = if hasStepSeparator then rate_interval else rate_interval + ':';
  local args = {
    quantile: quantile,
    metric: metricStr,
    selectors: $.toPrometheusSelector(selectors),
    offset: offset,
    rateInterval: subqueryRange,
  };
  {
    classic: |||
      1 - (
        avg_over_time(histogram_quantile(%(quantile)s, sum by (le) (%(metric)s_bucket:sum_rate%(selectors)s offset %(offset)s))[%(rateInterval)s])
        /
        avg_over_time(histogram_quantile(%(quantile)s, sum by (le) (%(metric)s_bucket:sum_rate%(selectors)s))[%(rateInterval)s])
      )
    ||| % args,
    native: |||
      1 - (
        avg_over_time(histogram_quantile(%(quantile)s, sum(%(metric)s:sum_rate%(selectors)s offset %(offset)s))[%(rateInterval)s])
        /
        avg_over_time(histogram_quantile(%(quantile)s, sum(%(metric)s:sum_rate%(selectors)s))[%(rateInterval)s])
      )
    ||| % args,
  },

// This object contains common queries used in the Mimir dashboards.
// These queries are NOT intended to be configurable or overridable via jsonnet,
// but they're defined in a common place just to share them between different dashboards.
Expand Down
27 changes: 27 additions & 0 deletions operations/mimir-mixin/dashboards/dashboard-utils.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -793,6 +793,33 @@ local utils = import 'mixin-utils/utils.libsonnet';
$.autoScalingFailuresPanel(componentName)
),

// Stat panel displaying the ratio produced by $.ncSumHistogramCountRate for the
// given metric and selectors, rendered with the 'percentunit' unit.
//
// The panel carries two queries built from the same ratio object: one wrapped by
// utils.showClassicHistogramQuery and one by utils.showNativeHistogramQuery.
// NOTE(review): presumably only one of the two is active at a time, depending on
// the dashboard's native-latency toggle — confirm in mixin-utils.
//
// Parameters:
//   metric, selectors, extra_selector: forwarded to $.ncSumHistogramCountRate.
//   thresholds: forwarded to $.newStatPanel.
ncSumCountRateStatPanel(metric, selectors, extra_selector, thresholds=[])::
  local ratio = $.ncSumHistogramCountRate(metric, selectors, extra_selector);
  $.newStatPanel(
    queries=[
      utils.showClassicHistogramQuery(ratio),
      utils.showNativeHistogramQuery(ratio),
    ],
    // One (empty) legend per query.
    legends=['', ''],
    unit='percentunit',
    thresholds=thresholds,
  ),

// Stat panel displaying a latency quantile, in seconds, for the given metric.
//
// The series name handed to utils.ncHistogramQuantile is `<labels>:<metric>`, where
// `<labels>` is the `label` field of every entry of `selectors`, joined with '_'.
// The call is made with from_recording=true.
// NOTE(review): presumably this makes the helper read the `:sum_rate` recording
// rules instead of the raw histogram — confirm in mixin-utils.
//
// The panel carries two queries built from the same quantile object: one wrapped by
// utils.showClassicHistogramQuery and one by utils.showNativeHistogramQuery.
//
// Parameters:
//   quantile:   quantile to display, e.g. '0.99'.
//   metric:     base metric name.
//   selectors:  array of matchers (each must expose a `label` field).
//   thresholds: forwarded to $.newStatPanel.
ncLatencyStatPanel(quantile, metric, selectors, thresholds=[])::
  local labels = std.join('_', [matcher.label for matcher in selectors]);
  local metricStr = '%(labels)s:%(metric)s' % { labels: labels, metric: metric };
  // Build the classic/native query pair once and share it between both wrappers,
  // mirroring ncSumCountRateStatPanel, instead of repeating the identical call.
  local ncQuery = utils.ncHistogramQuantile(quantile, metricStr, utils.toPrometheusSelectorNaked(selectors), from_recording=true);
  local queries = [
    utils.showClassicHistogramQuery(ncQuery),
    utils.showNativeHistogramQuery(ncQuery),
  ];
  $.newStatPanel(
    queries=queries,
    // One (empty) legend per query.
    legends=['', ''],
    unit='s',
    thresholds=thresholds,
  ),

newStatPanel(queries, legends='', unit='percentunit', decimals=1, thresholds=[], instant=false, novalue='')::
super.queryPanel(queries, legends) + {
type: 'stat',
Expand Down
171 changes: 98 additions & 73 deletions operations/mimir-mixin/dashboards/rollout-progress.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ local filename = 'mimir-rollout-progress.json';
(import 'dashboard-queries.libsonnet') {
local config = $.queries {
namespace_matcher: $.namespaceMatcher(),
per_cluster_label: $._config.per_cluster_label,
write_job_matcher: if $._config.gateway_enabled then $.jobMatcher($._config.job_names.gateway) else $.jobMatcher($._config.job_names.distributor),
read_job_matcher: if $._config.gateway_enabled then $.jobMatcher($._config.job_names.gateway) else $.jobMatcher($._config.job_names.query_frontend),
requests_per_second_metric: if $._config.gateway_enabled then $.queries.gateway.requestsPerSecondMetric else $.queries.distributor.requestsPerSecondMetric,
write_job_selector: if $._config.gateway_enabled then $.jobSelector($._config.job_names.gateway) else $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', $.queries.write_http_routes_regex)],
read_job_selector: if $._config.gateway_enabled then $.jobSelector($._config.job_names.gateway) else $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', $.queries.read_http_routes_regex)],
workload_label_replace_open:
std.repeat('label_replace(', std.length($._config.rollout_dashboard.workload_label_replaces)),
workload_label_replace_close:
Expand All @@ -21,7 +21,8 @@ local filename = 'mimir-rollout-progress.json';
[filename]:
assert std.md5(filename) == '7f0b5567d543a1698e695b530eb7f5de' : 'UID of the dashboard has changed, please update references to dashboard.';
($.dashboard('Rollout progress') + { uid: std.md5(filename) })
.addClusterSelectorTemplates(false) + {
.addClusterSelectorTemplates(false)
.addShowNativeLatencyVariable() + {
// This dashboard uses the new grid system in order to place panels (using gridPos).
// Because of this we can't use the mixin's addRow() and addPanel().
schemaVersion: 27,
Expand Down Expand Up @@ -162,49 +163,56 @@ local filename = 'mimir-rollout-progress.json';
// Writes
//
$.panel('Writes - 2xx') +
$.newStatPanel(|||
sum(rate(cortex_request_duration_seconds_count{%(write_job_matcher)s, route=~"%(write_http_routes_regex)s",status_code=~"2.+"}[$__rate_interval])) /
sum(rate(cortex_request_duration_seconds_count{%(write_job_matcher)s, route=~"%(write_http_routes_regex)s"}[$__rate_interval]))
||| % config, thresholds=[
{ color: 'green', value: null },
]) + {
$.ncSumCountRateStatPanel(
metric=config.requests_per_second_metric,
selectors=config.write_job_selector,
extra_selector=[utils.selector.re('status_code', '2.+')],
thresholds=[{ color: 'green', value: null }],
) + {
id: 2,
gridPos: { h: 4, w: 2, x: 10, y: 0 },
},

$.panel('Writes - 4xx') +
$.newStatPanel(|||
sum(rate(cortex_request_duration_seconds_count{%(write_job_matcher)s, route=~"%(write_http_routes_regex)s",status_code=~"4.+"}[$__rate_interval])) /
sum(rate(cortex_request_duration_seconds_count{%(write_job_matcher)s, route=~"%(write_http_routes_regex)s"}[$__rate_interval]))
||| % config, thresholds=[
{ color: 'green', value: null },
{ color: 'orange', value: 0.2 },
{ color: 'red', value: 0.5 },
]) + {
$.ncSumCountRateStatPanel(
metric=config.requests_per_second_metric,
selectors=config.write_job_selector,
extra_selector=[utils.selector.re('status_code', '4.+')],
thresholds=[
{ color: 'green', value: null },
{ color: 'orange', value: 0.2 },
{ color: 'red', value: 0.5 },
]
) + {
id: 3,
gridPos: { h: 4, w: 2, x: 12, y: 0 },
},

$.panel('Writes - 5xx') +
$.newStatPanel(|||
sum(rate(cortex_request_duration_seconds_count{%(write_job_matcher)s, route=~"%(write_http_routes_regex)s",status_code=~"5.+"}[$__rate_interval])) /
sum(rate(cortex_request_duration_seconds_count{%(write_job_matcher)s, route=~"%(write_http_routes_regex)s"}[$__rate_interval]))
||| % config, thresholds=[
{ color: 'green', value: null },
{ color: 'red', value: 0.01 },
]) + {
$.ncSumCountRateStatPanel(
metric=config.requests_per_second_metric,
selectors=config.write_job_selector,
extra_selector=[utils.selector.re('status_code', '5.+')],
thresholds=[
{ color: 'green', value: null },
{ color: 'red', value: 0.01 },
]
) + {
id: 4,
gridPos: { h: 4, w: 2, x: 14, y: 0 },
},

$.panel('Writes 99th latency') +
$.newStatPanel(|||
histogram_quantile(0.99, sum by (le) (%(per_cluster_label)s_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(write_job_matcher)s, route=~"%(write_http_routes_regex)s"}))
||| % config, unit='s', thresholds=[
{ color: 'green', value: null },
{ color: 'orange', value: 0.2 },
{ color: 'red', value: 0.5 },
]) + {
$.ncLatencyStatPanel(
quantile='0.99',
metric=config.requests_per_second_metric,
selectors=config.write_job_selector,
thresholds=[
{ color: 'green', value: null },
{ color: 'orange', value: 0.2 },
{ color: 'red', value: 0.5 },
]
) + {
id: 5,
gridPos: { h: 4, w: 8, x: 16, y: 0 },
},
Expand All @@ -213,49 +221,56 @@ local filename = 'mimir-rollout-progress.json';
// Reads
//
$.panel('Reads - 2xx') +
$.newStatPanel(|||
sum(rate(cortex_request_duration_seconds_count{%(read_job_matcher)s, route=~"%(read_http_routes_regex)s",status_code=~"2.+"}[$__rate_interval])) /
sum(rate(cortex_request_duration_seconds_count{%(read_job_matcher)s, route=~"%(read_http_routes_regex)s"}[$__rate_interval]))
||| % config, thresholds=[
{ color: 'green', value: null },
]) + {
$.ncSumCountRateStatPanel(
metric=config.requests_per_second_metric,
selectors=config.read_job_selector,
extra_selector=[utils.selector.re('status_code', '2.+')],
thresholds=[{ color: 'green', value: null }],
) + {
id: 6,
gridPos: { h: 4, w: 2, x: 10, y: 4 },
},

$.panel('Reads - 4xx') +
$.newStatPanel(|||
sum(rate(cortex_request_duration_seconds_count{%(read_job_matcher)s, route=~"%(read_http_routes_regex)s",status_code=~"4.+"}[$__rate_interval])) /
sum(rate(cortex_request_duration_seconds_count{%(read_job_matcher)s, route=~"%(read_http_routes_regex)s"}[$__rate_interval]))
||| % config, thresholds=[
{ color: 'green', value: null },
{ color: 'orange', value: 0.01 },
{ color: 'red', value: 0.05 },
]) + {
$.ncSumCountRateStatPanel(
metric=config.requests_per_second_metric,
selectors=config.read_job_selector,
extra_selector=[utils.selector.re('status_code', '4.+')],
thresholds=[
{ color: 'green', value: null },
{ color: 'orange', value: 0.01 },
{ color: 'red', value: 0.05 },
]
) + {
id: 7,
gridPos: { h: 4, w: 2, x: 12, y: 4 },
},

$.panel('Reads - 5xx') +
$.newStatPanel(|||
sum(rate(cortex_request_duration_seconds_count{%(read_job_matcher)s, route=~"%(read_http_routes_regex)s",status_code=~"5.+"}[$__rate_interval])) /
sum(rate(cortex_request_duration_seconds_count{%(read_job_matcher)s, route=~"%(read_http_routes_regex)s"}[$__rate_interval]))
||| % config, thresholds=[
{ color: 'green', value: null },
{ color: 'red', value: 0.01 },
]) + {
$.ncSumCountRateStatPanel(
metric=config.requests_per_second_metric,
selectors=config.read_job_selector,
extra_selector=[utils.selector.re('status_code', '5.+')],
thresholds=[
{ color: 'green', value: null },
{ color: 'red', value: 0.01 },
]
) + {
id: 8,
gridPos: { h: 4, w: 2, x: 14, y: 4 },
},

$.panel('Reads 99th latency') +
$.newStatPanel(|||
histogram_quantile(0.99, sum by (le) (%(per_cluster_label)s_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(read_job_matcher)s, route=~"%(read_http_routes_regex)s"}))
||| % config, unit='s', thresholds=[
{ color: 'green', value: null },
{ color: 'orange', value: 1 },
{ color: 'red', value: 2.5 },
]) + {
$.ncLatencyStatPanel(
quantile='0.99',
metric=config.requests_per_second_metric,
selectors=config.read_job_selector,
thresholds=[
{ color: 'green', value: null },
{ color: 'orange', value: 1 },
{ color: 'red', value: 2.5 },
]
) + {
id: 9,
gridPos: { h: 4, w: 8, x: 16, y: 4 },
},
Expand Down Expand Up @@ -354,19 +369,29 @@ local filename = 'mimir-rollout-progress.json';
// Performance comparison with 24h ago
//
$.timeseriesPanel('Latency vs 24h ago') +
$.queryPanel([|||
1 - (
avg_over_time(histogram_quantile(0.99, sum by (le) (%(per_cluster_label)s_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(write_job_matcher)s, route=~"%(write_http_routes_regex)s"} offset 24h))[1h:])
/
avg_over_time(histogram_quantile(0.99, sum by (le) (%(per_cluster_label)s_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(write_job_matcher)s, route=~"%(write_http_routes_regex)s"}))[1h:])
)
||| % config, |||
1 - (
avg_over_time(histogram_quantile(0.99, sum by (le) (%(per_cluster_label)s_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(read_job_matcher)s, route=~"%(read_http_routes_regex)s"} offset 24h))[1h:])
/
avg_over_time(histogram_quantile(0.99, sum by (le) (%(per_cluster_label)s_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(read_job_matcher)s, route=~"%(read_http_routes_regex)s"}))[1h:])
)
||| % config], ['writes', 'reads']) +
$.queryPanel(
local write = $.ncAvgHistogramQuantile(
quantile='0.99',
metric=config.requests_per_second_metric,
selectors=config.write_job_selector,
offset='24h',
rate_interval='1h:'
);
local read = $.ncAvgHistogramQuantile(
quantile='0.99',
metric=config.requests_per_second_metric,
selectors=config.read_job_selector,
offset='24h',
rate_interval='1h:'
);
[
utils.showClassicHistogramQuery(write),
utils.showNativeHistogramQuery(write),
utils.showClassicHistogramQuery(read),
utils.showNativeHistogramQuery(read),
],
['writes', 'writes', 'reads', 'reads']
) +
{
fieldConfig: {
defaults: {
Expand Down
4 changes: 2 additions & 2 deletions operations/mimir-mixin/jsonnetfile.lock.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
"subdir": "grafana-builder"
}
},
"version": "ea6f2601969aa12c02dbca761ce4316aff036af2",
"version": "1d877bb0651ef92176f651d0be473c06e372a8a0",
"sum": "udZaafkbKYMGodLqsFhEe+Oy/St2p0edrK7hiMPEey0="
},
{
Expand All @@ -18,7 +18,7 @@
"subdir": "mixin-utils"
}
},
"version": "ea6f2601969aa12c02dbca761ce4316aff036af2",
"version": "1d877bb0651ef92176f651d0be473c06e372a8a0",
"sum": "mzLmCv9n3ldLChVGPfyRJOVKoBw+dfK40vU9792aHIM="
}
],
Expand Down

0 comments on commit 2dabc48

Please sign in to comment.