Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

wip: backwards compatible dashboards with cortex_request_duration_seconds as native histogram #7377

Closed
wants to merge 16 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

Large diffs are not rendered by default.

1,556 changes: 12 additions & 1,544 deletions operations/mimir-mixin-compiled-baremetal/dashboards/mimir-writes.json

Large diffs are not rendered by default.

629 changes: 486 additions & 143 deletions operations/mimir-mixin-compiled/dashboards/mimir-overview.json

Large diffs are not rendered by default.

1,556 changes: 12 additions & 1,544 deletions operations/mimir-mixin-compiled/dashboards/mimir-writes.json

Large diffs are not rendered by default.

64 changes: 46 additions & 18 deletions operations/mimir-mixin/dashboards/dashboard-queries.libsonnet
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
local utils = import 'mixin-utils/utils.libsonnet';

{
// This object contains common queries used in the Mimir dashboards.
// These queries are NOT intended to be configurable or overridable via jsonnet,
Expand Down Expand Up @@ -25,55 +27,78 @@
query_http_routes_regex: '(prometheus|api_prom)_api_v1_query(_range)?',

gateway: {
writeRequestsPerSecond: 'cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s"}' % variables,
local p = self,
//writeRequestsPerSecond: removed, use combination of writeRequestsPerSecondMetric and writeRequestsPerSecondSelector instead
readRequestsPerSecond: 'cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"}' % variables,

writeRequestsPerSecondMetric: 'cortex_request_duration_seconds',
writeRequestsPerSecondSelector: '%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s"' % variables,
readRequestsPerSecondMetric: 'cortex_request_duration_seconds',
readRequestsPerSecondSelector: '%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"' % variables,

// Write failures rate as percentage of total requests.
writeFailuresRate: |||
writeFailuresRate(sampleType='native'):: |||
(
sum(rate(cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s",status_code=~"5.*"}[$__rate_interval]))
# gRPC errors are not tracked as 5xx but "error".
sum(%(countFailQuery)s)
or
# Handle the case no failure has been tracked yet.
vector(0)
)
/
sum(rate(cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(writeHTTPRoutesRegex)s"}[$__rate_interval]))
||| % variables,
sum(%(countQuery)s)
||| % {
countFailQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector+',status_code=~"5.*|error"')[sampleType],
countQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector)[sampleType],
},

// Read failures rate as percentage of total requests.
readFailuresRate: |||
readFailuresRate(sampleType='native'):: |||
(
sum(rate(cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s",status_code=~"5.*"}[$__rate_interval]))
# gRPC errors are not tracked as 5xx but "error".
sum(%(countFailQuery)s)
or
# Handle the case no failure has been tracked yet.
vector(0)
)
/
sum(rate(cortex_request_duration_seconds_count{%(gatewayMatcher)s, route=~"%(readHTTPRoutesRegex)s"}[$__rate_interval]))
||| % variables,
sum(%(countQuery)s)
||| % {
countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector+',status_code=~"5.*|error"')[sampleType],
countQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector)[sampleType],
},
},

distributor: {
writeRequestsPerSecond: 'cortex_request_duration_seconds_count{%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s"}' % variables,
local p = self,
//writeRequestsPerSecond: removed, use combination of writeRequestsPerSecondMetric and writeRequestsPerSecondSelector instead
writeRequestsPerSecondMetric: 'cortex_request_duration_seconds',
writeRequestsPerSecondSelector: '%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s"' % variables,
samplesPerSecond: 'sum(%(groupPrefixJobs)s:cortex_distributor_received_samples:rate5m{%(distributorMatcher)s})' % variables,
exemplarsPerSecond: 'sum(%(groupPrefixJobs)s:cortex_distributor_received_exemplars:rate5m{%(distributorMatcher)s})' % variables,

// Write failures rate as percentage of total requests.
writeFailuresRate: |||
writeFailuresRate(sampleType='native'):: |||
(
# gRPC errors are not tracked as 5xx but "error".
sum(rate(cortex_request_duration_seconds_count{%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s",status_code=~"5.*|error"}[$__rate_interval]))
sum(%(countFailQuery)s)
or
# Handle the case no failure has been tracked yet.
vector(0)
)
/
sum(rate(cortex_request_duration_seconds_count{%(distributorMatcher)s, route=~"%(writeGRPCRoutesRegex)s|%(writeHTTPRoutesRegex)s"}[$__rate_interval]))
||| % variables,
sum(%(countQuery)s)
||| % {
countFailQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector+',status_code=~"5.*|error"')[sampleType],
countQuery: utils.nativeClassicHistogramCountRate(p.writeRequestsPerSecondMetric, p.writeRequestsPerSecondSelector)[sampleType],
},
},

query_frontend: {
local p = self,
readRequestsPerSecond: 'cortex_request_duration_seconds_count{%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s"}' % variables,
readRequestsPerSecondMetric: 'cortex_request_duration_seconds',
readRequestsPerSecondSelector: '%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s"' % variables,
instantQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_query"}[$__rate_interval]))' % variables,
rangeQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_query_range"}[$__rate_interval]))' % variables,
labelNamesQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_labels"}[$__rate_interval]))' % variables,
Expand All @@ -85,16 +110,19 @@
otherQueriesPerSecond: 'sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s,route=~"(prometheus|api_prom)_api_v1_.*",route!~".*(query|query_range|label.*|series|read|metadata|query_exemplars)"}[$__rate_interval]))' % variables,

// Read failures rate as percentage of total requests.
readFailuresRate: |||
readFailuresRate(sampleType='native'):: |||
(
sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s",status_code=~"5.*"}[$__rate_interval]))
sum(%(countFailQuery)s)
or
# Handle the case no failure has been tracked yet.
vector(0)
)
/
sum(rate(cortex_request_duration_seconds_count{%(queryFrontendMatcher)s, route=~"%(readHTTPRoutesRegex)s"}[$__rate_interval]))
||| % variables,
sum(%(countQuery)s)
||| % {
countFailQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector+',status_code=~"5.*|error"')[sampleType],
countQuery: utils.nativeClassicHistogramCountRate(p.readRequestsPerSecondMetric, p.readRequestsPerSecondSelector)[sampleType],
},
},

ruler: {
Expand Down
4 changes: 4 additions & 0 deletions operations/mimir-mixin/dashboards/dashboard-utils.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,10 @@ local utils = import 'mixin-utils/utils.libsonnet';
super.qpsPanel(selector, statusLabelName) +
{ yaxes: $.yaxes('reqps') },

// qpsPanelNativeHistogram delegates to the parent object's qpsPanelNativeHistogram,
// forwarding the three arguments unchanged, and merges a `yaxes` field built from
// $.yaxes('reqps') on top of the result.
// NOTE(review): overview.libsonnet calls this as (title, metric, selector), which binds
// the metric name to `selector` and the label selector to `statusLabelName`. Confirm the
// parameter names and order against the parent's signature.
qpsPanelNativeHistogram(title, selector, statusLabelName='status_code')::
super.qpsPanelNativeHistogram(title, selector, statusLabelName) +
{ yaxes: $.yaxes('reqps') },

// hiddenLegendQueryPanel adds on to 'timeseriesPanel', not the deprecated 'panel'.
// It is a standard query panel designed to handle a large number of series. It hides the legend, doesn't fill the series and
// shows all values on tooltip, descending. Also turns on exemplars, unless 4th parameter is false.
Expand Down
46 changes: 32 additions & 14 deletions operations/mimir-mixin/dashboards/overview.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ local filename = 'mimir-overview.json';

($.dashboard('Overview') + { uid: std.md5(filename) })
.addClusterSelectorTemplates()
.addShowHistoricDataVariable()

.addRow(
$.row('%(product)s cluster health' % $._config)
Expand All @@ -52,9 +53,13 @@ local filename = 'mimir-overview.json';
'Status',
[
// Write failures.
if $._config.gateway_enabled then $.queries.gateway.writeFailuresRate else $.queries.distributor.writeFailuresRate,
if $._config.gateway_enabled then $.queries.gateway.writeFailuresRate('native') else $.queries.distributor.writeFailuresRate('native'),
// Write failures but from classic histograms.
'%s < ($show_classic_histograms * +Inf)' % if $._config.gateway_enabled then $.queries.gateway.writeFailuresRate('classic') else $.queries.distributor.writeFailuresRate('classic'),
// Read failures.
if $._config.gateway_enabled then $.queries.gateway.readFailuresRate else $.queries.query_frontend.readFailuresRate,
if $._config.gateway_enabled then $.queries.gateway.readFailuresRate('native') else $.queries.query_frontend.readFailuresRate('native'),
// Read failures but from classic histograms.
'%s < ($show_classic_histograms * +Inf)' % if $._config.gateway_enabled then $.queries.gateway.readFailuresRate('classic') else $.queries.query_frontend.readFailuresRate('classic'),
// Rule evaluation failures.
$.queries.ruler.evaluations.failuresRate,
// Alerting notifications.
Expand Down Expand Up @@ -83,7 +88,7 @@ local filename = 'mimir-overview.json';
// Object storage failures.
$.queries.storage.failuresRate,
],
['Writes', 'Reads', 'Rule evaluations', 'Alerting notifications', 'Object storage']
['Writes', 'Writes historic', 'Reads', 'Reads historic', 'Rule evaluations', 'Alerting notifications', 'Object storage']
)
)
.addPanel(
Expand Down Expand Up @@ -113,20 +118,29 @@ local filename = 'mimir-overview.json';
||| % helpers),
)
.addPanel(
$.panel(std.stripChars('Write requests / sec %(gatewayEnabledPanelTitleSuffix)s' % helpers, ' ')) +
$.qpsPanel(
$.qpsPanelNativeHistogram(
std.stripChars('Write requests / sec %(gatewayEnabledPanelTitleSuffix)s' % helpers, ' '),
if $._config.gateway_enabled then
$.queries.gateway.writeRequestsPerSecond
$.queries.gateway.writeRequestsPerSecondMetric
else
$.queries.distributor.writeRequestsPerSecond
$.queries.distributor.writeRequestsPerSecondMetric,
if $._config.gateway_enabled then
$.queries.gateway.writeRequestsPerSecondSelector
else
$.queries.distributor.writeRequestsPerSecondSelector
)
)
.addPanel(
$.panel(std.stripChars('Write latency %(gatewayEnabledPanelTitleSuffix)s' % helpers, ' ')) + (
$.latencyPanelNativeHistogram(
std.stripChars('Write latency %(gatewayEnabledPanelTitleSuffix)s' % helpers, ' '),
if $._config.gateway_enabled then
$.queries.gateway.writeRequestsPerSecondMetric
else
$.queries.distributor.writeRequestsPerSecondMetric,
if $._config.gateway_enabled then
utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', $.queries.write_http_routes_regex)])
$.queries.gateway.writeRequestsPerSecondSelector
else
utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/distributor.Distributor/Push|/httpgrpc.*|%s' % $.queries.write_http_routes_regex)])
$.queries.distributor.writeRequestsPerSecondSelector
)
)
.addPanel(
Expand Down Expand Up @@ -157,12 +171,16 @@ local filename = 'mimir-overview.json';
||| % helpers),
)
.addPanel(
$.panel(std.stripChars('Read requests / sec %(gatewayEnabledPanelTitleSuffix)s' % helpers, ' ')) +
$.qpsPanel(
$.qpsPanelNativeHistogram(
std.stripChars('Read requests / sec %(gatewayEnabledPanelTitleSuffix)s' % helpers, ' '),
if $._config.gateway_enabled then
$.queries.gateway.readRequestsPerSecondMetric
else
$.queries.query_frontend.readRequestsPerSecondMetric,
if $._config.gateway_enabled then
$.queries.gateway.readRequestsPerSecond
$.queries.gateway.readRequestsPerSecondSelector
else
$.queries.query_frontend.readRequestsPerSecond
$.queries.query_frontend.readRequestsPerSecondSelector
)
)
.addPanel(
Expand Down
Loading
Loading