From b3faa7be33681265141eee621d8f592b44428405 Mon Sep 17 00:00:00 2001 From: Charlie Le <3375195+CharlieTLe@users.noreply.github.com> Date: Wed, 23 Oct 2024 02:17:43 -0700 Subject: [PATCH] Use `timeseriesPanel` instead of `panel` when creating panels (#58) * Update jsonnet-libs to Fri Jul 19 12:51:49 2024 Updates grafana-builder and mixin-utils to latest version in master branch. This will be helpful in creating Grafana panels that use the timeseriesPanel instead of the deprecated panel. grafana-builder changes: - grafana-builder: add support for native/classic stat panel query (grafana/jsonnet-libs#1285) - More native histograms related utils and renaming (grafana/jsonnet-libs#1270) - Support recording and switching between naive and classic latency histograms (grafana/jsonnet-libs#1150) - Basic native histogram utilities (grafana/jsonnet-libs#1164) - grafana-builder: rename template variable "Data Source" to "Data source" (grafana/jsonnet-libs#1111) - Mixins: draw graphs at full resolution (grafana/jsonnet-libs#825) - Allow dashboards to show gRPC codes as labels (grafana/jsonnet-libs#1098) - Allow configuring sort order for variables (grafana/jsonnet-libs#1014) - remove unused/wrong step param (grafana/jsonnet-libs#999) - Show cancelled requests in grey on QPS dashboards. (grafana/jsonnet-libs#988) - Show cancelled requests in yellow on QPS dashboards. (grafana/jsonnet-libs#986) - Add timeseriesPanel (grafana/jsonnet-libs#824) - Allow including "All" for single template var - Allow datasource's regex to be configured - grafana-builder: make allValue configurable (grafana/jsonnet-libs#703) - grafana_builder: add dashboard link func (grafana/jsonnet-libs#683) - Add 'Data Source' label for the default datasource template variable. 
(grafana/jsonnet-libs#672) - enable tooltip by default (grafana/jsonnet-libs#665) mixin-utils changes: - grafana-builder: add support for native/classic stat panel query (grafana/jsonnet-libs#1285) - More native histograms related utils and renaming (grafana/jsonnet-libs#1270) - nativeClassicSumBy: format list of labels nicer (grafana/jsonnet-libs#1204) - Support recording and switching between native and classic latency histograms (grafana/jsonnet-libs#1150) - chore: fix hardcoded range interval (grafana/jsonnet-libs#1190) - Basic native histogram utilities (grafana/jsonnet-libs#1164) - utils: allow defining native histogram recording rule (grafana/jsonnet-libs#1156) - modify withRunbookURL to allow internal annotation (grafana/jsonnet-libs#1139) - mixin-utils: drop unsupported step target parameter (grafana/jsonnet-libs#1128) - Mixins: draw graphs at full resolution (grafana/jsonnet-libs#825) - Align with style conventions (grafana/jsonnet-libs#1038) - Add a function to remove an alert rule (grafana/jsonnet-libs#812) - mixin-utils: Parameterize interval for histogramRules (grafana/jsonnet-libs#806) - refactor(prometheus): shard mixins over multiple configmaps (grafana/jsonnet-libs#497) - Not all Prometheus rules are alerts. (grafana/jsonnet-libs#490) Signed-off-by: Charlie Le * Use `timeseriesPanel` instead of `panel` when creating panels Fixes: #44 Depends on: #57 Signed-off-by: Charlie Le * Add units to timeseries panel The yaxes field doesn't seem to do anything in the timeseries panel and was replaced with the `unit` field instead. So I defaulted the unit to `short` and allowed it to be set for the panel. 
Signed-off-by: Charlie Le * Update CHANGELOG.md --------- Signed-off-by: Charlie Le Co-authored-by: Friedrich Gonzalez <1517449+friedrichg@users.noreply.github.com> --- CHANGELOG.md | 3 +- .../dashboards/alertmanager.libsonnet | 42 +++---- cortex-mixin/dashboards/compactor.libsonnet | 19 ++- cortex-mixin/dashboards/config.libsonnet | 10 +- .../dashboards/dashboard-utils.libsonnet | 110 ++++++++++++++---- .../dashboards/object-store.libsonnet | 32 +++-- cortex-mixin/dashboards/queries.libsonnet | 94 +++++++-------- .../dashboards/reads-resources.libsonnet | 2 +- cortex-mixin/dashboards/reads.libsonnet | 66 +++++------ .../dashboards/rollout-progress.libsonnet | 22 ++-- cortex-mixin/dashboards/ruler.libsonnet | 55 +++++---- cortex-mixin/dashboards/scaling.libsonnet | 2 +- .../dashboards/writes-resources.libsonnet | 2 +- cortex-mixin/dashboards/writes.libsonnet | 52 ++++----- 14 files changed, 272 insertions(+), 239 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 79419a38..66792572 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,8 +7,9 @@ * [CHANGE] Target 3M memory series per ingester instead of 1.5M * [CHANGE] Update jsonnet-libs to Fri Jul 19 12:51:49 2024 #57 * [CHANGE] Increase CortexProvisioningTooManyWrites alert threshold to 160e3 +* [CHANGE] Use `timeseriesPanel` instead of `panel` when creating panels #58 * [ENHANCEMENT] Configure `-ingester.client.grpc-compression` to be `snappy-block` -* [ENHANCEMENT] Support Grafana 11 in Cortex Service Scaling Dashboard +* [ENHANCEMENT] Support Grafana 11 in all dashboards * [BUGFIX] Remove deprecated option `max_series_per_query` ## 1.16.1 diff --git a/cortex-mixin/dashboards/alertmanager.libsonnet b/cortex-mixin/dashboards/alertmanager.libsonnet index 0bf88c43..731135db 100644 --- a/cortex-mixin/dashboards/alertmanager.libsonnet +++ b/cortex-mixin/dashboards/alertmanager.libsonnet @@ -10,22 +10,22 @@ local utils = import 'mixin-utils/utils.libsonnet'; showTitle: false, }) .addPanel( - $.panel('Total 
Alerts') + + $.timeseriesPanel('Total Alerts') + $.statPanel('sum(cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], format='short') ) .addPanel( - $.panel('Total Silences') + + $.timeseriesPanel('Total Silences') + $.statPanel('sum(cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], format='short') ) .addPanel( - $.panel('Tenants') + + $.timeseriesPanel('Tenants') + $.statPanel('max(cortex_alertmanager_tenants_discovered{%s})' % $.jobMatcher($._config.job_names.alertmanager), format='short') ) ) .addRow( $.row('Alerts Received') .addPanel( - $.panel('APS') + + $.timeseriesPanel('APS') + $.queryPanel( [ ||| @@ -42,7 +42,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Alert Notifications') .addPanel( - $.panel('NPS') + + $.timeseriesPanel('NPS') + $.queryPanel( [ ||| @@ -56,7 +56,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('NPS by integration') + + $.timeseriesPanel('NPS by integration') + $.queryPanel( [ ||| @@ -73,18 +73,18 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + $.latencyPanel('cortex_alertmanager_notification_latency_seconds', '{%s}' % $.jobMatcher($._config.job_names.alertmanager)) ) ) .addRow( $.row('Configuration API (gateway) + Alertmanager UI') .addPanel( - $.panel('QPS') + + $.timeseriesPanel('QPS') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_v1_alerts|alertmanager"}' % $.jobMatcher($._config.job_names.gateway)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', 'api_v1_alerts|alertmanager')]) ) ) @@ -94,7 +94,7 @@ local utils = import 
'mixin-utils/utils.libsonnet'; .addRow( $.row('Replication') .addPanel( - $.panel('Per %s Tenants' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s Tenants' % $._config.per_instance_label) + $.queryPanel( 'max by(%s) (cortex_alertmanager_tenants_owned{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], '{{%s}}' % $._config.per_instance_label @@ -102,7 +102,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.stack ) .addPanel( - $.panel('Per %s Alerts' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s Alerts' % $._config.per_instance_label) + $.queryPanel( 'sum by(%s) (cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], '{{%s}}' % $._config.per_instance_label @@ -110,7 +110,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.stack ) .addPanel( - $.panel('Per %s Silences' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s Silences' % $._config.per_instance_label) + $.queryPanel( 'sum by(%s) (cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], '{{%s}}' % $._config.per_instance_label @@ -121,7 +121,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Tenant Configuration Sync') .addPanel( - $.panel('Syncs/sec') + + $.timeseriesPanel('Syncs/sec') + $.queryPanel( [ ||| @@ -135,14 +135,14 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Syncs/sec (By Reason)') + + $.timeseriesPanel('Syncs/sec (By Reason)') + $.queryPanel( 'sum by(reason) (rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager), '{{reason}}' ) ) .addPanel( - $.panel('Ring Check Errors/sec') + + $.timeseriesPanel('Ring Check Errors/sec') + $.queryPanel( 'sum 
(rate(cortex_alertmanager_ring_check_errors_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager), 'errors' @@ -152,7 +152,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Sharding Initial State Sync') .addPanel( - $.panel('Initial syncs /sec') + + $.timeseriesPanel('Initial syncs /sec') + $.queryPanel( 'sum by(outcome) (rate(cortex_alertmanager_state_initial_sync_completed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager), '{{outcome}}' @@ -166,7 +166,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; } ) .addPanel( - $.panel('Initial sync duration') + + $.timeseriesPanel('Initial sync duration', unit='ms') + $.latencyPanel('cortex_alertmanager_state_initial_sync_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.alertmanager)) + { targets: [ target { @@ -177,7 +177,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; } ) .addPanel( - $.panel('Fetch state from other alertmanagers /sec') + + $.timeseriesPanel('Fetch state from other alertmanagers /sec') + $.queryPanel( [ ||| @@ -201,7 +201,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Sharding Runtime State Sync') .addPanel( - $.panel('Replicate state to other alertmanagers /sec') + + $.timeseriesPanel('Replicate state to other alertmanagers /sec') + $.queryPanel( [ ||| @@ -215,7 +215,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Merge state from other alertmanagers /sec') + + $.timeseriesPanel('Merge state from other alertmanagers /sec') + $.queryPanel( [ ||| @@ -229,7 +229,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Persist state to remote storage /sec') + + $.timeseriesPanel('Persist state to remote storage /sec') + $.queryPanel( [ ||| diff --git a/cortex-mixin/dashboards/compactor.libsonnet b/cortex-mixin/dashboards/compactor.libsonnet index aeb64491..720b6fff 100644 --- 
a/cortex-mixin/dashboards/compactor.libsonnet +++ b/cortex-mixin/dashboards/compactor.libsonnet @@ -14,7 +14,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'sum(rate(cortex_compactor_runs_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor) ) + $.bars + - { yaxes: $.yaxes('ops') } + $.panelDescription( 'Per-instance runs', ||| @@ -23,7 +22,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Tenants compaction progress') + + $.timeseriesPanel('Tenants compaction progress') + $.queryPanel(||| ( cortex_compactor_tenants_processing_succeeded{%s} + @@ -44,9 +43,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('') .addPanel( - $.panel('Compacted blocks / sec') + + $.timeseriesPanel('Compacted blocks / sec', unit='ops') + $.queryPanel('sum(rate(prometheus_tsdb_compactions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), 'blocks') + - { yaxes: $.yaxes('ops') } + $.panelDescription( 'Compacted blocks / sec', ||| @@ -55,7 +53,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Per-block compaction duration') + + $.timeseriesPanel('Per-block compaction duration', unit='ms') + $.latencyPanel('prometheus_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.compactor)) + $.panelDescription( 'Per-block compaction duration', @@ -68,11 +66,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('') .addPanel( - $.panel('Average blocks / tenant') + + $.timeseriesPanel('Average blocks / tenant') + $.queryPanel('avg(max by(user) (cortex_bucket_blocks_count{%s}))' % $.jobMatcher($._config.job_names.compactor), 'avg'), ) .addPanel( - $.panel('Tenants with largest number of blocks') + + $.timeseriesPanel('Tenants with largest number of blocks') + $.queryPanel('topk(10, max by(user) (cortex_bucket_blocks_count{%s}))' % $.jobMatcher($._config.job_names.compactor), '{{user}}') + 
$.panelDescription( 'Tenants with largest number of blocks', @@ -85,9 +83,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Garbage Collector') .addPanel( - $.panel('Blocks marked for deletion / sec') + - $.queryPanel('sum(rate(cortex_compactor_blocks_marked_for_deletion_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), 'blocks') + - { yaxes: $.yaxes('ops') }, + $.timeseriesPanel('Blocks marked for deletion / sec', unit='ops') + + $.queryPanel('sum(rate(cortex_compactor_blocks_marked_for_deletion_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), 'blocks'), ) .addPanel( $.successFailurePanel( @@ -111,7 +108,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) + { yaxes: $.yaxes('ops') } ) .addPanel( - $.panel('Metadata Sync Duration') + + $.timeseriesPanel('Metadata Sync Duration', unit='ms') + // This metric tracks the duration of a per-tenant metadata sync. $.latencyPanel('cortex_compactor_meta_sync_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.compactor)), ) diff --git a/cortex-mixin/dashboards/config.libsonnet b/cortex-mixin/dashboards/config.libsonnet index 9240ef89..10692a3d 100644 --- a/cortex-mixin/dashboards/config.libsonnet +++ b/cortex-mixin/dashboards/config.libsonnet @@ -8,19 +8,17 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Startup config file') .addPanel( - $.panel('Startup config file hashes') + + $.timeseriesPanel('Startup config file hashes', unit='instances') + $.queryPanel('count(cortex_config_hash{%s}) by (sha256)' % $.namespaceMatcher(), 'sha256:{{sha256}}') + - $.stack + - { yaxes: $.yaxes('instances') }, + $.stack, ) ) .addRow( $.row('Runtime config file') .addPanel( - $.panel('Runtime config file hashes') + + $.timeseriesPanel('Runtime config file hashes', unit='instances') + $.queryPanel('count(cortex_runtime_config_hash{%s}) by (sha256)' % $.namespaceMatcher(), 'sha256:{{sha256}}') + - $.stack + - { yaxes: 
$.yaxes('instances') }, + $.stack, ) ), } diff --git a/cortex-mixin/dashboards/dashboard-utils.libsonnet b/cortex-mixin/dashboards/dashboard-utils.libsonnet index c0d2b087..3d9eea30 100644 --- a/cortex-mixin/dashboards/dashboard-utils.libsonnet +++ b/cortex-mixin/dashboards/dashboard-utils.libsonnet @@ -62,6 +62,44 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addTemplate('cluster', 'cortex_build_info', 'cluster') .addTemplate('namespace', 'cortex_build_info{cluster=~"$cluster"}', 'namespace'), }, + timeseriesPanel(title, unit='short'):: { + datasource: '$datasource', + fieldConfig: { + defaults: { + custom: { + drawStyle: 'line', + fillOpacity: 1, + lineWidth: 1, + pointSize: 5, + showPoints: 'never', + spanNulls: false, + stacking: { + group: 'A', + mode: 'none', + }, + }, + thresholds: { + mode: 'absolute', + steps: [], + }, + unit: unit, + }, + overrides: [], + }, + options: { + legend: { + showLegend: true, + }, + tooltip: { + mode: 'single', + sort: 'none', + }, + }, + links: [], + targets: [], + title: title, + type: 'timeseries', + }, // The mixin allow specialism of the job selector depending on if its a single binary // deployment or a namespaced one. @@ -108,6 +146,35 @@ local utils = import 'mixin-utils/utils.libsonnet'; } for target in super.targets ], + fieldConfig+: { + defaults+: { + custom+: { + lineWidth: 0, + fillOpacity: 100, // Get solid fill. 
+ stacking: { + mode: 'normal', + group: 'A', + }, + }, + unit: 'reqps', + min: 0, + }, + overrides+: [{ + matcher: { + id: 'byName', + options: status, + }, + properties: [ + { + id: 'color', + value: { + mode: 'fixed', + fixedColor: $.httpStatusColors[status], + }, + }, + ], + } for status in std.objectFieldsAll($.httpStatusColors)], + }, }, latencyPanel(metricName, selector, multiplier='1e3'):: @@ -121,7 +188,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, successFailurePanel(title, successMetric, failureMetric):: - $.panel(title) + + $.timeseriesPanel(title, unit='short') + $.queryPanel([successMetric, failureMetric], ['successful', 'failed']) + $.stack + { aliasColors: { @@ -132,7 +199,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; // Displays started, completed and failed rate. startedCompletedFailedPanel(title, startedMetric, completedMetric, failedMetric):: - $.panel(title) + + $.timeseriesPanel(title, unit='ops') + $.queryPanel([startedMetric, completedMetric, failedMetric], ['started', 'completed', 'failed']) + $.stack + { aliasColors: { @@ -143,7 +210,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, containerCPUUsagePanel(title, containerName):: - $.panel(title) + + $.timeseriesPanel(title) + $.queryPanel([ 'sum by(%s) (rate(container_cpu_usage_seconds_total{%s,container=~"%s"}[$__rate_interval]))' % [$._config.per_instance_label, $.namespaceMatcher(), containerName], 'min(container_spec_cpu_quota{%s,container=~"%s"} / container_spec_cpu_period{%s,container=~"%s"})' % [$.namespaceMatcher(), containerName, $.namespaceMatcher(), containerName], @@ -160,7 +227,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, containerMemoryWorkingSetPanel(title, containerName):: - $.panel(title) + + $.timeseriesPanel(title, unit='bytes') + $.queryPanel([ // We use "max" instead of "sum" otherwise during a rolling update of a statefulset we will end up // summing the memory of the old instance/pod (whose metric will be stale for 
5m) to the new instance/pod. @@ -180,7 +247,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, containerNetworkPanel(title, metric, instanceName):: - $.panel(title) + + $.timeseriesPanel(title, unit='Bps') + $.queryPanel( 'sum by(%(instance)s) (rate(%(metric)s{%(namespace)s,%(instance)s=~"%(instanceName)s"}[$__rate_interval]))' % { namespace: $.namespaceMatcher(), @@ -199,7 +266,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.containerNetworkPanel('Transmit Bandwidth', 'container_network_transmit_bytes_total', instanceName), containerDiskWritesPanel(title, containerName):: - $.panel(title) + + $.timeseriesPanel(title, unit='Bps') + $.queryPanel( ||| sum by(%s, %s, device) ( @@ -220,7 +287,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('Bps') }, containerDiskReadsPanel(title, containerName):: - $.panel(title) + + $.timeseriesPanel(title, unit='Bps') + $.queryPanel( ||| sum by(%s, %s, device) ( @@ -239,7 +306,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('Bps') }, containerDiskSpaceUtilization(title, containerName):: - $.panel(title) + + $.timeseriesPanel(title, unit='percentunit') + $.queryPanel( ||| max by(persistentvolumeclaim) ( @@ -266,7 +333,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; else 'label_name="%s"' % containerName, goHeapInUsePanel(title, jobName):: - $.panel(title) + + $.timeseriesPanel(title, unit='bytes') + $.queryPanel( 'sum by(%s) (go_memstats_heap_inuse_bytes{%s})' % [$._config.per_instance_label, $.jobMatcher(jobName)], '{{%s}}' % $._config.per_instance_label @@ -361,39 +428,38 @@ local utils = import 'mixin-utils/utils.libsonnet'; getObjectStoreRows(title, component):: [ super.row(title) .addPanel( - $.panel('Operations / sec') + + $.timeseriesPanel('Operations / sec', unit='rps') + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s,component="%s"}[$__rate_interval]))' % [$.namespaceMatcher(), component], '{{operation}}') + - 
$.stack + - { yaxes: $.yaxes('rps') }, + $.stack ) .addPanel( - $.panel('Error rate') + + $.timeseriesPanel('Error rate', unit='percentunit') + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operation_failures_total{%s,component="%s"}[$__rate_interval])) / sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s,component="%s"}[$__rate_interval]))' % [$.namespaceMatcher(), component, $.namespaceMatcher(), component], '{{operation}}') + { yaxes: $.yaxes('percentunit') }, ) .addPanel( - $.panel('Latency of Op: Attributes') + + $.timeseriesPanel('Latency of Op: Attributes') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="attributes"}' % [$.namespaceMatcher(), component]), ) .addPanel( - $.panel('Latency of Op: Exists') + + $.timeseriesPanel('Latency of Op: Exists') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="exists"}' % [$.namespaceMatcher(), component]), ), $.row('') .addPanel( - $.panel('Latency of Op: Get') + + $.timeseriesPanel('Latency of Op: Get') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="get"}' % [$.namespaceMatcher(), component]), ) .addPanel( - $.panel('Latency of Op: GetRange') + + $.timeseriesPanel('Latency of Op: GetRange') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="get_range"}' % [$.namespaceMatcher(), component]), ) .addPanel( - $.panel('Latency of Op: Upload') + + $.timeseriesPanel('Latency of Op: Upload') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="upload"}' % [$.namespaceMatcher(), component]), ) .addPanel( - $.panel('Latency of Op: Delete') + + $.timeseriesPanel('Latency of Op: Delete') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="%s",operation="delete"}' % [$.namespaceMatcher(), component]), ), ], @@ -406,7 
+472,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; }; super.row(title) .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec', unit='ops') + $.queryPanel( ||| sum by(operation) ( @@ -425,7 +491,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('ops') } ) .addPanel( - $.panel('Latency (getmulti)') + + $.timeseriesPanel('Latency (getmulti)') + $.latencyPanel( 'thanos_memcached_operation_duration_seconds', ||| @@ -439,7 +505,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Hit ratio') + + $.timeseriesPanel('Hit ratio', unit='percentunit') + $.queryPanel( ||| sum( diff --git a/cortex-mixin/dashboards/object-store.libsonnet b/cortex-mixin/dashboards/object-store.libsonnet index 69e257b6..d58976a2 100644 --- a/cortex-mixin/dashboards/object-store.libsonnet +++ b/cortex-mixin/dashboards/object-store.libsonnet @@ -7,58 +7,54 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Components') .addPanel( - $.panel('RPS / component') + + $.timeseriesPanel('RPS / component', unit='rps') + $.queryPanel('sum by(component) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % $.namespaceMatcher(), '{{component}}') + - $.stack + - { yaxes: $.yaxes('rps') }, + $.stack, ) .addPanel( - $.panel('Error rate / component') + - $.queryPanel('sum by(component) (rate(thanos_objstore_bucket_operation_failures_total{%s}[$__rate_interval])) / sum by(component) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], '{{component}}') + - { yaxes: $.yaxes('percentunit') }, + $.timeseriesPanel('Error rate / component', unit='percentunit') + + $.queryPanel('sum by(component) (rate(thanos_objstore_bucket_operation_failures_total{%s}[$__rate_interval])) / sum by(component) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], '{{component}}') ) ) 
.addRow( $.row('Operations') .addPanel( - $.panel('RPS / operation') + + $.timeseriesPanel('RPS / operation', unit='rps') + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % $.namespaceMatcher(), '{{operation}}') + - $.stack + - { yaxes: $.yaxes('rps') }, + $.stack, ) .addPanel( - $.panel('Error rate / operation') + - $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operation_failures_total{%s}[$__rate_interval])) / sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], '{{operation}}') + - { yaxes: $.yaxes('percentunit') }, + $.timeseriesPanel('Error rate / operation', unit='percentunit') + + $.queryPanel('sum by(operation) (rate(thanos_objstore_bucket_operation_failures_total{%s}[$__rate_interval])) / sum by(operation) (rate(thanos_objstore_bucket_operations_total{%s}[$__rate_interval]))' % [$.namespaceMatcher(), $.namespaceMatcher()], '{{operation}}') ) ) .addRow( $.row('') .addPanel( - $.panel('Op: Get') + + $.timeseriesPanel('Op: Get') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="get"}' % $.namespaceMatcher()), ) .addPanel( - $.panel('Op: GetRange') + + $.timeseriesPanel('Op: GetRange') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="get_range"}' % $.namespaceMatcher()), ) .addPanel( - $.panel('Op: Exists') + + $.timeseriesPanel('Op: Exists') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="exists"}' % $.namespaceMatcher()), ) ) .addRow( $.row('') .addPanel( - $.panel('Op: Attributes') + + $.timeseriesPanel('Op: Attributes') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="attributes"}' % $.namespaceMatcher()), ) .addPanel( - $.panel('Op: Upload') + + $.timeseriesPanel('Op: Upload') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', 
'{%s,operation="upload"}' % $.namespaceMatcher()), ) .addPanel( - $.panel('Op: Delete') + + $.timeseriesPanel('Op: Delete') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,operation="delete"}' % $.namespaceMatcher()), ) ), diff --git a/cortex-mixin/dashboards/queries.libsonnet b/cortex-mixin/dashboards/queries.libsonnet index cada5c8e..212ab9d2 100644 --- a/cortex-mixin/dashboards/queries.libsonnet +++ b/cortex-mixin/dashboards/queries.libsonnet @@ -8,34 +8,33 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Query Frontend') .addPanel( - $.panel('Queue Duration') + + $.timeseriesPanel('Queue Duration', unit='ms') + $.latencyPanel('cortex_query_frontend_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_frontend)), ) .addPanel( - $.panel('Retries') + - $.latencyPanel('cortex_query_frontend_retries', '{%s}' % $.jobMatcher($._config.job_names.query_frontend), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Retries', unit='short') + + $.latencyPanel('cortex_query_frontend_retries', '{%s}' % $.jobMatcher($._config.job_names.query_frontend), multiplier=1), ) .addPanel( - $.panel('Queue Length') + + $.timeseriesPanel('Queue Length') + $.queryPanel('cortex_query_frontend_queue_length{%s}' % $.jobMatcher($._config.job_names.query_frontend), '{{cluster}} / {{namespace}} / {{%s}}' % $._config.per_instance_label), ) ) .addRow( $.row('Query Scheduler') .addPanel( - $.panel('Queue Duration') + + $.timeseriesPanel('Queue Duration', unit='ms') + $.latencyPanel('cortex_query_scheduler_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_scheduler)), ) .addPanel( - $.panel('Queue Length') + + $.timeseriesPanel('Queue Length') + $.queryPanel('cortex_query_scheduler_queue_length{%s}' % $.jobMatcher($._config.job_names.query_scheduler), '{{cluster}} / {{namespace}} / {{%s}}' % $._config.per_instance_label), ) ) .addRow( $.row('Query Frontend - Query Splitting and Results 
Cache') .addPanel( - $.panel('Intervals per Query') + + $.timeseriesPanel('Intervals per Query') + $.queryPanel('sum(rate(cortex_frontend_split_queries_total{%s}[1m])) / sum(rate(cortex_frontend_query_range_duration_seconds_count{%s, method="split_by_interval"}[1m]))' % [$.jobMatcher($._config.job_names.query_frontend), $.jobMatcher($._config.job_names.query_frontend)], 'splitting rate') + $.panelDescription( 'Intervals per Query', @@ -45,7 +44,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Results Cache Hit %') + + $.timeseriesPanel('Results Cache Hit %') + $.queryPanel(||| sum(rate(cortex_cache_hits{name=~"frontend.+", %(q)s}[1m])) / sum(rate(cortex_cache_fetched_keys{name=~"frontend.+", %(q)s}[1m])) or sum(rate(cortex_cache_hits_total{name=~"frontend.+", %(q)s}[1m])) / sum(rate(cortex_cache_fetched_keys_total{name=~"frontend.+", %(q)s}[1m])) @@ -53,7 +52,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, ) .addPanel( - $.panel('Results Cache misses') + + $.timeseriesPanel('Results Cache misses') + $.queryPanel(||| sum(rate(cortex_cache_fetched_keys{name=~"frontend.+", %(q)s}[1m])) - sum(rate(cortex_cache_hits{name=~"frontend.+", %(q)s}[1m])) or sum(rate(cortex_cache_fetched_keys_total{name=~"frontend.+", %(q)s}[1m])) - sum(rate(cortex_cache_hits_total{name=~"frontend.+", %(q)s}[1m])) @@ -63,7 +62,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Query Frontend - Query sharding') .addPanel( - $.panel('Sharded Queries Ratio') + + $.timeseriesPanel('Sharded Queries Ratio') + $.queryPanel(||| sum(rate(cortex_frontend_query_sharding_rewrites_succeeded_total{%s}[$__rate_interval])) / sum(rate(cortex_frontend_query_sharding_rewrites_attempted_total{%s}[$__rate_interval])) @@ -78,9 +77,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Number of Sharded Queries per Query') + + $.timeseriesPanel('Number of Sharded 
Queries per Query', unit='short') + $.latencyPanel('cortex_frontend_sharded_queries_per_query', '{%s}' % $.jobMatcher($._config.job_names.query_frontend), multiplier=1) + - { yaxes: $.yaxes('short') } + $.panelDescription( 'Number of Sharded Queries per Query', ||| @@ -93,56 +91,50 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Querier') .addPanel( - $.panel('Stages') + + $.timeseriesPanel('Stages', unit='ms') + $.queryPanel('max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",%s}) * 1e3' % $.jobMatcher($._config.job_names.querier), '{{slice}}') + - { yaxes: $.yaxes('ms') } + $.stack, ) .addPanel( - $.panel('Chunk cache misses') + + $.timeseriesPanel('Chunk cache misses') + $.queryPanel(||| sum(rate(cortex_cache_fetched_keys{%(q)s,name="chunksmemcache"}[1m])) - sum(rate(cortex_cache_hits{%(q)s,name="chunksmemcache"}[1m])) or sum(rate(cortex_cache_fetched_keys_total{%(q)s,name="chunksmemcache"}[1m])) - sum(rate(cortex_cache_hits_total{%(q)s,name="chunksmemcache"}[1m])) ||| % { q: $.jobMatcher($._config.job_names.query_frontend) }, 'Hit rate'), ) .addPanel( - $.panel('Chunk cache corruptions') + + $.timeseriesPanel('Chunk cache corruptions') + $.queryPanel('sum(rate(cortex_cache_corrupt_chunks_total{%s}[1m]))' % $.jobMatcher($._config.job_names.querier), 'Corrupt chunks'), ) ) .addRow( $.row('Ingester') .addPanel( - $.panel('Series per Query') + - utils.latencyRecordingRulePanel('cortex_ingester_queried_series', $.jobSelector($._config.job_names.ingester), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Series per Query', unit='short') + + utils.latencyRecordingRulePanel('cortex_ingester_queried_series', $.jobSelector($._config.job_names.ingester), multiplier=1), ) .addPanel( - $.panel('Chunks per Query') + - utils.latencyRecordingRulePanel('cortex_ingester_queried_chunks', $.jobSelector($._config.job_names.ingester), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Chunks per Query', 
unit='short') + + utils.latencyRecordingRulePanel('cortex_ingester_queried_chunks', $.jobSelector($._config.job_names.ingester), multiplier=1), ) .addPanel( - $.panel('Samples per Query') + - utils.latencyRecordingRulePanel('cortex_ingester_queried_samples', $.jobSelector($._config.job_names.ingester), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Samples per Query', unit='short') + + utils.latencyRecordingRulePanel('cortex_ingester_queried_samples', $.jobSelector($._config.job_names.ingester), multiplier=1), ) ) .addRowIf( std.member($._config.storage_engine, 'blocks'), $.row('Querier - Blocks storage') .addPanel( - $.panel('Number of store-gateways hit per Query') + - $.latencyPanel('cortex_querier_storegateway_instances_hit_per_query', '{%s}' % $.jobMatcher($._config.job_names.querier), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Number of store-gateways hit per Query', unit='short') + + $.latencyPanel('cortex_querier_storegateway_instances_hit_per_query', '{%s}' % $.jobMatcher($._config.job_names.querier), multiplier=1), ) .addPanel( - $.panel('Refetches of missing blocks per Query') + - $.latencyPanel('cortex_querier_storegateway_refetches_per_query', '{%s}' % $.jobMatcher($._config.job_names.querier), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Refetches of missing blocks per Query', unit='short') + + $.latencyPanel('cortex_querier_storegateway_refetches_per_query', '{%s}' % $.jobMatcher($._config.job_names.querier), multiplier=1), ) .addPanel( - $.panel('Consistency checks failed') + + $.timeseriesPanel('Consistency checks failed') + $.queryPanel('sum(rate(cortex_querier_blocks_consistency_checks_failed_total{%s}[1m])) / sum(rate(cortex_querier_blocks_consistency_checks_total{%s}[1m]))' % [$.jobMatcher($._config.job_names.querier), $.jobMatcher($._config.job_names.querier)], 'Failure Rate') + { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, ) @@ -151,13 +143,12 @@ local utils = 
import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('') .addPanel( - $.panel('Bucket indexes loaded (per querier)') + + $.timeseriesPanel('Bucket indexes loaded (per querier)', unit='short') + $.queryPanel([ 'max(cortex_bucket_index_loaded{%s})' % $.jobMatcher($._config.job_names.querier), 'min(cortex_bucket_index_loaded{%s})' % $.jobMatcher($._config.job_names.querier), 'avg(cortex_bucket_index_loaded{%s})' % $.jobMatcher($._config.job_names.querier), - ], ['Max', 'Min', 'Average']) + - { yaxes: $.yaxes('short') }, + ], ['Max', 'Min', 'Average']), ) .addPanel( $.successFailurePanel( @@ -167,7 +158,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Bucket indexes load latency') + + $.timeseriesPanel('Bucket indexes load latency', unit='ms') + $.latencyPanel('cortex_bucket_index_load_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.querier)), ) ) @@ -175,36 +166,33 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('Store-gateway - Blocks storage') .addPanel( - $.panel('Blocks queried / sec') + - $.queryPanel('sum(rate(cortex_bucket_store_series_blocks_queried_sum{component="store-gateway",%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), 'blocks') + - { yaxes: $.yaxes('ops') }, + $.timeseriesPanel('Blocks queried / sec', unit='ops') + + $.queryPanel('sum(rate(cortex_bucket_store_series_blocks_queried_sum{component="store-gateway",%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), 'blocks'), ) .addPanel( - $.panel('Data fetched / sec') + + $.timeseriesPanel('Data fetched / sec', unit='ops') + $.queryPanel('sum by(data_type) (rate(cortex_bucket_store_series_data_fetched_sum{component="store-gateway",%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{data_type}}') + - $.stack + - { yaxes: $.yaxes('ops') }, + $.stack, ) .addPanel( - $.panel('Data 
touched / sec') + + $.timeseriesPanel('Data touched / sec', unit='ops') + $.queryPanel('sum by(data_type) (rate(cortex_bucket_store_series_data_touched_sum{component="store-gateway",%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.store_gateway), '{{data_type}}') + - $.stack + - { yaxes: $.yaxes('ops') }, + $.stack, ) ) .addRowIf( std.member($._config.storage_engine, 'blocks'), $.row('') .addPanel( - $.panel('Series fetch duration (per request)') + + $.timeseriesPanel('Series fetch duration (per request)') + $.latencyPanel('cortex_bucket_store_series_get_all_duration_seconds', '{component="store-gateway",%s}' % $.jobMatcher($._config.job_names.store_gateway)), ) .addPanel( - $.panel('Series merge duration (per request)') + + $.timeseriesPanel('Series merge duration (per request)') + $.latencyPanel('cortex_bucket_store_series_merge_duration_seconds', '{component="store-gateway",%s}' % $.jobMatcher($._config.job_names.store_gateway)), ) .addPanel( - $.panel('Series returned (per request)') + + $.timeseriesPanel('Series returned (per request)') + $.queryPanel('sum(rate(cortex_bucket_store_series_result_series_sum{component="store-gateway",%s}[$__rate_interval])) / sum(rate(cortex_bucket_store_series_result_series_count{component="store-gateway",%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], 'avg series returned'), ) ) @@ -212,7 +200,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('') .addPanel( - $.panel('Blocks currently loaded') + + $.timeseriesPanel('Blocks currently loaded') + $.queryPanel('sum(cortex_bucket_store_blocks_loaded{component="store-gateway",%s}) without (user)' % $.jobMatcher($._config.job_names.store_gateway), '{{%s}}' % $._config.per_instance_label) ) .addPanel( @@ -234,15 +222,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('') 
.addPanel( - $.panel('Lazy loaded index-headers') + + $.timeseriesPanel('Lazy loaded index-headers') + $.queryPanel('cortex_bucket_store_indexheader_lazy_load_total{%s} - cortex_bucket_store_indexheader_lazy_unload_total{%s}' % [$.jobMatcher($._config.job_names.store_gateway), $.jobMatcher($._config.job_names.store_gateway)], '{{%s}}' % $._config.per_instance_label) ) .addPanel( - $.panel('Index-header lazy load duration') + + $.timeseriesPanel('Index-header lazy load duration', unit='ms') + $.latencyPanel('cortex_bucket_store_indexheader_lazy_load_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.store_gateway)), ) .addPanel( - $.panel('Series hash cache hit ratio') + + $.timeseriesPanel('Series hash cache hit ratio') + $.queryPanel(||| sum(rate(cortex_bucket_store_series_hash_cache_hits_total{%s}[$__rate_interval])) / diff --git a/cortex-mixin/dashboards/reads-resources.libsonnet b/cortex-mixin/dashboards/reads-resources.libsonnet index f0750c88..437a57a2 100644 --- a/cortex-mixin/dashboards/reads-resources.libsonnet +++ b/cortex-mixin/dashboards/reads-resources.libsonnet @@ -67,7 +67,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Ruler') .addPanel( - $.panel('Rules') + + $.timeseriesPanel('Rules') + $.queryPanel( 'sum by(%s) (cortex_prometheus_rule_group_rules{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ruler)], '{{%s}}' % $._config.per_instance_label diff --git a/cortex-mixin/dashboards/reads.libsonnet b/cortex-mixin/dashboards/reads.libsonnet index 5a720784..c0ddbe4f 100644 --- a/cortex-mixin/dashboards/reads.libsonnet +++ b/cortex-mixin/dashboards/reads.libsonnet @@ -36,7 +36,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; showTitle: false, }) .addPanel( - $.panel('Instant queries / sec') + + $.timeseriesPanel('Instant queries / sec') + $.statPanel(||| sum( rate( @@ -67,7 +67,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Range queries / sec') + + 
$.timeseriesPanel('Range queries / sec') + $.statPanel(||| sum( rate( @@ -92,37 +92,35 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Gateway') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.gateway)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.gateway)], '' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRow( $.row('Query Frontend') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.query_frontend)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % 
[$._config.per_instance_label, $.jobMatcher($._config.job_names.query_frontend)], '' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRow( @@ -142,85 +140,82 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_query_scheduler_queue_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) ) .addPanel( - $.panel('Latency (Time in Queue)') + + $.timeseriesPanel('Latency (Time in Queue)') + $.latencyPanel('cortex_query_scheduler_queue_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.query_scheduler)) ) ) .addRow( $.row('Cache - Query Results') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_cache_request_duration_seconds_count{method=~"frontend.+", %s}' % $.jobMatcher($._config.job_names.query_frontend)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', $.jobSelector($._config.job_names.query_frontend) + [utils.selector.re('method', 'frontend.+')]) ) ) .addRow( $.row('Querier') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_querier_request_duration_seconds_count{%s, route=~"(prometheus|api_prom)_api_v1_.+"}' % $.jobMatcher($._config.job_names.querier)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_querier_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.re('route', '(prometheus|api_prom)_api_v1_.+')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_querier_request_duration_seconds_bucket{%s, 
route=~"(prometheus|api_prom)_api_v1_.+"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.querier)], '' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRow( $.row('Ingester') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.re('route', '/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/cortex.Ingester/Query(Stream)?|/cortex.Ingester/MetricsForLabelMatchers|/cortex.Ingester/LabelValues|/cortex.Ingester/MetricsMetadata"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], '' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRowIf( std.member($._config.storage_engine, 'blocks'), $.row('Store-gateway') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s,route=~"/gatewaypb.StoreGateway/.*"}' % $.jobMatcher($._config.job_names.store_gateway)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.store_gateway) + [utils.selector.re('route', 
'/gatewaypb.StoreGateway/.*')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/gatewaypb.StoreGateway/.*"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.store_gateway)], '' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRowIf( std.member($._config.storage_engine, 'blocks'), $.row('Memcached – Blocks storage – Block index cache (store-gateway accesses)') // Resembles thanosMemcachedCache .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec', unit='ops') + $.queryPanel( ||| sum by(operation) ( @@ -234,11 +229,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ||| % $.jobMatcher($._config.job_names.store_gateway), '{{operation}}' ) + - $.stack + - { yaxes: $.yaxes('ops') }, + $.stack, ) .addPanel( - $.panel('Latency (getmulti)') + + $.timeseriesPanel('Latency (getmulti)') + $.latencyPanel( 'thanos_memcached_operation_duration_seconds', ||| @@ -252,7 +246,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Hit ratio') + + $.timeseriesPanel('Hit ratio') + $.queryPanel( ||| sum by(item_type) ( diff --git a/cortex-mixin/dashboards/rollout-progress.libsonnet b/cortex-mixin/dashboards/rollout-progress.libsonnet index 16c54095..775a199e 100644 --- a/cortex-mixin/dashboards/rollout-progress.libsonnet +++ b/cortex-mixin/dashboards/rollout-progress.libsonnet @@ -20,7 +20,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; // // Rollout progress // - $.panel('Rollout progress') + + $.timeseriesPanel('Rollout progress') + $.barGauge([ // Multi-zone deployments are grouped together removing the "zone-X" suffix. // After the grouping, the resulting label is called "cortex_service". 
@@ -89,7 +89,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; // // Writes // - $.panel('Writes - 2xx') + + $.timeseriesPanel('Writes - 2xx') + $.newStatPanel(||| sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s",status_code=~"2.+"}[$__rate_interval])) / sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}[$__rate_interval])) @@ -100,7 +100,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; gridPos: { h: 4, w: 2, x: 10, y: 0 }, }, - $.panel('Writes - 4xx') + + $.timeseriesPanel('Writes - 4xx') + $.newStatPanel(||| sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s",status_code=~"4.+"}[$__rate_interval])) / sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}[$__rate_interval])) @@ -113,7 +113,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; gridPos: { h: 4, w: 2, x: 12, y: 0 }, }, - $.panel('Writes - 5xx') + + $.timeseriesPanel('Writes - 5xx') + $.newStatPanel(||| sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s",status_code=~"5.+"}[$__rate_interval])) / sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"}[$__rate_interval])) @@ -125,7 +125,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; gridPos: { h: 4, w: 2, x: 14, y: 0 }, }, - $.panel('Writes 99th Latency') + + $.timeseriesPanel('Writes 99th latency', unit='s') + $.newStatPanel(||| histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"})) ||| % config, unit='s', thresholds=[ @@ -140,7 +140,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; // // Reads // - $.panel('Reads - 2xx') + + $.timeseriesPanel('Reads 
- 2xx') + $.newStatPanel(||| sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s",status_code=~"2.+"}[$__rate_interval])) / sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}[$__rate_interval])) @@ -151,7 +151,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; gridPos: { h: 4, w: 2, x: 10, y: 4 }, }, - $.panel('Reads - 4xx') + + $.timeseriesPanel('Reads - 4xx') + $.newStatPanel(||| sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s",status_code=~"4.+"}[$__rate_interval])) / sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}[$__rate_interval])) @@ -164,7 +164,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; gridPos: { h: 4, w: 2, x: 12, y: 4 }, }, - $.panel('Reads - 5xx') + + $.timeseriesPanel('Reads - 5xx') + $.newStatPanel(||| sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s",status_code=~"5.+"}[$__rate_interval])) / sum(rate(cortex_request_duration_seconds_count{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"}[$__rate_interval])) @@ -176,7 +176,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; gridPos: { h: 4, w: 2, x: 14, y: 4 }, }, - $.panel('Reads 99th Latency') + + $.timeseriesPanel('Reads 99th latency', unit='s') + $.newStatPanel(||| histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_read_routes_regex)s"})) ||| % config, unit='s', thresholds=[ @@ -191,7 +191,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; // // Unhealthy pods // - $.panel('Unhealthy pods') + + $.timeseriesPanel('Unhealthy pods') + $.newStatPanel([ ||| kube_deployment_status_replicas_unavailable{%(namespace_matcher)s, deployment=~"%(all_services_regex)s"} @@ 
-280,7 +280,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; // // Performance comparison with 24h ago // - $.panel('Latency vs 24h ago') + + $.timeseriesPanel('Latency vs 24h ago') + $.queryPanel([||| 1 - ( avg_over_time(histogram_quantile(0.99, sum by (le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%(gateway_job_matcher)s, route=~"%(gateway_write_routes_regex)s"} offset 24h))[1h:]) diff --git a/cortex-mixin/dashboards/ruler.libsonnet b/cortex-mixin/dashboards/ruler.libsonnet index b243198c..88742e23 100644 --- a/cortex-mixin/dashboards/ruler.libsonnet +++ b/cortex-mixin/dashboards/ruler.libsonnet @@ -67,26 +67,26 @@ local utils = import 'mixin-utils/utils.libsonnet'; showTitle: false, }) .addPanel( - $.panel('Active Configurations') + + $.timeseriesPanel('Active Configurations') + $.statPanel('sum(cortex_ruler_managers_total{%s})' % $.jobMatcher($._config.job_names.ruler), format='short') ) .addPanel( - $.panel('Total Rules') + + $.timeseriesPanel('Total Rules') + $.statPanel('sum(cortex_prometheus_rule_group_rules{%s})' % $.jobMatcher($._config.job_names.ruler), format='short') ) .addPanel( - $.panel('Read from Ingesters - QPS') + + $.timeseriesPanel('Read from Ingesters - QPS') + $.statPanel('sum(rate(cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/QueryStream"}[5m]))' % $.jobMatcher($._config.job_names.ruler), format='reqps') ) .addPanel( - $.panel('Write to Ingesters - QPS') + + $.timeseriesPanel('Write to Ingesters - QPS') + $.statPanel('sum(rate(cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/Push"}[5m]))' % $.jobMatcher($._config.job_names.ruler), format='reqps') ) ) .addRow( $.row('Rule Evaluations Global') .addPanel( - $.panel('EPS') + + $.timeseriesPanel('EPS') + $.queryPanel( [ $.rulerQueries.ruleEvaluations.success % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], @@ -96,7 +96,7 @@ local utils = import 
'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='s') + $.queryPanel( $.rulerQueries.ruleEvaluations.latency % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], 'average' @@ -106,41 +106,40 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Configuration API (gateway)') .addPanel( - $.panel('QPS') + + $.timeseriesPanel('QPS') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"%s"}' % [$.jobMatcher($._config.job_names.gateway), ruler_config_api_routes_re]) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', ruler_config_api_routes_re)]) ) .addPanel( - $.panel('Per route p99 Latency') + + $.timeseriesPanel('Per route p99 latency', unit='s') + $.queryPanel( 'histogram_quantile(0.99, sum by (route, le) (cluster_job_route:cortex_request_duration_seconds_bucket:sum_rate{%s, route=~"%s"}))' % [$.jobMatcher($._config.job_names.gateway), ruler_config_api_routes_re], '{{ route }}' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRow( $.row('Writes (Ingesters)') .addPanel( - $.panel('QPS') + + $.timeseriesPanel('QPS') + $.qpsPanel('cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ruler)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + $.latencyPanel('cortex_ingester_client_request_duration_seconds', '{%s, operation="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ruler)) ) ) .addRow( $.row('Reads (Ingesters)') .addPanel( - $.panel('QPS') + + $.timeseriesPanel('QPS') + $.qpsPanel('cortex_ingester_client_request_duration_seconds_count{%s, operation="/cortex.Ingester/QueryStream"}' % $.jobMatcher($._config.job_names.ruler)) ) .addPanel( - $.panel('Latency') + + 
$.timeseriesPanel('Latency', unit='ms') + $.latencyPanel('cortex_ingester_client_request_duration_seconds', '{%s, operation="/cortex.Ingester/QueryStream"}' % $.jobMatcher($._config.job_names.ruler)) ) ) @@ -148,17 +147,15 @@ local utils = import 'mixin-utils/utils.libsonnet'; std.member($._config.storage_engine, 'blocks'), $.row('Ruler - Blocks storage') .addPanel( - $.panel('Number of store-gateways hit per Query') + - $.latencyPanel('cortex_querier_storegateway_instances_hit_per_query', '{%s}' % $.jobMatcher($._config.job_names.ruler), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Number of store-gateways hit per Query', unit='short') + + $.latencyPanel('cortex_querier_storegateway_instances_hit_per_query', '{%s}' % $.jobMatcher($._config.job_names.ruler), multiplier=1), ) .addPanel( - $.panel('Refetches of missing blocks per Query') + - $.latencyPanel('cortex_querier_storegateway_refetches_per_query', '{%s}' % $.jobMatcher($._config.job_names.ruler), multiplier=1) + - { yaxes: $.yaxes('short') }, + $.timeseriesPanel('Refetches of missing blocks per Query', unit='short') + + $.latencyPanel('cortex_querier_storegateway_refetches_per_query', '{%s}' % $.jobMatcher($._config.job_names.ruler), multiplier=1), ) .addPanel( - $.panel('Consistency checks failed') + + $.timeseriesPanel('Consistency checks failed') + $.queryPanel('sum(rate(cortex_querier_blocks_consistency_checks_failed_total{%s}[1m])) / sum(rate(cortex_querier_blocks_consistency_checks_total{%s}[1m]))' % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], 'Failure Rate') + { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, ) @@ -166,33 +163,33 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Notifications') .addPanel( - $.panel('Delivery Errors') + + $.timeseriesPanel('Delivery Errors') + $.queryPanel($.rulerQueries.notifications.failure % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], '{{ 
user }}') ) .addPanel( - $.panel('Queue Length') + + $.timeseriesPanel('Queue Length') + $.queryPanel($.rulerQueries.notifications.queue % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], '{{ user }}') ) .addPanel( - $.panel('Dropped') + + $.timeseriesPanel('Dropped') + $.queryPanel($.rulerQueries.notifications.dropped % $.jobMatcher($._config.job_names.ruler), '{{ user }}') ) ) .addRow( ($.row('Group Evaluations') + { collapse: true }) .addPanel( - $.panel('Missed Iterations') + + $.timeseriesPanel('Missed Iterations') + $.queryPanel($.rulerQueries.groupEvaluations.missedIterations % $.jobMatcher($._config.job_names.ruler), '{{ user }}'), ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='s') + $.queryPanel( $.rulerQueries.groupEvaluations.latency % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], '{{ user }}' ), ) .addPanel( - $.panel('Failures') + + $.timeseriesPanel('Failures') + $.queryPanel( $.rulerQueries.perUserPerGroupEvaluations.failure % [$.jobMatcher($._config.job_names.ruler)], '{{ rule_group }}' ) @@ -201,7 +198,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( ($.row('Rule Evaluation per User') + { collapse: true }) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='s') + $.queryPanel( $.rulerQueries.perUserPerGroupEvaluations.latency % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], '{{ user }}' diff --git a/cortex-mixin/dashboards/scaling.libsonnet b/cortex-mixin/dashboards/scaling.libsonnet index 6ac244ea..e078a350 100644 --- a/cortex-mixin/dashboards/scaling.libsonnet +++ b/cortex-mixin/dashboards/scaling.libsonnet @@ -38,7 +38,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( ($.row('Scaling') + { height: '400px' }) .addPanel( - $.panel('Workload-based scaling') + { sort: { col: 0, desc: false } } + + $.timeseriesPanel('Workload-based scaling') + { sort: { col: 0, 
desc: false } } + $.tablePanel([ ||| sort_desc( diff --git a/cortex-mixin/dashboards/writes-resources.libsonnet b/cortex-mixin/dashboards/writes-resources.libsonnet index 64f83ef1..e11ac223 100644 --- a/cortex-mixin/dashboards/writes-resources.libsonnet +++ b/cortex-mixin/dashboards/writes-resources.libsonnet @@ -31,7 +31,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Ingester') .addPanel( - $.panel('In-memory series') + + $.timeseriesPanel('In-memory series') + $.queryPanel( 'sum by(%s) (cortex_ingester_memory_series{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], '{{%s}}' % $._config.per_instance_label diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet index c6563645..67d10581 100644 --- a/cortex-mixin/dashboards/writes.libsonnet +++ b/cortex-mixin/dashboards/writes.libsonnet @@ -34,7 +34,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; showTitle: false, }) .addPanel( - $.panel('Samples / sec') + + $.timeseriesPanel('Samples / sec') + $.statPanel( 'sum(%(group_prefix_jobs)s:cortex_distributor_received_samples:rate5m{%(job)s})' % ( $._config { @@ -45,7 +45,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Active Series') + + $.timeseriesPanel('Active Series') + $.statPanel(||| sum(cortex_ingester_memory_series{%(ingester)s} / on(%(group_by_cluster)s) group_left @@ -56,87 +56,84 @@ local utils = import 'mixin-utils/utils.libsonnet'; }, format='short') ) .addPanel( - $.panel('Tenants') + + $.timeseriesPanel('Tenants') + $.statPanel('count(count by(user) (cortex_ingester_active_series{%s}))' % $.jobMatcher($._config.job_names.ingester), format='short') ) .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.statPanel('sum(rate(cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}[5m]))' % $.jobMatcher($._config.job_names.gateway), format='reqps') ) ) .addRow( $.row('Gateway') 
.addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.gateway)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', 'api_(v1|prom)_push')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"api_(v1|prom)_push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.gateway)], '' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRow( $.row('Distributor') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}' % $.jobMatcher($._config.job_names.distributor)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.distributor) + [utils.selector.re('route', '/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route=~"/distributor.Distributor/Push|/httpgrpc.*|api_(v1|prom)_push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.distributor)], '' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRow( 
$.row('Key-value store for high-availability (HA) deduplication') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.distributor)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.distributor)) ) ) .addRow( $.row('Ingester') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_request_duration_seconds_count{%s,route="/cortex.Ingester/Push"}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('route', '/cortex.Ingester/Push')]) ) .addPanel( - $.panel('Per %s p99 Latency' % $._config.per_instance_label) + + $.timeseriesPanel('Per %s p99 Latency' % $._config.per_instance_label, unit='s') + $.hiddenLegendQueryPanel( 'histogram_quantile(0.99, sum by(le, %s) (rate(cortex_request_duration_seconds_bucket{%s, route="/cortex.Ingester/Push"}[$__rate_interval])))' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.ingester)], '' - ) + - { yaxes: $.yaxes('s') } + ) ) ) .addRow( $.row('Key-value store for the ingesters ring') .addPanel( - $.panel('Requests / sec') + + $.timeseriesPanel('Requests / sec') + $.qpsPanel('cortex_kv_request_duration_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester)) ) .addPanel( - $.panel('Latency') + + $.timeseriesPanel('Latency', unit='ms') + utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', $.jobSelector($._config.job_names.ingester)) ) ) @@ -158,7 +155,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Upload latency') + + $.timeseriesPanel('Upload latency', 
unit='ms') + $.latencyPanel('thanos_objstore_bucket_operation_duration_seconds', '{%s,component="ingester",operation="upload"}' % $.jobMatcher($._config.job_names.ingester)) + $.panelDescription( 'Upload latency', @@ -188,7 +185,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Compactions latency') + + $.timeseriesPanel('Compactions latency', unit='ms') + $.latencyPanel('cortex_ingester_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.ingester)) + $.panelDescription( 'Compaction latency', @@ -231,9 +228,8 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('WAL truncations latency (includes checkpointing)') + + $.timeseriesPanel('WAL truncations latency (includes checkpointing)', unit='s') + $.queryPanel('sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_sum{%s}[$__rate_interval])) / sum(rate(cortex_ingester_tsdb_wal_truncate_duration_seconds_count{%s}[$__rate_interval]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'avg') + - { yaxes: $.yaxes('s') } + $.panelDescription( 'WAL truncations latency (including checkpointing)', ||| @@ -243,7 +239,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ), ) .addPanel( - $.panel('Corruptions / sec') + + $.timeseriesPanel('Corruptions / sec', unit='ops') + $.queryPanel([ 'sum(rate(cortex_ingester_wal_corruptions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), 'sum(rate(cortex_ingester_tsdb_mmap_chunk_corruptions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester),