From 47240d3729f8bf6ce2201bd97775c9b457b1853e Mon Sep 17 00:00:00 2001 From: Friedrich Gonzalez <1517449+friedrichg@users.noreply.github.com> Date: Thu, 30 May 2024 06:04:26 -0700 Subject: [PATCH] Remove deprecated chunks dashboards (#54) Signed-off-by: Friedrich Gonzalez --- CHANGELOG.md | 1 + cortex-mixin/alerts/alerts.libsonnet | 161 ------------------- cortex-mixin/config.libsonnet | 14 +- cortex-mixin/dashboards.libsonnet | 19 +-- cortex-mixin/dashboards/chunks.libsonnet | 100 ------------ cortex-mixin/dashboards/comparison.libsonnet | 105 ------------ cortex-mixin/dashboards/queries.libsonnet | 41 ----- cortex-mixin/dashboards/reads.libsonnet | 76 --------- cortex-mixin/dashboards/ruler.libsonnet | 41 ----- cortex-mixin/dashboards/writes.libsonnet | 64 -------- cortex-mixin/docs/playbooks.md | 41 ----- 11 files changed, 5 insertions(+), 658 deletions(-) delete mode 100644 cortex-mixin/dashboards/chunks.libsonnet delete mode 100644 cortex-mixin/dashboards/comparison.libsonnet diff --git a/CHANGELOG.md b/CHANGELOG.md index 4ff1583e..0c6f70c3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## master / unreleased * [CHANGE] Enable shuffle sharding in compactors +* [CHANGE] Remove chunks support for dashboards * [ENHANCEMENT] Configure `-ingester.client.grpc-compression` to be `snappy-block` * [ENHANCEMENT] Support Grafana 11 in Cortex Service Scaling Dashboard diff --git a/cortex-mixin/alerts/alerts.libsonnet b/cortex-mixin/alerts/alerts.libsonnet index a6287e5e..e67ef449 100644 --- a/cortex-mixin/alerts/alerts.libsonnet +++ b/cortex-mixin/alerts/alerts.libsonnet @@ -71,27 +71,6 @@ |||, }, }, - { - // We're syncing every 10mins, and this means with a 5min rate, we will have a NaN when syncs fail - // and we will never trigger the alert. - // We also have a 3h grace-period for creation of tables which means the we can fail for 3h before it's an outage. 
- alert: 'CortexTableSyncFailure', - expr: ||| - 100 * rate(cortex_table_manager_sync_duration_seconds_count{status_code!~"2.."}[15m]) - / - rate(cortex_table_manager_sync_duration_seconds_count[15m]) - > 10 - |||, - 'for': '30m', - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% errors syncing tables. - |||, - }, - }, { alert: 'CortexQueriesIncorrect', expr: ||| @@ -206,41 +185,6 @@ |||, }, }, - { - alert: 'CortexTransferFailed', - expr: ||| - max_over_time(cortex_shutdown_duration_seconds_count{op="transfer",status!="success"}[15m]) - |||, - 'for': '5m', - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - {{ $labels.job }}/{{ $labels.instance }} transfer failed. - |||, - }, - }, - { - alert: 'CortexOldChunkInMemory', - // Even though we should flush chunks after 6h, we see that 99p of age of flushed chunks is closer - // to 10 hours. - // Ignore cortex_oldest_unflushed_chunk_timestamp_seconds that are zero (eg. distributors). - expr: ||| - (time() - cortex_oldest_unflushed_chunk_timestamp_seconds > 36000) - and - (cortex_oldest_unflushed_chunk_timestamp_seconds > 0) - |||, - 'for': '5m', - labels: { - severity: 'warning', - }, - annotations: { - message: ||| - {{ $labels.job }}/{{ $labels.instance }} has very old unflushed chunk in memory. - |||, - }, - }, { alert: 'CortexKVStoreFailure', expr: ||| @@ -379,87 +323,6 @@ }, ], }, - { - name: 'cortex_wal_alerts', - rules: [ - { - // Alert immediately if WAL is corrupt. - alert: 'CortexWALCorruption', - expr: ||| - increase(cortex_ingester_wal_corruptions_total[5m]) > 0 - |||, - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - {{ $labels.job }}/{{ $labels.instance }} has a corrupted WAL or checkpoint. - |||, - }, - }, - { - // One or more failed checkpoint creation is a warning. 
- alert: 'CortexCheckpointCreationFailed', - expr: ||| - increase(cortex_ingester_checkpoint_creations_failed_total[10m]) > 0 - |||, - labels: { - severity: 'warning', - }, - annotations: { - message: ||| - {{ $labels.job }}/{{ $labels.instance }} failed to create checkpoint. - |||, - }, - }, - { - // Two or more failed checkpoint creation in 1h means something is wrong. - alert: 'CortexCheckpointCreationFailed', - expr: ||| - increase(cortex_ingester_checkpoint_creations_failed_total[1h]) > 1 - |||, - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - {{ $labels.job }}/{{ $labels.instance }} is failing to create checkpoint. - |||, - }, - }, - { - // One or more failed checkpoint deletion is a warning. - alert: 'CortexCheckpointDeletionFailed', - expr: ||| - increase(cortex_ingester_checkpoint_deletions_failed_total[10m]) > 0 - |||, - labels: { - severity: 'warning', - }, - annotations: { - message: ||| - {{ $labels.job }}/{{ $labels.instance }} failed to delete checkpoint. - |||, - }, - }, - { - // Two or more failed checkpoint deletion in 2h means something is wrong. - // We give this more buffer than creation as this is a less critical operation. - alert: 'CortexCheckpointDeletionFailed', - expr: ||| - increase(cortex_ingester_checkpoint_deletions_failed_total[2h]) > 1 - |||, - labels: { - severity: 'critical', - }, - annotations: { - message: ||| - {{ $labels.instance }} is failing to delete checkpoint. - |||, - }, - }, - ], - }, { name: 'cortex-rollout-alerts', rules: [ @@ -524,30 +387,6 @@ { name: 'cortex-provisioning', rules: [ - { - alert: 'CortexProvisioningMemcachedTooSmall', - // 4 x in-memory series size = 24hrs of data. 
- expr: ||| - ( - 4 * - sum by (%s) (cortex_ingester_memory_series * cortex_ingester_chunk_size_bytes_sum / cortex_ingester_chunk_size_bytes_count) - / 1e9 - ) - > - ( - sum by (%s) (memcached_limit_bytes{job=~".+/memcached"}) / 1e9 - ) - ||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels], - 'for': '15m', - labels: { - severity: 'warning', - }, - annotations: { - message: ||| - Chunk memcached cluster in %(alert_aggregation_variables)s is too small, should be at least {{ printf "%%.2f" $value }}GB. - ||| % $._config, - }, - }, { alert: 'CortexProvisioningTooManyActiveSeries', // We target each ingester to 1.5M in-memory series. This alert fires if the average diff --git a/cortex-mixin/config.libsonnet b/cortex-mixin/config.libsonnet index 06941b6d..2f620703 100644 --- a/cortex-mixin/config.libsonnet +++ b/cortex-mixin/config.libsonnet @@ -3,18 +3,7 @@ grafanaDashboardShards: 4, _config+:: { - // Switch for overall storage engine. - // May contain 'chunks', 'blocks' or both. - // Enables chunks- or blocks- specific panels and dashboards. - storage_engine: ['blocks'], - - // For chunks backend, switch for chunk index type. - // May contain 'bigtable', 'dynamodb' or 'cassandra'. - chunk_index_backend: ['bigtable', 'dynamodb', 'cassandra'], - - // For chunks backend, switch for chunk store type. - // May contain 'bigtable', 'dynamodb', 'cassandra', 's3' or 'gcs'. - chunk_store_backend: ['bigtable', 'dynamodb', 'cassandra', 's3', 'gcs'], + storage_engine: ['blocks'], // TODO: Remove this option, it's not needed // Tags for dashboards. tags: ['cortex'], @@ -32,7 +21,6 @@ ruler: '(ruler|cortex$)', query_frontend: '(query-frontend.*|cortex$)', // Match also custom query-frontend deployments. query_scheduler: 'query-scheduler.*', // Not part of single-binary. Match also custom query-scheduler deployments. 
- table_manager: '(table-manager|cortex$)', ring_members: ['compactor', 'distributor', 'ingester.*', 'querier.*', 'ruler', 'store-gateway', 'cortex'], store_gateway: '(store-gateway|cortex$)', gateway: '(gateway|cortex-gw|cortex-gw-internal)', diff --git a/cortex-mixin/dashboards.libsonnet b/cortex-mixin/dashboards.libsonnet index 9e7f71c2..e4b68c4b 100644 --- a/cortex-mixin/dashboards.libsonnet +++ b/cortex-mixin/dashboards.libsonnet @@ -9,22 +9,9 @@ (import 'dashboards/writes.libsonnet') + (import 'dashboards/slow-queries.libsonnet') + (import 'dashboards/rollout-progress.libsonnet') + - - (if std.member($._config.storage_engine, 'blocks') - then - (import 'dashboards/compactor.libsonnet') + - (import 'dashboards/compactor-resources.libsonnet') + - (import 'dashboards/object-store.libsonnet') - else {}) + - - (if std.member($._config.storage_engine, 'chunks') - then import 'dashboards/chunks.libsonnet' - else {}) + - - (if std.member($._config.storage_engine, 'blocks') - && std.member($._config.storage_engine, 'chunks') - then import 'dashboards/comparison.libsonnet' - else {}) + + (import 'dashboards/compactor.libsonnet') + + (import 'dashboards/compactor-resources.libsonnet') + + (import 'dashboards/object-store.libsonnet') + (if !$._config.resources_dashboards_enabled then {} else (import 'dashboards/reads-resources.libsonnet') + diff --git a/cortex-mixin/dashboards/chunks.libsonnet b/cortex-mixin/dashboards/chunks.libsonnet deleted file mode 100644 index b82c6880..00000000 --- a/cortex-mixin/dashboards/chunks.libsonnet +++ /dev/null @@ -1,100 +0,0 @@ -local utils = import 'mixin-utils/utils.libsonnet'; - -(import 'dashboard-utils.libsonnet') { - 'cortex-chunks.json': - ($.dashboard('Cortex / Chunks') + { uid: 'a56a3fa6284064eb392a115f3acbf744' }) - .addClusterSelectorTemplates() - .addRow( - $.row('Active Series / Chunks') - .addPanel( - $.panel('Series') + - $.queryPanel('sum(cortex_ingester_memory_series{%s})' % $.jobMatcher($._config.job_names.ingester), 
'series'), - ) - .addPanel( - $.panel('Chunks per series') + - $.queryPanel('sum(cortex_ingester_memory_chunks{%s}) / sum(cortex_ingester_memory_series{%s})' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'chunks'), - ) - ) - .addRow( - $.row('Flush Stats') - .addPanel( - $.panel('Utilization') + - $.latencyPanel('cortex_ingester_chunk_utilization', '{%s}' % $.jobMatcher($._config.job_names.ingester), multiplier='1') + - { yaxes: $.yaxes('percentunit') }, - ) - .addPanel( - $.panel('Age') + - $.latencyPanel('cortex_ingester_chunk_age_seconds', '{%s}' % $.jobMatcher($._config.job_names.ingester)), - ), - ) - .addRow( - $.row('Flush Stats') - .addPanel( - $.panel('Size') + - $.latencyPanel('cortex_ingester_chunk_length', '{%s}' % $.jobMatcher($._config.job_names.ingester), multiplier='1') + - { yaxes: $.yaxes('short') }, - ) - .addPanel( - $.panel('Entries') + - $.queryPanel('sum(rate(cortex_chunk_store_index_entries_per_chunk_sum{%s}[5m])) / sum(rate(cortex_chunk_store_index_entries_per_chunk_count{%s}[5m]))' % [$.jobMatcher($._config.job_names.ingester), $.jobMatcher($._config.job_names.ingester)], 'entries'), - ), - ) - .addRow( - $.row('Flush Stats') - .addPanel( - $.panel('Queue Length') + - $.queryPanel('cortex_ingester_flush_queue_length{%s}' % $.jobMatcher($._config.job_names.ingester), '{{%s}}' % $._config.per_instance_label), - ) - .addPanel( - $.panel('Flush Rate') + - $.qpsPanel('cortex_ingester_chunk_age_seconds_count{%s}' % $.jobMatcher($._config.job_names.ingester)), - ), - ), - - 'cortex-wal.json': - ($.dashboard('Cortex / WAL') + { uid: 'd4fb924cdc1581cd8e870e3eb0110bda' }) - .addClusterSelectorTemplates() - .addRow( - $.row('') - .addPanel( - $.panel('Bytes Logged (WAL+Checkpoint) / ingester / second') + - $.queryPanel('avg(rate(cortex_ingester_wal_logged_bytes_total{%(m)s}[$__rate_interval])) + avg(rate(cortex_ingester_checkpoint_logged_bytes_total{%(m)s}[$__rate_interval]))' % { m: 
$.jobMatcher($._config.job_names.ingester) }, 'bytes') + - { yaxes: $.yaxes('bytes') }, - ) - ) - .addRow( - $.row('WAL') - .addPanel( - $.panel('Records logged / ingester / second') + - $.queryPanel('avg(rate(cortex_ingester_wal_records_logged_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.ingester), 'records'), - ) - .addPanel( - $.panel('Bytes per record') + - $.queryPanel('avg(rate(cortex_ingester_wal_logged_bytes_total{%(m)s}[$__rate_interval]) / rate(cortex_ingester_wal_records_logged_total{%(m)s}[$__rate_interval]))' % { m: $.jobMatcher($._config.job_names.ingester) }, 'bytes') + - { yaxes: $.yaxes('bytes') }, - ) - .addPanel( - $.panel('Bytes per sample') + - $.queryPanel('avg(rate(cortex_ingester_wal_logged_bytes_total{%(m)s}[$__rate_interval]) / rate(cortex_ingester_ingested_samples_total{%(m)s}[$__rate_interval]))' % { m: $.jobMatcher($._config.job_names.ingester) }, 'bytes') + - { yaxes: $.yaxes('bytes') }, - ) - .addPanel( - $.panel('Min(available disk space)') + - $.queryPanel('min(kubelet_volume_stats_available_bytes{cluster=~"$cluster", namespace=~"$namespace", persistentvolumeclaim=~"ingester.*"})', 'bytes') + - { yaxes: $.yaxes('bytes') }, - ) - ) - .addRow( - $.row('Checkpoint') - .addPanel( - $.panel('Checkpoint creation/deletion / sec') + - $.queryPanel('rate(cortex_ingester_checkpoint_creations_total{%s}[$__rate_interval])' % $.jobMatcher($._config.job_names.ingester), '{{%s}}-creation' % $._config.per_instance_label) + - $.queryPanel('rate(cortex_ingester_checkpoint_deletions_total{%s}[$__rate_interval])' % $.jobMatcher($._config.job_names.ingester), '{{%s}}-deletion' % $._config.per_instance_label), - ) - .addPanel( - $.panel('Checkpoint creation/deletion failed / sec') + - $.queryPanel('rate(cortex_ingester_checkpoint_creations_failed_total{%s}[$__rate_interval])' % $.jobMatcher($._config.job_names.ingester), '{{%s}}-creation' % $._config.per_instance_label) + - 
$.queryPanel('rate(cortex_ingester_checkpoint_deletions_failed_total{%s}[$__rate_interval])' % $.jobMatcher($._config.job_names.ingester), '{{%s}}-deletion' % $._config.per_instance_label), - ) - ), -} diff --git a/cortex-mixin/dashboards/comparison.libsonnet b/cortex-mixin/dashboards/comparison.libsonnet deleted file mode 100644 index 1716f7d4..00000000 --- a/cortex-mixin/dashboards/comparison.libsonnet +++ /dev/null @@ -1,105 +0,0 @@ -local utils = import 'mixin-utils/utils.libsonnet'; - -(import 'dashboard-utils.libsonnet') -{ - 'cortex-blocks-vs-chunks.json': - ($.dashboard('Cortex / Blocks vs Chunks') + { uid: '0e2b4dd23df9921972e3fb554c0fc483' }) - .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*cortex.*"}', 'cluster') - .addTemplate('blocks_namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') - .addTemplate('chunks_namespace', 'kube_pod_container_info{image=~".*cortex.*"}', 'namespace') - .addRow( - $.row('Ingesters') - .addPanel( - $.panel('Samples / sec') + - $.queryPanel('sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job=~"($blocks_namespace)/ingester"}[$__rate_interval]))', 'blocks') + - $.queryPanel('sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job=~"($chunks_namespace)/ingester"}[$__rate_interval]))', 'chunks') - ) - ) - .addRow( - $.row('') - .addPanel( - $.panel('Blocks Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($blocks_namespace)/ingester'), utils.selector.eq('route', '/cortex.Ingester/Push')]) - ) - .addPanel( - $.panel('Chunks Latency') + - utils.latencyRecordingRulePanel('cortex_request_duration_seconds', [utils.selector.re('cluster', '$cluster'), utils.selector.re('job', '($chunks_namespace)/ingester'), utils.selector.eq('route', '/cortex.Ingester/Push')]) - ) - ) - .addRow( - $.row('') - .addPanel( - $.panel('CPU per sample') + - 
$.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container="ingester"}[$__rate_interval])) / sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job="$blocks_namespace/ingester"}[$__rate_interval]))', 'blocks') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container="ingester"}[$__rate_interval])) / sum(rate(cortex_ingester_ingested_samples_total{cluster=~"$cluster",job="$chunks_namespace/ingester"}[$__rate_interval]))', 'chunks') - ) - .addPanel( - $.panel('Memory per active series') + - $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$blocks_namespace",container="ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$blocks_namespace/ingester"})', 'blocks - working set') + - $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$chunks_namespace",container="ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$chunks_namespace/ingester"})', 'chunks - working set') + - $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$blocks_namespace/ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$blocks_namespace/ingester"})', 'blocks - heap inuse') + - $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$chunks_namespace/ingester"}) / sum(cortex_ingester_memory_series{cluster=~"$cluster",job=~"$chunks_namespace/ingester"})', 'chunks - heap inuse') + - { yaxes: $.yaxes('bytes') } - ) - ) - .addRow( - $.row('') - .addPanel( - $.panel('CPU') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container="ingester"}[$__rate_interval]))', 'blocks') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container="ingester"}[$__rate_interval]))', 'chunks') - ) 
- .addPanel( - $.panel('Memory') + - $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$blocks_namespace",container="ingester"})', 'blocks - working set') + - $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$chunks_namespace",container="ingester"})', 'chunks - working set') + - $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$blocks_namespace/ingester"})', 'blocks - heap inuse') + - $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$chunks_namespace/ingester"})', 'chunks - heap inuse') + - { yaxes: $.yaxes('bytes') } - ) - ) - .addRow( - $.row('Queriers') - .addPanel( - $.panel('Queries / sec (query-frontend)') + - $.queryPanel('sum(rate(cortex_request_duration_seconds_count{cluster=~"$cluster",job="$blocks_namespace/query-frontend",route!="metrics"}[$__rate_interval]))', 'blocks') + - $.queryPanel('sum(rate(cortex_request_duration_seconds_count{cluster=~"$cluster",job="$chunks_namespace/query-frontend",route!="metrics"}[$__rate_interval]))', 'chunks') - ) - .addPanel( - $.panel('Queries / sec (query-tee)') + - $.queryPanel('sum(rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__rate_interval]))', 'blocks') + - $.queryPanel('sum(rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__rate_interval]))', 'chunks') - ) - ) - .addRow( - $.row('') - .addPanel( - $.panel('Latency 99th') + - $.queryPanel('histogram_quantile(0.99, sum by(backend, le) (rate(cortex_querytee_request_duration_seconds_bucket{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__rate_interval])))', 'blocks') + - $.queryPanel('histogram_quantile(0.99, sum by(backend, le) (rate(cortex_querytee_request_duration_seconds_bucket{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__rate_interval])))', 'chunks') + - { yaxes: 
$.yaxes('s') } - ) - .addPanel( - $.panel('Latency average') + - $.queryPanel('sum by(backend) (rate(cortex_querytee_request_duration_seconds_sum{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__rate_interval])) / sum by(backend) (rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$blocks_namespace\\\\..*"}[$__rate_interval]))', 'blocks') + - $.queryPanel('sum by(backend) (rate(cortex_querytee_request_duration_seconds_sum{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__rate_interval])) / sum by(backend) (rate(cortex_querytee_request_duration_seconds_count{cluster=~"$cluster",backend=~".*\\\\.$chunks_namespace\\\\..*"}[$__rate_interval]))', 'chunks') + - { yaxes: $.yaxes('s') } - ) - ) - .addRow( - $.row('') - .addPanel( - $.panel('CPU') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$blocks_namespace",container="querier"}[$__rate_interval]))', 'blocks') + - $.queryPanel('sum(rate(container_cpu_usage_seconds_total{cluster=~"$cluster",namespace="$chunks_namespace",container="querier"}[$__rate_interval]))', 'chunks') - ) - .addPanel( - $.panel('Memory') + - $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$blocks_namespace",container="querier"})', 'blocks - working set') + - $.queryPanel('sum(container_memory_working_set_bytes{cluster=~"$cluster",namespace="$chunks_namespace",container="querier"})', 'chunks - working set') + - $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$blocks_namespace/querier"})', 'blocks - heap inuse') + - $.queryPanel('sum(go_memstats_heap_inuse_bytes{cluster=~"$cluster",job=~"$chunks_namespace/querier"})', 'chunks - heap inuse') + - { yaxes: $.yaxes('bytes') } - ) - ), -} diff --git a/cortex-mixin/dashboards/queries.libsonnet b/cortex-mixin/dashboards/queries.libsonnet index 853d0f8c..cada5c8e 100644 --- a/cortex-mixin/dashboards/queries.libsonnet +++ 
b/cortex-mixin/dashboards/queries.libsonnet @@ -110,23 +110,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.queryPanel('sum(rate(cortex_cache_corrupt_chunks_total{%s}[1m]))' % $.jobMatcher($._config.job_names.querier), 'Corrupt chunks'), ) ) - .addRowIf( - std.member($._config.storage_engine, 'chunks'), - $.row('Querier - Chunks storage - Index Cache') - .addPanel( - $.panel('Total entries') + - $.queryPanel('sum(querier_cache_added_new_total{cache="store.index-cache-read.fifocache",%s}) - sum(querier_cache_evicted_total{cache="store.index-cache-read.fifocache",%s})' % [$.jobMatcher($._config.job_names.querier), $.jobMatcher($._config.job_names.querier)], 'Entries'), - ) - .addPanel( - $.panel('Cache Hit %') + - $.queryPanel('(sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache",%s}[1m])) - sum(rate(querier_cache_misses_total{cache="store.index-cache-read.fifocache",%s}[1m]))) / sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache",%s}[1m]))' % [$.jobMatcher($._config.job_names.querier), $.jobMatcher($._config.job_names.querier), $.jobMatcher($._config.job_names.querier)], 'hit rate') - { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, - ) - .addPanel( - $.panel('Churn Rate') + - $.queryPanel('sum(rate(querier_cache_evicted_total{cache="store.index-cache-read.fifocache",%s}[1m]))' % $.jobMatcher($._config.job_names.querier), 'churn rate'), - ) - ) .addRow( $.row('Ingester') .addPanel( @@ -145,30 +128,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('short') }, ) ) - .addRowIf( - std.member($._config.storage_engine, 'chunks'), - $.row('Querier - Chunks storage - Store') - .addPanel( - $.panel('Index Lookups per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_index_lookups_per_query', $.jobSelector($._config.job_names.querier), multiplier=1) + - { yaxes: $.yaxes('short') }, - ) - .addPanel( - $.panel('Series (pre-intersection) per Query') + - 
utils.latencyRecordingRulePanel('cortex_chunk_store_series_pre_intersection_per_query', $.jobSelector($._config.job_names.querier), multiplier=1) + - { yaxes: $.yaxes('short') }, - ) - .addPanel( - $.panel('Series (post-intersection) per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_series_post_intersection_per_query', $.jobSelector($._config.job_names.querier), multiplier=1) + - { yaxes: $.yaxes('short') }, - ) - .addPanel( - $.panel('Chunks per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_chunks_per_query', $.jobSelector($._config.job_names.querier), multiplier=1) + - { yaxes: $.yaxes('short') }, - ) - ) .addRowIf( std.member($._config.storage_engine, 'blocks'), $.row('Querier - Blocks storage') diff --git a/cortex-mixin/dashboards/reads.libsonnet b/cortex-mixin/dashboards/reads.libsonnet index 9bc9b7d6..5a720784 100644 --- a/cortex-mixin/dashboards/reads.libsonnet +++ b/cortex-mixin/dashboards/reads.libsonnet @@ -216,30 +216,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; { yaxes: $.yaxes('s') } ) ) - .addRowIf( - std.member($._config.storage_engine, 'chunks'), - $.row('Memcached - Chunks storage - Index') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_cache_request_duration_seconds_count{%s,method="store.index-cache-read.memcache.fetch"}' % $.jobMatcher($._config.job_names.querier)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.eq('method', 'store.index-cache-read.memcache.fetch')]) - ) - ) - .addRowIf( - std.member($._config.storage_engine, 'chunks'), - $.row('Memcached - Chunks storage - Chunks') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_cache_request_duration_seconds_count{%s,method="chunksmemcache.fetch"}' % $.jobMatcher($._config.job_names.querier)) - ) - .addPanel( - $.panel('Latency') + - 
utils.latencyRecordingRulePanel('cortex_cache_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.eq('method', 'chunksmemcache.fetch')]) - ) - ) .addRowIf( std.member($._config.storage_engine, 'blocks'), $.row('Memcached – Blocks storage – Block index cache (store-gateway accesses)') // Resembles thanosMemcachedCache @@ -339,58 +315,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; 'metadata-cache' ) ) - .addRowIf( - std.member($._config.storage_engine, 'chunks') && - std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'cassandra'), - $.row('Cassandra') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_cassandra_request_duration_seconds_count{%s, operation="SELECT"}' % $.jobMatcher($._config.job_names.querier)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cassandra_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.eq('operation', 'SELECT')]) - ) - ) - .addRowIf( - std.member($._config.storage_engine, 'chunks') && - std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'bigtable'), - $.row('BigTable') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_bigtable_request_duration_seconds_count{%s, operation="/google.bigtable.v2.Bigtable/ReadRows"}' % $.jobMatcher($._config.job_names.querier)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_bigtable_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/ReadRows')]) - ), - ) - .addRowIf( - std.member($._config.storage_engine, 'chunks') && - std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'dynamodb'), - $.row('DynamoDB') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_dynamo_request_duration_seconds_count{%s, operation="DynamoDB.QueryPages"}' % 
$.jobMatcher($._config.job_names.querier)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_dynamo_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.eq('operation', 'DynamoDB.QueryPages')]) - ), - ) - .addRowIf( - std.member($._config.storage_engine, 'chunks') && - std.member($._config.chunk_store_backend, 'gcs'), - $.row('GCS') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_gcs_request_duration_seconds_count{%s, operation="GET"}' % $.jobMatcher($._config.job_names.querier)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_gcs_request_duration_seconds', $.jobSelector($._config.job_names.querier) + [utils.selector.eq('operation', 'GET')]) - ) - ) // Object store metrics for the store-gateway. .addRowsIf( std.member($._config.storage_engine, 'blocks'), diff --git a/cortex-mixin/dashboards/ruler.libsonnet b/cortex-mixin/dashboards/ruler.libsonnet index d1062581..b243198c 100644 --- a/cortex-mixin/dashboards/ruler.libsonnet +++ b/cortex-mixin/dashboards/ruler.libsonnet @@ -144,47 +144,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.latencyPanel('cortex_ingester_client_request_duration_seconds', '{%s, operation="/cortex.Ingester/QueryStream"}' % $.jobMatcher($._config.job_names.ruler)) ) ) - .addRowIf( - std.member($._config.storage_engine, 'chunks'), - $.row('Ruler - Chunks storage - Index Cache') - .addPanel( - $.panel('Total entries') + - $.queryPanel('sum(querier_cache_added_new_total{cache="store.index-cache-read.fifocache",%s}) - sum(querier_cache_evicted_total{cache="store.index-cache-read.fifocache",%s})' % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], 'Entries'), - ) - .addPanel( - $.panel('Cache Hit %') + - $.queryPanel('(sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache",%s}[1m])) - sum(rate(querier_cache_misses_total{cache="store.index-cache-read.fifocache",%s}[1m]))) 
/ sum(rate(querier_cache_gets_total{cache="store.index-cache-read.fifocache",%s}[1m]))' % [$.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler), $.jobMatcher($._config.job_names.ruler)], 'hit rate') - { yaxes: $.yaxes({ format: 'percentunit', max: 1 }) }, - ) - .addPanel( - $.panel('Churn Rate') + - $.queryPanel('sum(rate(querier_cache_evicted_total{cache="store.index-cache-read.fifocache",%s}[1m]))' % $.jobMatcher($._config.job_names.ruler), 'churn rate'), - ) - ) - .addRowIf( - std.member($._config.storage_engine, 'chunks'), - $.row('Ruler - Chunks storage - Store') - .addPanel( - $.panel('Index Lookups per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_index_lookups_per_query', $.jobSelector($._config.job_names.ruler), multiplier=1) + - { yaxes: $.yaxes('short') }, - ) - .addPanel( - $.panel('Series (pre-intersection) per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_series_pre_intersection_per_query', $.jobSelector($._config.job_names.ruler), multiplier=1) + - { yaxes: $.yaxes('short') }, - ) - .addPanel( - $.panel('Series (post-intersection) per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_series_post_intersection_per_query', $.jobSelector($._config.job_names.ruler), multiplier=1) + - { yaxes: $.yaxes('short') }, - ) - .addPanel( - $.panel('Chunks per Query') + - utils.latencyRecordingRulePanel('cortex_chunk_store_chunks_per_query', $.jobSelector($._config.job_names.ruler), multiplier=1) + - { yaxes: $.yaxes('short') }, - ) - ) .addRowIf( std.member($._config.storage_engine, 'blocks'), $.row('Ruler - Blocks storage') diff --git a/cortex-mixin/dashboards/writes.libsonnet b/cortex-mixin/dashboards/writes.libsonnet index e99faee4..c6563645 100644 --- a/cortex-mixin/dashboards/writes.libsonnet +++ b/cortex-mixin/dashboards/writes.libsonnet @@ -140,70 +140,6 @@ local utils = import 'mixin-utils/utils.libsonnet'; utils.latencyRecordingRulePanel('cortex_kv_request_duration_seconds', 
$.jobSelector($._config.job_names.ingester)) ) ) - .addRowIf( - std.member($._config.storage_engine, 'chunks'), - $.row('Memcached') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_memcache_request_duration_seconds_count{%s,method="Memcache.Put"}' % $.jobMatcher($._config.job_names.ingester)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_memcache_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('method', 'Memcache.Put')]) - ) - ) - .addRowIf( - std.member($._config.storage_engine, 'chunks') && - std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'cassandra'), - $.row('Cassandra') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_cassandra_request_duration_seconds_count{%s, operation="INSERT"}' % $.jobMatcher($._config.job_names.ingester)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_cassandra_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('operation', 'INSERT')]) - ) - ) - .addRowIf( - std.member($._config.storage_engine, 'chunks') && - std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'bigtable'), - $.row('BigTable') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_bigtable_request_duration_seconds_count{%s, operation="/google.bigtable.v2.Bigtable/MutateRows"}' % $.jobMatcher($._config.job_names.ingester)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_bigtable_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('operation', '/google.bigtable.v2.Bigtable/MutateRows')]) - ) - ) - .addRowIf( - std.member($._config.storage_engine, 'chunks') && - std.member($._config.chunk_index_backend + $._config.chunk_store_backend, 'dynamodb'), - $.row('DynamoDB') - .addPanel( - $.panel('Requests / sec') + - 
$.qpsPanel('cortex_dynamo_request_duration_seconds_count{%s, operation="DynamoDB.BatchWriteItem"}' % $.jobMatcher($._config.job_names.ingester)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_dynamo_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('operation', 'DynamoDB.BatchWriteItem')]) - ) - ) - .addRowIf( - std.member($._config.storage_engine, 'chunks') && - std.member($._config.chunk_store_backend, 'gcs'), - $.row('GCS') - .addPanel( - $.panel('Requests / sec') + - $.qpsPanel('cortex_gcs_request_duration_seconds_count{%s, operation="POST"}' % $.jobMatcher($._config.job_names.ingester)) - ) - .addPanel( - $.panel('Latency') + - utils.latencyRecordingRulePanel('cortex_gcs_request_duration_seconds', $.jobSelector($._config.job_names.ingester) + [utils.selector.eq('operation', 'POST')]) - ) - ) .addRowIf( std.member($._config.storage_engine, 'blocks'), $.row('Ingester - Blocks storage - Shipper') diff --git a/cortex-mixin/docs/playbooks.md b/cortex-mixin/docs/playbooks.md index c1ee3ef4..b5b68895 100644 --- a/cortex-mixin/docs/playbooks.md +++ b/cortex-mixin/docs/playbooks.md @@ -198,8 +198,6 @@ How to **investigate**: - If the failing service is going OOM (`OOMKilled`): scale up or increase the memory - If the failing service is crashing / panicking: look for the stack trace in the logs and investigate from there -### CortexTransferFailed -This alert goes off when an ingester fails to find another node to transfer its data to when it was shutting down. If there is both a pod stuck terminating and one stuck joining, look at the kubernetes events. This may be due to scheduling problems caused by some combination of anti affinity rules/resource utilization. Adding a new node can help in these circumstances. 
You can see recent events associated with a resource via kubectl describe, ex: `kubectl -n <namespace> describe pod <pod>` ### CortexIngesterUnhealthy This alert goes off when an ingester is marked as unhealthy. Check the ring web page to see which is marked as unhealthy. You could then check the logs to see if there are any related to that ingester ex: `kubectl logs -f ingester-01 --namespace=prod`. A simple way to resolve this may be to click the "Forgot" button on the ring page, especially if the pod doesn't exist anymore. It might not exist anymore because it was on a node that got shut down, so you could check to see if there are any logs related to the node that pod is/was on, ex: `kubectl get events --namespace=prod | grep cloud-provider-node`. @@ -464,29 +462,6 @@ How to **investigate**: - Safely manually delete the block from the bucket if was a partial delete or an upload failed by a compactor - Further investigate if was an upload failed by an ingester but not later retried (ingesters are expected to retry uploads until succeed) -### CortexWALCorruption - -This alert is only related to the chunks storage. This can happen because of 2 reasons: (1) Non graceful shutdown of ingesters. (2) Faulty storage or NFS. - -WAL corruptions are only detected at startups, so at this point the WAL/Checkpoint would have been repaired automatically. So we can only check what happened and if there was any data loss and take actions to avoid this happening in future. - -1. Check if there was any node restarts that force killed pods. If there is, then the corruption is from the non graceful shutdown of ingesters, which is generally fine. You can: - * Describe the pod to see the last state. - * Use `kube_pod_info` to check the node for the pod. `node_boot_time_seconds` to see if node just booted (which also indicates restart). - * You can use `eventrouter` logs to double check. - * Check ingester logs to check if the shutdown logs are missing at that time. -2.
To confirm this, in the logs, check the WAL segment on which the corruption happened (let's say `X`) and the last checkpoint attempt number (let's say `Y`, this is the last WAL segment that was present when checkpointing started). -3. If `X > Y`, then it's most likely an abrupt restart of ingester and the corruption would be on the last few records of the last segment. To verify this, check the file timestamps of WAL segment `X` and `X - 1` if they were recent. -4. If `X < Y`, then the corruption was in some WAL segment which was not the last one. This indicates faulty disk and some data loss on that ingester. -5. In case of faulty disk corruption, if the number or ingesters that had corruption within the chunk flush age: - 1. Less than the quorum number for your replication factor: No data loss, because there is a guarantee that the data is replicated. For example, if replication factor is 3, then it's fine if corruption was on 1 ingester. - 2. Equal or more than the quorum number but less than replication factor: There is a good chance that there is no data loss if it was replicated to desired number of ingesters. But it's good to check once for data loss. - 3. Equal or more than the replication factor: Then there is definitely some data loss. 
- -### CortexTableSyncFailure - -_This alert applies to Cortex chunks storage only._ - ### CortexQueriesIncorrect _TODO: this playbook has not been written yet._ @@ -578,22 +553,6 @@ How to **investigate**: - `other` - Check both Cortex and memcached logs to find more details -### CortexOldChunkInMemory - -_This alert applies to Cortex chunks storage only._ - -### CortexCheckpointCreationFailed - -_This alert applies to Cortex chunks storage only._ - -### CortexCheckpointDeletionFailed - -_This alert applies to Cortex chunks storage only._ - -### CortexProvisioningMemcachedTooSmall - -_This alert applies to Cortex chunks storage only._ - ### CortexProvisioningTooManyActiveSeries This alert fires if the average number of in-memory series per ingester is above our target (1.5M).