Skip to content

Commit

Permalink
Merge branch 'main' into fix-test-readme
Browse files Browse the repository at this point in the history
  • Loading branch information
friedrichg authored Jun 8, 2024
2 parents 254305c + 47240d3 commit 57197ed
Show file tree
Hide file tree
Showing 15 changed files with 57 additions and 666 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ cortex-mixin.zip
cortex-mixin/out
cortex-mixin/vendor
/test-readme/
.vscode
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
# Changelog

## master / unreleased
* [CHANGE] Enable shuffle sharding in compactors
* [CHANGE] Remove chunks support for dashboards
* [ENHANCEMENT] Configure `-ingester.client.grpc-compression` to be `snappy-block`
* [ENHANCEMENT] Support Grafana 11 in Cortex Service Scaling Dashboard

## 1.16.1
* [CHANGE] Upgrade memcached to 1.6.23-alpine and memcached-exporter to v0.14.2
Expand Down
161 changes: 0 additions & 161 deletions cortex-mixin/alerts/alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -71,27 +71,6 @@
|||,
},
},
{
// We're syncing every 10mins, and this means with a 5min rate, we will have a NaN when syncs fail
// and we will never trigger the alert.
// We also have a 3h grace period for creation of tables, which means we can fail for 3h before it's an outage.
alert: 'CortexTableSyncFailure',
expr: |||
100 * rate(cortex_table_manager_sync_duration_seconds_count{status_code!~"2.."}[15m])
/
rate(cortex_table_manager_sync_duration_seconds_count[15m])
> 10
|||,
'for': '30m',
labels: {
severity: 'critical',
},
annotations: {
message: |||
{{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% errors syncing tables.
|||,
},
},
{
alert: 'CortexQueriesIncorrect',
expr: |||
Expand Down Expand Up @@ -206,41 +185,6 @@
|||,
},
},
{
alert: 'CortexTransferFailed',
expr: |||
max_over_time(cortex_shutdown_duration_seconds_count{op="transfer",status!="success"}[15m])
|||,
'for': '5m',
labels: {
severity: 'critical',
},
annotations: {
message: |||
{{ $labels.job }}/{{ $labels.instance }} transfer failed.
|||,
},
},
{
alert: 'CortexOldChunkInMemory',
// Even though we should flush chunks after 6h, we see that the 99th percentile age of flushed chunks
// is closer to 10 hours.
// Ignore cortex_oldest_unflushed_chunk_timestamp_seconds that are zero (eg. distributors).
expr: |||
(time() - cortex_oldest_unflushed_chunk_timestamp_seconds > 36000)
and
(cortex_oldest_unflushed_chunk_timestamp_seconds > 0)
|||,
'for': '5m',
labels: {
severity: 'warning',
},
annotations: {
message: |||
{{ $labels.job }}/{{ $labels.instance }} has very old unflushed chunk in memory.
|||,
},
},
{
alert: 'CortexKVStoreFailure',
expr: |||
Expand Down Expand Up @@ -379,87 +323,6 @@
},
],
},
{
name: 'cortex_wal_alerts',
rules: [
{
// Alert immediately if WAL is corrupt.
alert: 'CortexWALCorruption',
expr: |||
increase(cortex_ingester_wal_corruptions_total[5m]) > 0
|||,
labels: {
severity: 'critical',
},
annotations: {
message: |||
{{ $labels.job }}/{{ $labels.instance }} has a corrupted WAL or checkpoint.
|||,
},
},
{
// One or more failed checkpoint creation is a warning.
alert: 'CortexCheckpointCreationFailed',
expr: |||
increase(cortex_ingester_checkpoint_creations_failed_total[10m]) > 0
|||,
labels: {
severity: 'warning',
},
annotations: {
message: |||
{{ $labels.job }}/{{ $labels.instance }} failed to create checkpoint.
|||,
},
},
{
// Two or more failed checkpoint creations in 1h means something is wrong.
alert: 'CortexCheckpointCreationFailed',
expr: |||
increase(cortex_ingester_checkpoint_creations_failed_total[1h]) > 1
|||,
labels: {
severity: 'critical',
},
annotations: {
message: |||
{{ $labels.job }}/{{ $labels.instance }} is failing to create checkpoint.
|||,
},
},
{
// One or more failed checkpoint deletion is a warning.
alert: 'CortexCheckpointDeletionFailed',
expr: |||
increase(cortex_ingester_checkpoint_deletions_failed_total[10m]) > 0
|||,
labels: {
severity: 'warning',
},
annotations: {
message: |||
{{ $labels.job }}/{{ $labels.instance }} failed to delete checkpoint.
|||,
},
},
{
// Two or more failed checkpoint deletions in 2h means something is wrong.
// We give this more buffer than creation as this is a less critical operation.
alert: 'CortexCheckpointDeletionFailed',
expr: |||
increase(cortex_ingester_checkpoint_deletions_failed_total[2h]) > 1
|||,
labels: {
severity: 'critical',
},
annotations: {
message: |||
{{ $labels.instance }} is failing to delete checkpoint.
|||,
},
},
],
},
{
name: 'cortex-rollout-alerts',
rules: [
Expand Down Expand Up @@ -524,30 +387,6 @@
{
name: 'cortex-provisioning',
rules: [
{
alert: 'CortexProvisioningMemcachedTooSmall',
// 4 x in-memory series size = 24hrs of data.
expr: |||
(
4 *
sum by (%s) (cortex_ingester_memory_series * cortex_ingester_chunk_size_bytes_sum / cortex_ingester_chunk_size_bytes_count)
/ 1e9
)
>
(
sum by (%s) (memcached_limit_bytes{job=~".+/memcached"}) / 1e9
)
||| % [$._config.alert_aggregation_labels, $._config.alert_aggregation_labels],
'for': '15m',
labels: {
severity: 'warning',
},
annotations: {
message: |||
Chunk memcached cluster in %(alert_aggregation_variables)s is too small, should be at least {{ printf "%%.2f" $value }}GB.
||| % $._config,
},
},
{
alert: 'CortexProvisioningTooManyActiveSeries',
// We target each ingester to 1.5M in-memory series. This alert fires if the average
Expand Down
14 changes: 1 addition & 13 deletions cortex-mixin/config.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,7 @@
grafanaDashboardShards: 4,

_config+:: {
// Switch for overall storage engine.
// May contain 'chunks', 'blocks' or both.
// Enables chunks- or blocks- specific panels and dashboards.
storage_engine: ['blocks'],

// For chunks backend, switch for chunk index type.
// May contain 'bigtable', 'dynamodb' or 'cassandra'.
chunk_index_backend: ['bigtable', 'dynamodb', 'cassandra'],

// For chunks backend, switch for chunk store type.
// May contain 'bigtable', 'dynamodb', 'cassandra', 's3' or 'gcs'.
chunk_store_backend: ['bigtable', 'dynamodb', 'cassandra', 's3', 'gcs'],
storage_engine: ['blocks'], // TODO: Remove this option, it's not needed

// Tags for dashboards.
tags: ['cortex'],
Expand All @@ -32,7 +21,6 @@
ruler: '(ruler|cortex$)',
query_frontend: '(query-frontend.*|cortex$)', // Match also custom query-frontend deployments.
query_scheduler: 'query-scheduler.*', // Not part of single-binary. Match also custom query-scheduler deployments.
table_manager: '(table-manager|cortex$)',
ring_members: ['compactor', 'distributor', 'ingester.*', 'querier.*', 'ruler', 'store-gateway', 'cortex'],
store_gateway: '(store-gateway|cortex$)',
gateway: '(gateway|cortex-gw|cortex-gw-internal)',
Expand Down
19 changes: 3 additions & 16 deletions cortex-mixin/dashboards.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -9,22 +9,9 @@
(import 'dashboards/writes.libsonnet') +
(import 'dashboards/slow-queries.libsonnet') +
(import 'dashboards/rollout-progress.libsonnet') +

(if std.member($._config.storage_engine, 'blocks')
then
(import 'dashboards/compactor.libsonnet') +
(import 'dashboards/compactor-resources.libsonnet') +
(import 'dashboards/object-store.libsonnet')
else {}) +

(if std.member($._config.storage_engine, 'chunks')
then import 'dashboards/chunks.libsonnet'
else {}) +

(if std.member($._config.storage_engine, 'blocks')
&& std.member($._config.storage_engine, 'chunks')
then import 'dashboards/comparison.libsonnet'
else {}) +
(import 'dashboards/compactor.libsonnet') +
(import 'dashboards/compactor-resources.libsonnet') +
(import 'dashboards/object-store.libsonnet') +

(if !$._config.resources_dashboards_enabled then {} else
(import 'dashboards/reads-resources.libsonnet') +
Expand Down
100 changes: 0 additions & 100 deletions cortex-mixin/dashboards/chunks.libsonnet

This file was deleted.

Loading

0 comments on commit 57197ed

Please sign in to comment.