From 5b02f05126d9e43f206dda3c3b8ce69773c066b6 Mon Sep 17 00:00:00 2001 From: Ben Ye Date: Mon, 31 Oct 2022 10:45:22 -0700 Subject: [PATCH] Make cortex_bucket_store_blocks_loaded metric per user (#4918) * make cortex_bucket_store_blocks_loaded metric per user Signed-off-by: Ben Ye * fix integration test Signed-off-by: Ben Ye * update changelog Signed-off-by: Ben Ye * fix test Signed-off-by: Ben Ye * update changelog Signed-off-by: Ben Ye Signed-off-by: Ben Ye --- CHANGELOG.md | 1 + .../getting_started_with_gossiped_ring_test.go | 8 ++------ integration/querier_test.go | 4 +++- pkg/storegateway/bucket_store_metrics.go | 4 ++-- pkg/storegateway/bucket_store_metrics_test.go | 4 +++- pkg/storegateway/bucket_stores_test.go | 17 ++++++++--------- 6 files changed, 19 insertions(+), 19 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 62fd1bd335a..368a385fa4e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -43,6 +43,7 @@ * [CHANGE] Removes `-ingester.stream-chunks-when-using-blocks` experimental flag and stream chunks by default when `querier.ingester-streaming` is enabled. #4864 * [CHANGE] Compactor: Added `cortex_compactor_runs_interrupted_total` to separate compaction interruptions from failures * [CHANGE] Enable PromQL `@` modifier, negative offset always. #4927 +* [CHANGE] Store-gateway: Add user label to `cortex_bucket_store_blocks_loaded` metric. #4918 * [ENHANCEMENT] AlertManager: Retrying AlertManager Get Requests (Get Alertmanager status, Get Alertmanager Receivers) on next replica on error #4840 * [ENHANCEMENT] Querier/Ruler: Retry store-gateway in case of unexpected failure, instead of failing the query. #4532 #4839 * [ENHANCEMENT] Ring: DoBatch prioritize 4xx errors when failing. #4783 diff --git a/integration/getting_started_with_gossiped_ring_test.go b/integration/getting_started_with_gossiped_ring_test.go index bc7dbd42c83..3b9239310b0 100644 --- a/integration/getting_started_with_gossiped_ring_test.go +++ b/integration/getting_started_with_gossiped_ring_test.go @@ -110,10 +110,6 @@ func TestGettingStartedWithGossipedRing(t *testing.T) { require.Equal(t, model.ValVector, result.Type()) assert.Equal(t, expectedVector, result.(model.Vector)) - // Before flushing the blocks we expect no store-gateway has loaded any block. - require.NoError(t, cortex1.WaitSumMetrics(e2e.Equals(0), "cortex_bucket_store_blocks_loaded")) - require.NoError(t, cortex2.WaitSumMetrics(e2e.Equals(0), "cortex_bucket_store_blocks_loaded")) - // Flush blocks from ingesters to the store. for _, instance := range []*e2ecortex.CortexService{cortex1, cortex2} { res, err = e2e.GetRequest("http://" + instance.HTTPEndpoint() + "/flush") @@ -124,8 +120,8 @@ func TestGettingStartedWithGossipedRing(t *testing.T) { // Given store-gateway blocks sharding is enabled with the default replication factor of 3, // and ingestion replication factor is 1, we do expect the series has been ingested by 1 // single ingester and so we have 1 block shipped from ingesters and loaded by both store-gateways. - require.NoError(t, cortex1.WaitSumMetrics(e2e.Equals(1), "cortex_bucket_store_blocks_loaded")) - require.NoError(t, cortex2.WaitSumMetrics(e2e.Equals(1), "cortex_bucket_store_blocks_loaded")) + require.NoError(t, cortex1.WaitSumMetricsWithOptions(e2e.Equals(1), []string{"cortex_bucket_store_blocks_loaded"}, e2e.WaitMissingMetrics)) + require.NoError(t, cortex2.WaitSumMetricsWithOptions(e2e.Equals(1), []string{"cortex_bucket_store_blocks_loaded"}, e2e.WaitMissingMetrics)) // Make sure that no DNS failures occurred. // No actual DNS lookups are necessarily performed, so we can't really assert on that. diff --git a/integration/querier_test.go b/integration/querier_test.go index ff3ff01d270..a8a7c56712e 100644 --- a/integration/querier_test.go +++ b/integration/querier_test.go @@ -186,7 +186,9 @@ func TestQuerierWithBlocksStorageRunningInMicroservicesMode(t *testing.T) { // we don't known which store-gateway instance will synch the blocks, so we need to wait on // metrics extracted from all instances. if testCfg.blocksShardingStrategy != "" { - require.NoError(t, storeGateways.WaitSumMetrics(e2e.Equals(2), "cortex_bucket_store_blocks_loaded")) + // If shuffle sharding is enabled and we have tenant shard size set to 1, + // then the metric only appears in one store gateway instance. + require.NoError(t, storeGateways.WaitSumMetricsWithOptions(e2e.Equals(2), []string{"cortex_bucket_store_blocks_loaded"}, e2e.SkipMissingMetrics)) } else { require.NoError(t, storeGateways.WaitSumMetrics(e2e.Equals(float64(2*storeGateways.NumInstances())), "cortex_bucket_store_blocks_loaded")) } diff --git a/pkg/storegateway/bucket_store_metrics.go b/pkg/storegateway/bucket_store_metrics.go index bfed90a093a..567f5c39d32 100644 --- a/pkg/storegateway/bucket_store_metrics.go +++ b/pkg/storegateway/bucket_store_metrics.go @@ -67,7 +67,7 @@ func NewBucketStoreMetrics() *BucketStoreMetrics { blocksLoaded: prometheus.NewDesc( "cortex_bucket_store_blocks_loaded", "Number of currently loaded blocks.", - nil, nil), + []string{"user"}, nil), seriesDataTouched: prometheus.NewDesc( "cortex_bucket_store_series_data_touched", "How many items of a data type in a block were touched for a single series request.", @@ -212,7 +212,7 @@ func (m *BucketStoreMetrics) Collect(out chan<- prometheus.Metric) { data.SendSumOfCounters(out, m.blockDrops, "thanos_bucket_store_block_drops_total") data.SendSumOfCounters(out, m.blockDropFailures, "thanos_bucket_store_block_drop_failures_total") - data.SendSumOfGauges(out, m.blocksLoaded, "thanos_bucket_store_blocks_loaded") + data.SendSumOfGaugesPerUser(out, m.blocksLoaded, "thanos_bucket_store_blocks_loaded") data.SendSumOfSummariesWithLabels(out, m.seriesDataTouched, "thanos_bucket_store_series_data_touched", "data_type") data.SendSumOfSummariesWithLabels(out, m.seriesDataFetched, "thanos_bucket_store_series_data_fetched", "data_type") diff --git a/pkg/storegateway/bucket_store_metrics_test.go b/pkg/storegateway/bucket_store_metrics_test.go index 2990c20a5c6..a400cb8c96e 100644 --- a/pkg/storegateway/bucket_store_metrics_test.go +++ b/pkg/storegateway/bucket_store_metrics_test.go @@ -25,7 +25,9 @@ func TestBucketStoreMetrics(t *testing.T) { err := testutil.GatherAndCompare(mainReg, bytes.NewBufferString(` # HELP cortex_bucket_store_blocks_loaded Number of currently loaded blocks. # TYPE cortex_bucket_store_blocks_loaded gauge - cortex_bucket_store_blocks_loaded 22519 + cortex_bucket_store_blocks_loaded{user="user1"} 5328 + cortex_bucket_store_blocks_loaded{user="user2"} 6908 + cortex_bucket_store_blocks_loaded{user="user3"} 10283 # HELP cortex_bucket_store_block_loads_total Total number of remote block loading attempts. # TYPE cortex_bucket_store_block_loads_total counter diff --git a/pkg/storegateway/bucket_stores_test.go b/pkg/storegateway/bucket_stores_test.go index d564a068b09..34a32ecdc3e 100644 --- a/pkg/storegateway/bucket_stores_test.go +++ b/pkg/storegateway/bucket_stores_test.go @@ -90,7 +90,8 @@ func TestBucketStores_InitialSync(t *testing.T) { assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(` # HELP cortex_bucket_store_blocks_loaded Number of currently loaded blocks. # TYPE cortex_bucket_store_blocks_loaded gauge - cortex_bucket_store_blocks_loaded 2 + cortex_bucket_store_blocks_loaded{user="user-1"} 1 + cortex_bucket_store_blocks_loaded{user="user-2"} 1 # HELP cortex_bucket_store_block_loads_total Total number of remote block loading attempts. # TYPE cortex_bucket_store_block_loads_total counter @@ -158,7 +159,7 @@ func TestBucketStores_InitialSyncShouldRetryOnFailure(t *testing.T) { # HELP cortex_bucket_store_blocks_loaded Number of currently loaded blocks. # TYPE cortex_bucket_store_blocks_loaded gauge - cortex_bucket_store_blocks_loaded 1 + cortex_bucket_store_blocks_loaded{user="user-1"} 1 # HELP cortex_bucket_store_block_loads_total Total number of remote block loading attempts. # TYPE cortex_bucket_store_block_loads_total counter @@ -219,7 +220,7 @@ func TestBucketStores_SyncBlocks(t *testing.T) { assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(` # HELP cortex_bucket_store_blocks_loaded Number of currently loaded blocks. # TYPE cortex_bucket_store_blocks_loaded gauge - cortex_bucket_store_blocks_loaded 2 + cortex_bucket_store_blocks_loaded{user="user-1"} 2 # HELP cortex_bucket_store_block_loads_total Total number of remote block loading attempts. # TYPE cortex_bucket_store_block_loads_total counter @@ -486,7 +487,8 @@ func TestBucketStores_deleteLocalFilesForExcludedTenants(t *testing.T) { cortex_bucket_store_block_loads_total 2 # HELP cortex_bucket_store_blocks_loaded Number of currently loaded blocks. # TYPE cortex_bucket_store_blocks_loaded gauge - cortex_bucket_store_blocks_loaded 2 + cortex_bucket_store_blocks_loaded{user="user-1"} 1 + cortex_bucket_store_blocks_loaded{user="user-2"} 1 `), metricNames...)) // Single user left in shard. @@ -503,7 +505,7 @@ func TestBucketStores_deleteLocalFilesForExcludedTenants(t *testing.T) { cortex_bucket_store_block_loads_total 2 # HELP cortex_bucket_store_blocks_loaded Number of currently loaded blocks. # TYPE cortex_bucket_store_blocks_loaded gauge - cortex_bucket_store_blocks_loaded 1 + cortex_bucket_store_blocks_loaded{user="user-1"} 1 `), metricNames...)) // No users left in this shard. @@ -518,9 +520,6 @@ func TestBucketStores_deleteLocalFilesForExcludedTenants(t *testing.T) { # HELP cortex_bucket_store_block_loads_total Total number of remote block loading attempts. # TYPE cortex_bucket_store_block_loads_total counter cortex_bucket_store_block_loads_total 2 - # HELP cortex_bucket_store_blocks_loaded Number of currently loaded blocks. - # TYPE cortex_bucket_store_blocks_loaded gauge - cortex_bucket_store_blocks_loaded 0 `), metricNames...)) // We can always get user back. @@ -537,7 +536,7 @@ func TestBucketStores_deleteLocalFilesForExcludedTenants(t *testing.T) { cortex_bucket_store_block_loads_total 3 # HELP cortex_bucket_store_blocks_loaded Number of currently loaded blocks. # TYPE cortex_bucket_store_blocks_loaded gauge - cortex_bucket_store_blocks_loaded 1 + cortex_bucket_store_blocks_loaded{user="user-1"} 1 `), metricNames...)) }