Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make cortex_bucket_store_blocks_loaded metric per user #4918

Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,9 @@
* [CHANGE] Disables TSDB isolation. #4825
* [CHANGE] Drops support Prometheus 1.x rule format on configdb. #4826
* [CHANGE] Removes `-ingester.stream-chunks-when-using-blocks` experimental flag and stream chunks by default when `querier.ingester-streaming` is enabled. #4864
* [CHANGE] Compactor: Added `cortex_compactor_runs_interrupted_total` to separate compaction interruptions from failures
* [CHANGE] Compactor: Added `cortex_compactor_runs_interrupted_total` to separate compaction interruptions from failures. #4921
Copy link
Member

@friedrichg friedrichg Oct 31, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should normally avoid this and create a specific PR for that

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK I will remove this change for this pr.

* [CHANGE] Enable PromQL `@` modifier, negative offset always. #4927
* [CHANGE] Store-gateway: Add user label to `cortex_bucket_store_blocks_loaded` metric. #4918
* [ENHANCEMENT] AlertManager: Retrying AlertManager Get Requests (Get Alertmanager status, Get Alertmanager Receivers) on next replica on error #4840
* [ENHANCEMENT] Querier/Ruler: Retry store-gateway in case of unexpected failure, instead of failing the query. #4532 #4839
* [ENHANCEMENT] Ring: DoBatch prioritize 4xx errors when failing. #4783
Expand Down
8 changes: 2 additions & 6 deletions integration/getting_started_with_gossiped_ring_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,6 @@ func TestGettingStartedWithGossipedRing(t *testing.T) {
require.Equal(t, model.ValVector, result.Type())
assert.Equal(t, expectedVector, result.(model.Vector))

// Before flushing the blocks we expect no store-gateway has loaded any block.
require.NoError(t, cortex1.WaitSumMetrics(e2e.Equals(0), "cortex_bucket_store_blocks_loaded"))
require.NoError(t, cortex2.WaitSumMetrics(e2e.Equals(0), "cortex_bucket_store_blocks_loaded"))

// Flush blocks from ingesters to the store.
for _, instance := range []*e2ecortex.CortexService{cortex1, cortex2} {
res, err = e2e.GetRequest("http://" + instance.HTTPEndpoint() + "/flush")
Expand All @@ -124,8 +120,8 @@ func TestGettingStartedWithGossipedRing(t *testing.T) {
// Given store-gateway blocks sharding is enabled with the default replication factor of 3,
// and ingestion replication factor is 1, we do expect the series has been ingested by 1
// single ingester and so we have 1 block shipped from ingesters and loaded by both store-gateways.
require.NoError(t, cortex1.WaitSumMetrics(e2e.Equals(1), "cortex_bucket_store_blocks_loaded"))
require.NoError(t, cortex2.WaitSumMetrics(e2e.Equals(1), "cortex_bucket_store_blocks_loaded"))
require.NoError(t, cortex1.WaitSumMetricsWithOptions(e2e.Equals(1), []string{"cortex_bucket_store_blocks_loaded"}, e2e.WaitMissingMetrics))
require.NoError(t, cortex2.WaitSumMetricsWithOptions(e2e.Equals(1), []string{"cortex_bucket_store_blocks_loaded"}, e2e.WaitMissingMetrics))

// Make sure that no DNS failures occurred.
// No actual DNS lookups are necessarily performed, so we can't really assert on that.
Expand Down
4 changes: 3 additions & 1 deletion integration/querier_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,9 @@ func TestQuerierWithBlocksStorageRunningInMicroservicesMode(t *testing.T) {
// we don't known which store-gateway instance will synch the blocks, so we need to wait on
// metrics extracted from all instances.
if testCfg.blocksShardingStrategy != "" {
require.NoError(t, storeGateways.WaitSumMetrics(e2e.Equals(2), "cortex_bucket_store_blocks_loaded"))
// If shuffle sharding is enabled and we have tenant shard size set to 1,
// then the metric only appears in one store gateway instance.
require.NoError(t, storeGateways.WaitSumMetricsWithOptions(e2e.Equals(2), []string{"cortex_bucket_store_blocks_loaded"}, e2e.SkipMissingMetrics))
} else {
require.NoError(t, storeGateways.WaitSumMetrics(e2e.Equals(float64(2*storeGateways.NumInstances())), "cortex_bucket_store_blocks_loaded"))
}
Expand Down
4 changes: 2 additions & 2 deletions pkg/storegateway/bucket_store_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ func NewBucketStoreMetrics() *BucketStoreMetrics {
blocksLoaded: prometheus.NewDesc(
"cortex_bucket_store_blocks_loaded",
"Number of currently loaded blocks.",
nil, nil),
[]string{"user"}, nil),
seriesDataTouched: prometheus.NewDesc(
"cortex_bucket_store_series_data_touched",
"How many items of a data type in a block were touched for a single series request.",
Expand Down Expand Up @@ -212,7 +212,7 @@ func (m *BucketStoreMetrics) Collect(out chan<- prometheus.Metric) {
data.SendSumOfCounters(out, m.blockDrops, "thanos_bucket_store_block_drops_total")
data.SendSumOfCounters(out, m.blockDropFailures, "thanos_bucket_store_block_drop_failures_total")

data.SendSumOfGauges(out, m.blocksLoaded, "thanos_bucket_store_blocks_loaded")
data.SendSumOfGaugesPerUser(out, m.blocksLoaded, "thanos_bucket_store_blocks_loaded")
Copy link
Contributor

@harry671003 harry671003 Oct 18, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will this cause an explosion in cardinality if there is a high churn in users?

Copy link
Contributor Author

@yeya24 yeya24 Oct 19, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My 2 cents:

  1. We have per user metrics already in other Cortex components
  2. For cardinality, I feel user label is okay. Compared to short lived job names like pod name, container ID, etc, user ID is relatively bounded.

Copy link
Contributor

@alvinlin123 alvinlin123 Oct 27, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this change is ok, but it looks like store-gateway only "soft deletes" per user metrics:

m.regs.RemoveUserRegistry(user, false)

(the second parameter is "hard delete?"

If that's the case, turning a metric to per-user may not be a good idea because of memory leak.

Copy link
Contributor

@alvinlin123 alvinlin123 Oct 27, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

actually, I think this PR would prevent memory leak.

so, I think we are good to go.

For cardinality, I feel user label is okay. Compared to short lived job names like pod name, container ID, etc, user ID is relatively bounded.

The user label can be "short lived" too; consider if you are running some test continuously, and each test run creates new user :)

However, I think this metrics is useful to be per user.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you are talking about benchmarking and continuously tests then every label can be "short lived", right? If we are not reusing the same test user.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yea :)


data.SendSumOfSummariesWithLabels(out, m.seriesDataTouched, "thanos_bucket_store_series_data_touched", "data_type")
data.SendSumOfSummariesWithLabels(out, m.seriesDataFetched, "thanos_bucket_store_series_data_fetched", "data_type")
Expand Down
4 changes: 3 additions & 1 deletion pkg/storegateway/bucket_store_metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,9 @@ func TestBucketStoreMetrics(t *testing.T) {
err := testutil.GatherAndCompare(mainReg, bytes.NewBufferString(`
# HELP cortex_bucket_store_blocks_loaded Number of currently loaded blocks.
# TYPE cortex_bucket_store_blocks_loaded gauge
cortex_bucket_store_blocks_loaded 22519
cortex_bucket_store_blocks_loaded{user="user1"} 5328
cortex_bucket_store_blocks_loaded{user="user2"} 6908
cortex_bucket_store_blocks_loaded{user="user3"} 10283

# HELP cortex_bucket_store_block_loads_total Total number of remote block loading attempts.
# TYPE cortex_bucket_store_block_loads_total counter
Expand Down
17 changes: 8 additions & 9 deletions pkg/storegateway/bucket_stores_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,8 @@ func TestBucketStores_InitialSync(t *testing.T) {
assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(`
# HELP cortex_bucket_store_blocks_loaded Number of currently loaded blocks.
# TYPE cortex_bucket_store_blocks_loaded gauge
cortex_bucket_store_blocks_loaded 2
cortex_bucket_store_blocks_loaded{user="user-1"} 1
cortex_bucket_store_blocks_loaded{user="user-2"} 1

# HELP cortex_bucket_store_block_loads_total Total number of remote block loading attempts.
# TYPE cortex_bucket_store_block_loads_total counter
Expand Down Expand Up @@ -158,7 +159,7 @@ func TestBucketStores_InitialSyncShouldRetryOnFailure(t *testing.T) {

# HELP cortex_bucket_store_blocks_loaded Number of currently loaded blocks.
# TYPE cortex_bucket_store_blocks_loaded gauge
cortex_bucket_store_blocks_loaded 1
cortex_bucket_store_blocks_loaded{user="user-1"} 1

# HELP cortex_bucket_store_block_loads_total Total number of remote block loading attempts.
# TYPE cortex_bucket_store_block_loads_total counter
Expand Down Expand Up @@ -219,7 +220,7 @@ func TestBucketStores_SyncBlocks(t *testing.T) {
assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(`
# HELP cortex_bucket_store_blocks_loaded Number of currently loaded blocks.
# TYPE cortex_bucket_store_blocks_loaded gauge
cortex_bucket_store_blocks_loaded 2
cortex_bucket_store_blocks_loaded{user="user-1"} 2

# HELP cortex_bucket_store_block_loads_total Total number of remote block loading attempts.
# TYPE cortex_bucket_store_block_loads_total counter
Expand Down Expand Up @@ -486,7 +487,8 @@ func TestBucketStores_deleteLocalFilesForExcludedTenants(t *testing.T) {
cortex_bucket_store_block_loads_total 2
# HELP cortex_bucket_store_blocks_loaded Number of currently loaded blocks.
# TYPE cortex_bucket_store_blocks_loaded gauge
cortex_bucket_store_blocks_loaded 2
cortex_bucket_store_blocks_loaded{user="user-1"} 1
cortex_bucket_store_blocks_loaded{user="user-2"} 1
`), metricNames...))

// Single user left in shard.
Expand All @@ -503,7 +505,7 @@ func TestBucketStores_deleteLocalFilesForExcludedTenants(t *testing.T) {
cortex_bucket_store_block_loads_total 2
# HELP cortex_bucket_store_blocks_loaded Number of currently loaded blocks.
# TYPE cortex_bucket_store_blocks_loaded gauge
cortex_bucket_store_blocks_loaded 1
cortex_bucket_store_blocks_loaded{user="user-1"} 1
`), metricNames...))

// No users left in this shard.
Expand All @@ -518,9 +520,6 @@ func TestBucketStores_deleteLocalFilesForExcludedTenants(t *testing.T) {
# HELP cortex_bucket_store_block_loads_total Total number of remote block loading attempts.
# TYPE cortex_bucket_store_block_loads_total counter
cortex_bucket_store_block_loads_total 2
# HELP cortex_bucket_store_blocks_loaded Number of currently loaded blocks.
# TYPE cortex_bucket_store_blocks_loaded gauge
cortex_bucket_store_blocks_loaded 0
`), metricNames...))

// We can always get user back.
Expand All @@ -537,7 +536,7 @@ func TestBucketStores_deleteLocalFilesForExcludedTenants(t *testing.T) {
cortex_bucket_store_block_loads_total 3
# HELP cortex_bucket_store_blocks_loaded Number of currently loaded blocks.
# TYPE cortex_bucket_store_blocks_loaded gauge
cortex_bucket_store_blocks_loaded 1
cortex_bucket_store_blocks_loaded{user="user-1"} 1
`), metricNames...))
}

Expand Down