Skip to content

Commit

Permalink
Merge pull request #8874 from jcsp/tiered-storage-stats
Browse files Browse the repository at this point in the history
cloud_storage: revise public metrics for cache, housekeeping
  • Loading branch information
jcsp authored Feb 14, 2023
2 parents f3c33fb + c8e8af5 commit ec99595
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 39 deletions.
8 changes: 3 additions & 5 deletions src/v/archival/probe.cc
Original file line number Diff line number Diff line change
Expand Up @@ -125,12 +125,10 @@ void ntp_level_probe::setup_public_metrics(const model::ntp& ntp) {
upload_housekeeping_probe::upload_housekeeping_probe() {
namespace sm = ss::metrics;

auto aggregate_labels = config::shard_local_cfg().aggregate_metrics()
? std::vector<sm::label>{sm::shard_label}
: std::vector<sm::label>{};
auto aggregate_labels = std::vector<sm::label>{sm::shard_label};

_service_metrics.add_group(
prometheus_sanitize::metrics_name("upload_housekeeping"),
prometheus_sanitize::metrics_name("cloud_storage_housekeeping"),
{
sm::make_counter(
"rounds",
Expand Down Expand Up @@ -171,7 +169,7 @@ upload_housekeeping_probe::upload_housekeeping_probe() {
});

_jobs_metrics.add_group(
prometheus_sanitize::metrics_name("upload_housekeeping_jobs"),
prometheus_sanitize::metrics_name("cloud_storage_jobs"),
{
sm::make_gauge(
"local_segment_reuploads",
Expand Down
117 changes: 83 additions & 34 deletions src/v/cloud_storage/cache_probe.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,42 +18,91 @@
namespace cloud_storage {

cache_probe::cache_probe() {
if (config::shard_local_cfg().disable_metrics()) {
return;
namespace sm = ss::metrics;

if (!config::shard_local_cfg().disable_metrics()) {
_metrics.add_group(
prometheus_sanitize::metrics_name("cloud_storage:cache"),
{
sm::make_counter(
"puts",
[this] { return _num_puts; },
sm::description("Total number of files put into cache.")),
sm::make_counter(
"gets",
[this] { return _num_gets; },
sm::description("Total number of cache get requests.")),
sm::make_counter(
"cached_gets",
[this] { return _num_cached_gets; },
sm::description(
"Total number of get requests that are already in cache.")),

sm::make_gauge(
"size_bytes",
[this] { return _cur_size_bytes; },
sm::description("Current cache size in bytes.")),
sm::make_gauge(
"files",
[this] { return _cur_num_files; },
sm::description("Current number of files in cache.")),
sm::make_gauge(
"in_progress_files",
[this] { return _cur_in_progress_files; },
sm::description(
"Current number of files that are being put to cache.")),
});
}

namespace sm = ss::metrics;
_metrics.add_group(
prometheus_sanitize::metrics_name("cloud_storage:cache"),
{
sm::make_counter(
"puts",
[this] { return _num_puts; },
sm::description("Total number of files put into cache.")),
sm::make_counter(
"gets",
[this] { return _num_gets; },
sm::description("Total number of cache get requests.")),
sm::make_counter(
"cached_gets",
[this] { return _num_cached_gets; },
sm::description(
"Total number of get requests that are already in cache.")),

sm::make_gauge(
"size_bytes",
[this] { return _cur_size_bytes; },
sm::description("Current cache size in bytes.")),
sm::make_gauge(
"files",
[this] { return _cur_num_files; },
sm::description("Current number of files in cache.")),
sm::make_gauge(
"in_progress_files",
[this] { return _cur_in_progress_files; },
sm::description(
"Current number of files that are being put to cache.")),
});
if (!config::shard_local_cfg().disable_public_metrics()) {
auto aggregate_labels = std::vector<sm::label>{sm::shard_label};

// Total disk usage information is only maintained on shard 0
if (ss::this_shard_id() == ss::shard_id{0}) {
_public_metrics.add_group(
prometheus_sanitize::metrics_name("cloud_storage_cache_space"),
{
sm::make_gauge(
"size_bytes",
[this] { return _cur_size_bytes; },
sm::description("Sum of size of cached objects."))
.aggregate(aggregate_labels),
sm::make_gauge(
"files",
[this] { return _cur_num_files; },
sm::description("Number of objects in cache."))
.aggregate(aggregate_labels),
});
}

// Put/get stats are local to each shard. Every metric in this group is
// registered with .aggregate(aggregate_labels), i.e. over the shard label.
_public_metrics.add_group(
  prometheus_sanitize::metrics_name("cloud_storage_cache_op"),
  {
    sm::make_counter(
      "put",
      [this] { return _num_puts; },
      sm::description("Number of objects written into cache."))
      .aggregate(aggregate_labels),
    sm::make_counter(
      "hit",
      [this] { return _num_cached_gets; },
      sm::description("Number of get requests for objects that are "
                      "already in cache."))
      .aggregate(aggregate_labels),
    // Misses are tracked by their own counter (_num_miss_gets, incremented
    // by miss_get()). Reading _num_cached_gets here would make "miss"
    // report exactly the same value as "hit".
    sm::make_counter(
      "miss",
      [this] { return _num_miss_gets; },
      sm::description("Number of get requests that are not satisfied "
                      "from the cache."))
      .aggregate(aggregate_labels),
    sm::make_gauge(
      "in_progress_files",
      [this] { return _cur_in_progress_files; },
      sm::description("Number of files that are being put to cache."))
      .aggregate(aggregate_labels),
  });
}
}

} // namespace cloud_storage
5 changes: 5 additions & 0 deletions src/v/cloud_storage/cache_probe.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#pragma once

#include "model/fundamental.h"
#include "ssx/metrics.h"

#include <seastar/core/metrics_registration.hh>

Expand All @@ -23,6 +24,7 @@ class cache_probe {
void put() { ++_num_puts; }
void get() { ++_num_gets; }
void cached_get() { ++_num_cached_gets; }
void miss_get() { ++_num_miss_gets; }

void set_size(uint64_t size) { _cur_size_bytes = size; }
void set_num_files(uint64_t num_files) { _cur_num_files = num_files; }
Expand All @@ -33,12 +35,15 @@ class cache_probe {
uint64_t _num_puts = 0;
uint64_t _num_gets = 0;
uint64_t _num_cached_gets = 0;
uint64_t _num_miss_gets = 0;

int64_t _cur_size_bytes = 0;
int64_t _cur_num_files = 0;
int64_t _cur_in_progress_files = 0;

ss::metrics::metric_groups _metrics;
ss::metrics::metric_groups _public_metrics{
ssx::metrics::public_metrics_handle};
};

} // namespace cloud_storage
5 changes: 5 additions & 0 deletions src/v/cloud_storage/cache_service.cc
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ uint64_t cache::get_total_cleaned() { return _total_cleaned; }
ss::future<> cache::consume_cache_space(size_t sz) {
vassert(ss::this_shard_id() == 0, "This method can only run on shard 0");
_current_cache_size += sz;
probe.set_size(_current_cache_size);
if (_current_cache_size > _max_cache_size) {
if (ss::lowres_clock::now() - _last_clean_up > min_clean_up_interval) {
auto units = ss::try_get_units(_cleanup_sm, 1);
Expand Down Expand Up @@ -275,6 +276,9 @@ ss::future<> cache::clean_up_cache() {
"Cache eviction deleted {} files of total size {}.",
i_to_delete,
deleted_size);

probe.set_size(_current_cache_size);
probe.set_num_files(candidates_for_deletion.size() - i_to_delete);
}

_last_clean_up = ss::lowres_clock::now();
Expand Down Expand Up @@ -395,6 +399,7 @@ ss::future<std::optional<cache_item>> cache::get(std::filesystem::path key) {
}
} catch (std::filesystem::filesystem_error& e) {
if (e.code() == std::errc::no_such_file_or_directory) {
probe.miss_get();
co_return std::nullopt;
} else {
throw;
Expand Down

0 comments on commit ec99595

Please sign in to comment.