Skip to content

Commit

Permalink
feat(new_metrics): add disk-level metric entity and migrate disk-leve…
Browse files Browse the repository at this point in the history
…l metrics for fs_manager (#1427)

#1425

In perf counters, all metrics of `fs_manager` are server-level: for example,
the total capacity and the available capacity summed over all disks that hold
Pegasus data.

However, the capacity and the available capacity of each individual disk are
often more important: running out of space on a single disk leads to serious problems.
Therefore, after being migrated to the new framework, the server-level metrics
of perf counters become disk-level ones, namely the capacity and the available
capacity of each disk. Another disk-level metric -- the available percentage
of each disk used by a replica server -- can be derived from these two with the division operator.

Whenever server-level metrics are needed, just aggregate over the disk-level ones.
For example, to compute the other 2 server-level metrics -- the minimal/maximal available
percentage among all disks used by a replica server in a node -- just apply the
min/max operators of Prometheus to the disk-level ones.

To implement disk-level metrics, a disk-level metric entity is also added.
  • Loading branch information
empiredan committed Dec 11, 2023
1 parent 14c8a10 commit 659743d
Show file tree
Hide file tree
Showing 5 changed files with 99 additions and 61 deletions.
94 changes: 57 additions & 37 deletions src/common/fs_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
#include "common/gpid.h"
#include "common/replication_enums.h"
#include "fmt/core.h"
#include "perf_counter/perf_counter.h"
#include "fmt/ostream.h"
#include "replica_admin_types.h"
#include "runtime/api_layer1.h"
#include "utils/fail_point.h"
Expand All @@ -45,6 +45,18 @@
#include "utils/ports.h"
#include "absl/strings/string_view.h"

// Defines the `disk` metric entity type; the disk-level metrics below are attached to it.
METRIC_DEFINE_entity(disk);

// int64 gauge on the `disk` entity, declared with unit kMegaBytes: total capacity of the disk.
METRIC_DEFINE_gauge_int64(disk,
disk_capacity_total_mb,
dsn::metric_unit::kMegaBytes,
"The total disk capacity");

// int64 gauge on the `disk` entity, declared with unit kMegaBytes: available capacity of the
// disk.
METRIC_DEFINE_gauge_int64(disk,
disk_capacity_avail_mb,
dsn::metric_unit::kMegaBytes,
"The available disk capacity");

namespace dsn {
namespace replication {

Expand Down Expand Up @@ -75,6 +87,34 @@ error_code disk_status_to_error_code(disk_status::type ds)
}
}

namespace {

// Instantiates the metric entity of a single disk through METRIC_ENTITY_disk.
//
// The entity id is "disk_<tag>"; `tag` and `data_dir` are passed along as the
// key/value pairs "tag" and "data_dir".
metric_entity_ptr instantiate_disk_metric_entity(const std::string &tag,
                                                 const std::string &data_dir)
{
    return METRIC_ENTITY_disk.instantiate(fmt::format("disk_{}", tag),
                                          {{"tag", tag}, {"data_dir", data_dir}});
}

} // anonymous namespace

// Instantiates the disk metric entity for (`tag`, `data_dir`) and then the two capacity
// gauges on that entity.
//
// NOTE: METRIC_VAR_INIT_disk(name) expands to an initializer of `_<name>` that calls
// disk_metric_entity(), so `_disk_metric_entity` must already be initialized when the
// gauges are; this relies on `_disk_metric_entity` being declared before the gauge
// members in the class, since members are initialized in declaration order.
disk_capacity_metrics::disk_capacity_metrics(const std::string &tag, const std::string &data_dir)
: _disk_metric_entity(instantiate_disk_metric_entity(tag, data_dir)),
METRIC_VAR_INIT_disk(disk_capacity_total_mb),
METRIC_VAR_INIT_disk(disk_capacity_avail_mb)
{
}

// Returns the metric entity of this disk.
//
// This accessor is what METRIC_VAR_INIT_disk uses to obtain the entity on which a metric
// is instantiated, hence the check that the entity is non-null before handing it out.
// NOTE(review): CHECK_NOTNULL is presumably fatal on failure -- confirm against its
// definition.
const metric_entity_ptr &disk_capacity_metrics::disk_metric_entity() const
{
CHECK_NOTNULL(_disk_metric_entity,
"disk metric entity should has been instantiated: "
"uninitialized entity cannot be used to instantiate "
"metric");
return _disk_metric_entity;
}

uint64_t dir_node::replicas_count() const
{
uint64_t sum = 0;
Expand Down Expand Up @@ -133,6 +173,9 @@ void dir_node::update_disk_stat()
disk_available_ratio = static_cast<int>(
disk_capacity_mb == 0 ? 0 : std::round(disk_available_mb * 100.0 / disk_capacity_mb));

METRIC_CALL_SET_METHOD(disk_capacity, disk_capacity_total_mb, disk_capacity_mb);
METRIC_CALL_SET_METHOD(disk_capacity, disk_capacity_avail_mb, disk_available_mb);

// It's able to change status from NORMAL to SPACE_INSUFFICIENT, and vice versa.
disk_status::type old_status = status;
auto new_status = disk_available_ratio < FLAGS_disk_min_available_space_ratio
Expand All @@ -150,30 +193,6 @@ void dir_node::update_disk_stat()
enum_to_string(status));
}

// Initializes the five server-level disk perf counters of fs_manager. Every counter is
// registered with "eon.replica_stub" as the first argument and COUNTER_TYPE_NUMBER as its
// type; the last argument is a human-readable description.
fs_manager::fs_manager()
{
// Total capacity over all disks, in MB.
_counter_total_capacity_mb.init_app_counter("eon.replica_stub",
"disk.capacity.total(MB)",
COUNTER_TYPE_NUMBER,
"total disk capacity in MB");
// Total available space over all disks, in MB.
_counter_total_available_mb.init_app_counter("eon.replica_stub",
"disk.available.total(MB)",
COUNTER_TYPE_NUMBER,
"total disk available in MB");
// Available ratio computed from the two totals.
_counter_total_available_ratio.init_app_counter("eon.replica_stub",
"disk.available.total.ratio",
COUNTER_TYPE_NUMBER,
"total disk available ratio");
// Smallest per-disk available ratio.
_counter_min_available_ratio.init_app_counter("eon.replica_stub",
"disk.available.min.ratio",
COUNTER_TYPE_NUMBER,
"minimal disk available ratio in all disks");
// Largest per-disk available ratio.
_counter_max_available_ratio.init_app_counter("eon.replica_stub",
"disk.available.max.ratio",
COUNTER_TYPE_NUMBER,
"maximal disk available ratio in all disks");
}

dir_node *fs_manager::get_dir_node(const std::string &subdir) const
{
std::string norm_subdir;
Expand Down Expand Up @@ -347,8 +366,14 @@ void fs_manager::remove_replica(const gpid &pid)

void fs_manager::update_disk_stat()
{
_total_capacity_mb = 0;
_total_available_mb = 0;
int total_available_ratio = 0;
int min_available_ratio = 100;
int max_available_ratio = 0;

zauto_write_lock l(_lock);
reset_disk_stat();

for (auto &dn : _dir_nodes) {
// If the disk is already in IO_ERROR status, it will not change to other status, just skip
// it.
Expand All @@ -360,10 +385,10 @@ void fs_manager::update_disk_stat()
dn->update_disk_stat();
_total_capacity_mb += dn->disk_capacity_mb;
_total_available_mb += dn->disk_available_mb;
_min_available_ratio = std::min(dn->disk_available_ratio, _min_available_ratio);
_max_available_ratio = std::max(dn->disk_available_ratio, _max_available_ratio);
min_available_ratio = std::min(dn->disk_available_ratio, min_available_ratio);
max_available_ratio = std::max(dn->disk_available_ratio, max_available_ratio);
}
_total_available_ratio = static_cast<int>(
total_available_ratio = static_cast<int>(
_total_capacity_mb == 0 ? 0 : std::round(_total_available_mb * 100.0 / _total_capacity_mb));

LOG_INFO("update disk space succeed: disk_count = {}, total_capacity_mb = {}, "
Expand All @@ -372,14 +397,9 @@ void fs_manager::update_disk_stat()
_dir_nodes.size(),
_total_capacity_mb,
_total_available_mb,
_total_available_ratio,
_min_available_ratio,
_max_available_ratio);
_counter_total_capacity_mb->set(_total_capacity_mb);
_counter_total_available_mb->set(_total_available_mb);
_counter_total_available_ratio->set(_total_available_ratio);
_counter_min_available_ratio->set(_min_available_ratio);
_counter_max_available_ratio->set(_max_available_ratio);
total_available_ratio,
min_available_ratio,
max_available_ratio);
}

void fs_manager::add_new_dir_node(const std::string &data_dir, const std::string &tag)
Expand Down
57 changes: 34 additions & 23 deletions src/common/fs_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,12 @@

#include "common/replication_other_types.h"
#include "metadata_types.h"
#include "perf_counter/perf_counter_wrapper.h"
#include "utils/autoref_ptr.h"
#include "utils/error_code.h"
#include "utils/flags.h"
#include "absl/strings/string_view.h"
#include "utils/metrics.h"
#include "utils/ports.h"
#include "utils/zlocks.h"

namespace dsn {
Expand All @@ -44,6 +46,25 @@ DSN_DECLARE_int32(disk_min_available_space_ratio);

error_code disk_status_to_error_code(disk_status::type ds);

// Owns the metric entity of one disk together with its capacity gauges
// (disk_capacity_total_mb and disk_capacity_avail_mb).
class disk_capacity_metrics
{
public:
// `tag` and `data_dir` identify the disk; both are handed to the metric entity.
disk_capacity_metrics(const std::string &tag, const std::string &data_dir);
~disk_capacity_metrics() = default;

// Accessor for the disk metric entity; also used by METRIC_VAR_INIT_disk.
const metric_entity_ptr &disk_metric_entity() const;

// Generate set_disk_capacity_total_mb(int64_t) and set_disk_capacity_avail_mb(int64_t).
METRIC_DEFINE_SET_METHOD(disk_capacity_total_mb, int64_t)
METRIC_DEFINE_SET_METHOD(disk_capacity_avail_mb, int64_t)

private:
// Declared before the gauges: their initializers obtain the entity through
// disk_metric_entity(), and members are initialized in declaration order.
const metric_entity_ptr _disk_metric_entity;
METRIC_VAR_DECLARE_gauge_int64(disk_capacity_total_mb);
METRIC_VAR_DECLARE_gauge_int64(disk_capacity_avail_mb);

DISALLOW_COPY_AND_ASSIGN(disk_capacity_metrics);
};

struct dir_node
{
public:
Expand All @@ -57,6 +78,9 @@ struct dir_node
std::map<app_id, std::set<gpid>> holding_primary_replicas;
std::map<app_id, std::set<gpid>> holding_secondary_replicas;

private:
disk_capacity_metrics disk_capacity;

public:
dir_node(const std::string &tag_,
const std::string &dir_,
Expand All @@ -69,7 +93,8 @@ struct dir_node
disk_capacity_mb(disk_capacity_mb_),
disk_available_mb(disk_available_mb_),
disk_available_ratio(disk_available_ratio_),
status(status_)
status(status_),
disk_capacity(tag_, dir_)
{
}
// All functions are not thread-safe. However, they are only used in fs_manager
Expand All @@ -88,7 +113,8 @@ struct dir_node
class fs_manager
{
public:
fs_manager();
fs_manager() = default;
~fs_manager() = default;

// Should be called before open/load any replicas.
// NOTE: 'data_dirs' and 'data_dir_tags' must have the same size and in the same order.
Expand Down Expand Up @@ -136,37 +162,22 @@ class fs_manager
std::vector<disk_info> get_disk_infos(int app_id) const;

private:
// Resets the aggregated disk statistics to the same values as their in-class
// initializers. The min ratio starts at 100 and the max ratio at 0 so that the
// std::min/std::max accumulation in update_disk_stat() begins from neutral bounds.
void reset_disk_stat()
{
_total_capacity_mb = 0;
_total_available_mb = 0;
_total_available_ratio = 0;
_min_available_ratio = 100;
_max_available_ratio = 0;
}

dir_node *get_dir_node(const std::string &subdir) const;

// when visit the tag/storage of the _dir_nodes map, there's no need to protect by the lock.
// but when visit the holding_replicas, you must take care.
// TODO(wangdan): _dir_nodes should be protected by lock since add_new_disk are supported:
// it might be updated arbitrarily at any time.
//
// Especially when visiting the holding_replicas, you must take care.
mutable zrwlock_nr _lock; // [ lock

int64_t _total_capacity_mb = 0;
int64_t _total_available_mb = 0;
int _total_available_ratio = 0;
int _min_available_ratio = 100;
int _max_available_ratio = 0;

// Once dir_node has been added to '_dir_nodes', it will not be removed, it will be marked
// as non-NORMAL status if it is not available.
std::vector<std::shared_ptr<dir_node>> _dir_nodes;
// ] end of lock

perf_counter_wrapper _counter_total_capacity_mb;
perf_counter_wrapper _counter_total_available_mb;
perf_counter_wrapper _counter_total_available_ratio;
perf_counter_wrapper _counter_min_available_ratio;
perf_counter_wrapper _counter_max_available_ratio;

friend class replica_test;
friend class replica_stub;
friend class mock_replica_stub;
Expand Down
1 change: 1 addition & 0 deletions src/common/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ set(MY_PROJ_NAME dsn_replication_common_test)
set(MY_SRC_SEARCH_MODE "GLOB")

set(MY_PROJ_LIBS
dsn_http
dsn_replication_common
dsn_runtime
gtest
Expand Down
1 change: 0 additions & 1 deletion src/replica/test/replica_disk_test_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,6 @@ class replica_disk_test_base : public replica_test_base
generate_mock_app_info();

stub->_fs_manager._dir_nodes.clear();
stub->_fs_manager.reset_disk_stat();
generate_mock_dir_nodes(dir_nodes_count);
generate_mock_empty_dir_node(empty_dir_nodes_count);

Expand Down
7 changes: 7 additions & 0 deletions src/utils/metrics.h
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ class error_code;
_##name(METRIC_##name.instantiate(entity##_metric_entity(), ##__VA_ARGS__))
// Per-entity shorthands: each forwards to METRIC_VAR_INIT with the entity kind fixed to
// replica, server or disk respectively, so the metric is instantiated on the entity
// returned by the enclosing class's <entity>_metric_entity() accessor.
#define METRIC_VAR_INIT_replica(name, ...) METRIC_VAR_INIT(name, replica, ##__VA_ARGS__)
#define METRIC_VAR_INIT_server(name, ...) METRIC_VAR_INIT(name, server, ##__VA_ARGS__)
#define METRIC_VAR_INIT_disk(name, ...) METRIC_VAR_INIT(name, disk, ##__VA_ARGS__)

// Perform increment-related operations on metrics including gauge and counter.
#define METRIC_VAR_INCREMENT_BY(name, x) \
Expand Down Expand Up @@ -195,6 +196,11 @@ class error_code;

#define METRIC_VAR_AUTO_LATENCY_DURATION_NS(name) __##name##_auto_latency.duration_ns()

// Defines a function `void set_<name>(value_type value)` whose body forwards `value` to
// METRIC_VAR_SET(name, value). It is meant to be placed inside the class that declares
// the metric variable `name`.
#define METRIC_DEFINE_SET_METHOD(name, value_type) \
void set_##name(value_type value) { METRIC_VAR_SET(name, value); }

// Invokes the setter generated by METRIC_DEFINE_SET_METHOD on `obj`, i.e. expands to
// `obj.set_<name>(value)`.
#define METRIC_CALL_SET_METHOD(obj, name, value) obj.set_##name(value)

namespace dsn {
class metric; // IWYU pragma: keep
class metric_entity_prototype; // IWYU pragma: keep
Expand Down Expand Up @@ -615,6 +621,7 @@ enum class metric_unit : size_t
kBytes,
kMegaBytes,
kCapacityUnits,
kPercent,
kRequests,
kSeeks,
kPointLookups,
Expand Down

0 comments on commit 659743d

Please sign in to comment.