Skip to content

Commit

Permalink
feat(new_metrics): add partition-level metric entity and migrate part…
Browse files Browse the repository at this point in the history
…ition-level metrics for greedy_load_balancer of meta (apache#1435)

apache#1331

In perf counters, all metrics of greedy_load_balancer are server-level, for
example, the number of each kind of operation performed by the greedy balancer,
including moving primaries, copying primaries and copying secondaries.

For new metrics, it is hoped that they are fine-grained, since sometimes we
want to know which primaries are moved. Also, it is convenient to calculate
table-level or server-level metrics by just aggregating the partition-level ones.

The metrics of greedy_load_balancer that are changed to partition-level and
migrated to the new framework include: the number of balance operations by the
greedy balancer that recently needed to be executed, and the numbers of
operations that move primaries, copy primaries, and copy secondaries.

In addition to the metrics of greedy_load_balancer, we also change to
partition-level some metrics of server_state that were migrated to
table-level in apache#1431,
because partition-level is considered more appropriate for them
than table-level. The metrics changed to partition-level include the number
of times the configuration has been changed and the number of times the
status of a partition has been changed to unwritable or writable.

To implement table-level metrics, a partition-level metric entity is also added.
  • Loading branch information
empiredan authored and wangdan committed Dec 6, 2023
1 parent 537612d commit fc2351b
Show file tree
Hide file tree
Showing 14 changed files with 448 additions and 138 deletions.
4 changes: 2 additions & 2 deletions src/common/fs_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -173,8 +173,8 @@ void dir_node::update_disk_stat()
disk_available_ratio = static_cast<int>(
disk_capacity_mb == 0 ? 0 : std::round(disk_available_mb * 100.0 / disk_capacity_mb));

METRIC_CALL_SET_METHOD(disk_capacity, disk_capacity_total_mb, disk_capacity_mb);
METRIC_CALL_SET_METHOD(disk_capacity, disk_capacity_avail_mb, disk_available_mb);
METRIC_SET(disk_capacity, disk_capacity_total_mb, disk_capacity_mb);
METRIC_SET(disk_capacity, disk_capacity_avail_mb, disk_available_mb);

// It's able to change status from NORMAL to SPACE_INSUFFICIENT, and vice versa.
disk_status::type old_status = status;
Expand Down
4 changes: 2 additions & 2 deletions src/common/fs_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@ class disk_capacity_metrics

const metric_entity_ptr &disk_metric_entity() const;

METRIC_DEFINE_SET_METHOD(disk_capacity_total_mb, int64_t)
METRIC_DEFINE_SET_METHOD(disk_capacity_avail_mb, int64_t)
METRIC_DEFINE_SET(disk_capacity_total_mb, int64_t)
METRIC_DEFINE_SET(disk_capacity_avail_mb, int64_t)

private:
const metric_entity_ptr _disk_metric_entity;
Expand Down
48 changes: 16 additions & 32 deletions src/meta/greedy_load_balancer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,21 +36,23 @@
#include "cluster_balance_policy.h"
#include "greedy_load_balancer.h"
#include "meta/load_balance_policy.h"
#include "meta/meta_service.h"
#include "meta/server_load_balancer.h"
#include "meta/server_state.h"
#include "meta/table_metrics.h"
#include "meta_admin_types.h"
#include "meta_data.h"
#include "perf_counter/perf_counter.h"
#include "runtime/rpc/rpc_address.h"
#include "utils/command_manager.h"
#include "utils/flags.h"
#include "utils/fmt_logging.h"
#include "utils/math.h"
#include "utils/metrics.h"

namespace dsn {
class gpid;

namespace replication {
class meta_service;

DSN_DEFINE_bool(meta_server, balance_cluster, false, "whether to enable cluster balancer");
DSN_TAG_VARIABLE(balance_cluster, FT_MUTABLE);
Expand All @@ -63,27 +65,6 @@ greedy_load_balancer::greedy_load_balancer(meta_service *_svc) : server_load_bal
_cluster_balance_policy = std::make_unique<cluster_balance_policy>(_svc);

::memset(t_operation_counters, 0, sizeof(t_operation_counters));

// init perf counters
_balance_operation_count.init_app_counter("eon.greedy_balancer",
"balance_operation_count",
COUNTER_TYPE_NUMBER,
"balance operation count to be done");
_recent_balance_move_primary_count.init_app_counter(
"eon.greedy_balancer",
"recent_balance_move_primary_count",
COUNTER_TYPE_VOLATILE_NUMBER,
"move primary count by balancer in the recent period");
_recent_balance_copy_primary_count.init_app_counter(
"eon.greedy_balancer",
"recent_balance_copy_primary_count",
COUNTER_TYPE_VOLATILE_NUMBER,
"copy primary count by balancer in the recent period");
_recent_balance_copy_secondary_count.init_app_counter(
"eon.greedy_balancer",
"recent_balance_copy_secondary_count",
COUNTER_TYPE_VOLATILE_NUMBER,
"copy secondary count by balancer in the recent period");
}

greedy_load_balancer::~greedy_load_balancer() {}
Expand Down Expand Up @@ -228,34 +209,37 @@ bool greedy_load_balancer::check(meta_view view, migration_list &list)
void greedy_load_balancer::report(const dsn::replication::migration_list &list,
bool balance_checker)
{
int counters[MAX_COUNT];
::memset(counters, 0, sizeof(counters));
#define __METRIC_INCREMENT(name) \
METRIC_INCREMENT(balance_stats, name, action.first, balance_checker)

int counters[MAX_COUNT] = {0};
greedy_balance_stats balance_stats;

counters[ALL_COUNT] = list.size();
for (const auto &action : list) {
switch (action.second.get()->balance_type) {
case balancer_request_type::move_primary:
counters[MOVE_PRI_COUNT]++;
__METRIC_INCREMENT(greedy_move_primary_operations);
break;
case balancer_request_type::copy_primary:
counters[COPY_PRI_COUNT]++;
__METRIC_INCREMENT(greedy_copy_primary_operations);
break;
case balancer_request_type::copy_secondary:
counters[COPY_SEC_COUNT]++;
__METRIC_INCREMENT(greedy_copy_secondary_operations);
break;
default:
CHECK(false, "");
}
}

::memcpy(t_operation_counters, counters, sizeof(counters));
METRIC_SET_GREEDY_BALANCE_STATS(_svc->get_server_state()->get_table_metric_entities(),
balance_stats);

// update perf counters
_balance_operation_count->set(list.size());
if (!balance_checker) {
_recent_balance_move_primary_count->add(counters[MOVE_PRI_COUNT]);
_recent_balance_copy_primary_count->add(counters[COPY_PRI_COUNT]);
_recent_balance_copy_secondary_count->add(counters[COPY_SEC_COUNT]);
}
#undef __METRIC_INCREMENT
}
} // namespace replication
} // namespace dsn
7 changes: 0 additions & 7 deletions src/meta/greedy_load_balancer.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@

#include "meta/meta_data.h"
#include "meta_admin_types.h"
#include "perf_counter/perf_counter_wrapper.h"
#include "server_load_balancer.h"

namespace dsn {
Expand Down Expand Up @@ -79,12 +78,6 @@ class greedy_load_balancer : public server_load_balancer

std::unique_ptr<command_deregister> _get_balance_operation_count;

// perf counters
perf_counter_wrapper _balance_operation_count;
perf_counter_wrapper _recent_balance_move_primary_count;
perf_counter_wrapper _recent_balance_copy_primary_count;
perf_counter_wrapper _recent_balance_copy_secondary_count;

private:
void greedy_balancer(bool balance_checker);
bool all_replica_infos_collected(const node_state &ns);
Expand Down
5 changes: 5 additions & 0 deletions src/meta/meta_split_service.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
#include "meta/meta_service.h"
#include "meta/meta_state_service.h"
#include "meta/server_state.h"
#include "meta/table_metrics.h"
#include "meta_admin_types.h"
#include "meta_split_service.h"
#include "meta_state_service_utils.h"
Expand Down Expand Up @@ -118,6 +119,7 @@ void meta_split_service::do_start_partition_split(std::shared_ptr<app_state> app
app->partition_count *= 2;
app->helpers->contexts.resize(app->partition_count);
app->partitions.resize(app->partition_count);
_state->get_table_metric_entities().resize_partitions(app->app_id, app->partition_count);
app->envs[replica_envs::SPLIT_VALIDATE_PARTITION_HASH] = "true";

for (int i = 0; i < app->partition_count; ++i) {
Expand Down Expand Up @@ -553,10 +555,13 @@ void meta_split_service::do_cancel_partition_split(std::shared_ptr<app_state> ap
LOG_INFO("app({}) update partition count on remote storage, new partition count is {}",
app->app_name,
app->partition_count / 2);

zauto_write_lock l(app_lock());

app->partition_count /= 2;
app->helpers->contexts.resize(app->partition_count);
app->partitions.resize(app->partition_count);
_state->get_table_metric_entities().resize_partitions(app->app_id, app->partition_count);
};

auto copy = *app;
Expand Down
38 changes: 16 additions & 22 deletions src/meta/server_state.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@
#include "utils/config_api.h"
#include "utils/flags.h"
#include "utils/fmt_logging.h"
#include "utils/metrics.h"
#include "utils/string_conv.h"
#include "utils/strings.h"

Expand Down Expand Up @@ -489,7 +490,7 @@ error_code server_state::sync_apps_to_remote_storage()
"invalid app name, name = {}",
kv_pair.second->app_name);
_exist_apps.emplace(kv_pair.second->app_name, kv_pair.second);
_table_metric_entities.create_entity(kv_pair.first);
_table_metric_entities.create_entity(kv_pair.first, kv_pair.second->partition_count);
}
}

Expand Down Expand Up @@ -655,7 +656,7 @@ dsn::error_code server_state::sync_apps_from_remote_storage()
if (app->status == app_status::AS_AVAILABLE) {
app->status = app_status::AS_CREATING;
_exist_apps.emplace(app->app_name, app);
_table_metric_entities.create_entity(app->app_id);
_table_metric_entities.create_entity(app->app_id, app->partition_count);
} else if (app->status == app_status::AS_DROPPED) {
app->status = app_status::AS_DROPPING;
} else {
Expand Down Expand Up @@ -1158,7 +1159,7 @@ void server_state::create_app(dsn::message_ex *msg)

_all_apps.emplace(app->app_id, app);
_exist_apps.emplace(request.app_name, app);
_table_metric_entities.create_entity(app->app_id);
_table_metric_entities.create_entity(app->app_id, app->partition_count);
}
}

Expand Down Expand Up @@ -1388,7 +1389,8 @@ void server_state::recall_app(dsn::message_ex *msg)
target_app->helpers->pending_response = msg;

_exist_apps.emplace(target_app->app_name, target_app);
_table_metric_entities.create_entity(target_app->app_id);
_table_metric_entities.create_entity(target_app->app_id,
target_app->partition_count);
}
}
}
Expand Down Expand Up @@ -1613,15 +1615,12 @@ void server_state::update_configuration_locally(
_config_change_subscriber(_all_apps);
}

METRIC_CALL_TABLE_INCREMENT_METHOD(
_table_metric_entities, partition_configuration_changes, app.app_id);
METRIC_INCREMENT(_table_metric_entities, partition_configuration_changes, gpid);
if (old_health_status >= HS_WRITABLE_ILL && new_health_status < HS_WRITABLE_ILL) {
METRIC_CALL_TABLE_INCREMENT_METHOD(
_table_metric_entities, unwritable_partition_changes, app.app_id);
METRIC_INCREMENT(_table_metric_entities, unwritable_partition_changes, gpid);
}
if (old_health_status < HS_WRITABLE_ILL && new_health_status >= HS_WRITABLE_ILL) {
METRIC_CALL_TABLE_INCREMENT_METHOD(
_table_metric_entities, writable_partition_changes, app.app_id);
METRIC_INCREMENT(_table_metric_entities, writable_partition_changes, gpid);
}
}

Expand Down Expand Up @@ -2425,18 +2424,13 @@ void server_state::update_partition_metrics()
counters[st]++;
}

METRIC_CALL_TABLE_SET_METHOD(
_table_metric_entities, dead_partitions, app->app_id, counters[HS_DEAD]);
METRIC_CALL_TABLE_SET_METHOD(
_table_metric_entities, unreadable_partitions, app->app_id, counters[HS_UNREADABLE]);
METRIC_CALL_TABLE_SET_METHOD(
_table_metric_entities, unwritable_partitions, app->app_id, counters[HS_UNWRITABLE]);
METRIC_CALL_TABLE_SET_METHOD(_table_metric_entities,
writable_ill_partitions,
app->app_id,
counters[HS_WRITABLE_ILL]);
METRIC_CALL_TABLE_SET_METHOD(
_table_metric_entities, healthy_partitions, app->app_id, counters[HS_HEALTHY]);
METRIC_SET_TABLE_HEALTH_STATS(_table_metric_entities,
app->app_id,
counters[HS_DEAD],
counters[HS_UNREADABLE],
counters[HS_UNWRITABLE],
counters[HS_WRITABLE_ILL],
counters[HS_HEALTHY]);

return true;
};
Expand Down
2 changes: 2 additions & 0 deletions src/meta/server_state.h
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,8 @@ class server_state
task_tracker *tracker() { return &_tracker; }
void wait_all_task() { _tracker.wait_outstanding_tasks(); }

table_metric_entities &get_table_metric_entities() { return _table_metric_entities; }

private:
FRIEND_TEST(backup_service_test, test_invalid_backup_request);

Expand Down
2 changes: 1 addition & 1 deletion src/meta/server_state_restore.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ std::pair<dsn::error_code, std::shared_ptr<app_state>> server_state::restore_app

_all_apps.emplace(app->app_id, app);
_exist_apps.emplace(info.app_name, app);
_table_metric_entities.create_entity(app->app_id);
_table_metric_entities.create_entity(app->app_id, app->partition_count);
}
}
// TODO: using one single env to replace
Expand Down
Loading

0 comments on commit fc2351b

Please sign in to comment.