Skip to content

Commit

Permalink
feat(new_metrics): migrate partition-level metrics for partition_guar…
Browse files Browse the repository at this point in the history
…dian (#1440)

#1331

In perf counters, there is only one metric for partition_guardian, namely
the number of operations that fail to choose the primary replica, which
is server-level. In the new metrics it is changed to partition-level,
since this shows which partitions fail to choose primaries and how
frequently that happens. Table-level or server-level metrics can still
be computed by aggregating the partition-level ones.
  • Loading branch information
empiredan authored and wangdan committed May 5, 2023
1 parent e83402b commit af3cad2
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 11 deletions.
14 changes: 6 additions & 8 deletions src/meta/partition_guardian.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,11 @@
#include "meta/meta_data.h"
#include "meta/meta_service.h"
#include "meta/server_load_balancer.h"
#include "perf_counter/perf_counter.h"
#include "meta/server_state.h"
#include "meta/table_metrics.h"
#include "utils/flags.h"
#include "utils/fmt_logging.h"
#include "utils/metrics.h"
#include "utils/string_conv.h"
#include "utils/strings.h"
#include "utils/time_utils.h"
Expand All @@ -53,12 +55,6 @@ partition_guardian::partition_guardian(meta_service *svc) : _svc(svc)
} else {
_replica_assign_delay_ms_for_dropouts = 0;
}

_recent_choose_primary_fail_count.init_app_counter(
"eon.server_load_balancer",
"recent_choose_primary_fail_count",
COUNTER_TYPE_VOLATILE_NUMBER,
"choose primary fail count in the recent period");
}

pc_status partition_guardian::cure(meta_view view,
Expand Down Expand Up @@ -452,7 +448,9 @@ pc_status partition_guardian::on_missing_primary(meta_view &view, const dsn::gpi
LOG_WARNING("{}: don't select any node for security reason, administrator can select "
"a proper one by shell",
gpid_name);
_recent_choose_primary_fail_count->increment();
METRIC_INCREMENT(_svc->get_server_state()->get_table_metric_entities(),
choose_primary_failed_operations,
gpid);
ddd_partition_info pinfo;
pinfo.config = pc;
for (int i = 0; i < cc.dropped.size(); ++i) {
Expand Down
2 changes: 0 additions & 2 deletions src/meta/partition_guardian.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
#include "dsn.layer2_types.h"
#include "meta_admin_types.h"
#include "meta_data.h"
#include "perf_counter/perf_counter_wrapper.h"
#include "runtime/rpc/rpc_address.h"
#include "utils/command_manager.h"
#include "utils/zlocks.h"
Expand Down Expand Up @@ -91,7 +90,6 @@ class partition_guardian
}

meta_service *_svc;
perf_counter_wrapper _recent_choose_primary_fail_count;

mutable zlock _ddd_partitions_lock; // [
std::map<gpid, ddd_partition_info> _ddd_partitions;
Expand Down
8 changes: 7 additions & 1 deletion src/meta/table_metrics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,11 @@ METRIC_DEFINE_counter(partition,
dsn::metric_unit::kOperations,
"The number of balance operations by greedy balancer that copy secondaries");

METRIC_DEFINE_counter(partition,
choose_primary_failed_operations,
dsn::metric_unit::kOperations,
"The number of operations that fail to choose the primary replica");

METRIC_DEFINE_entity(table);

// The number of partitions in each status, see `health_status` and `partition_health_status()`
Expand Down Expand Up @@ -133,7 +138,8 @@ partition_metrics::partition_metrics(int32_t table_id, int32_t partition_id)
METRIC_VAR_INIT_partition(greedy_recent_balance_operations),
METRIC_VAR_INIT_partition(greedy_move_primary_operations),
METRIC_VAR_INIT_partition(greedy_copy_primary_operations),
METRIC_VAR_INIT_partition(greedy_copy_secondary_operations)
METRIC_VAR_INIT_partition(greedy_copy_secondary_operations),
METRIC_VAR_INIT_partition(choose_primary_failed_operations)
{
}

Expand Down
5 changes: 5 additions & 0 deletions src/meta/table_metrics.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ class partition_metrics
METRIC_DEFINE_INCREMENT_BY(greedy_copy_primary_operations)
METRIC_DEFINE_INCREMENT_BY(greedy_copy_secondary_operations)

METRIC_DEFINE_INCREMENT(choose_primary_failed_operations)

private:
const int32_t _table_id;
const int32_t _partition_id;
Expand All @@ -64,6 +66,7 @@ class partition_metrics
METRIC_VAR_DECLARE_counter(greedy_move_primary_operations);
METRIC_VAR_DECLARE_counter(greedy_copy_primary_operations);
METRIC_VAR_DECLARE_counter(greedy_copy_secondary_operations);
METRIC_VAR_DECLARE_counter(choose_primary_failed_operations);

DISALLOW_COPY_AND_ASSIGN(partition_metrics);
};
Expand Down Expand Up @@ -112,6 +115,7 @@ class table_metrics
__METRIC_DEFINE_INCREMENT(partition_configuration_changes)
__METRIC_DEFINE_INCREMENT(unwritable_partition_changes)
__METRIC_DEFINE_INCREMENT(writable_partition_changes)
__METRIC_DEFINE_INCREMENT(choose_primary_failed_operations)

#undef __METRIC_DEFINE_INCREMENT

Expand Down Expand Up @@ -221,6 +225,7 @@ class table_metric_entities
__METRIC_DEFINE_INCREMENT(partition_configuration_changes)
__METRIC_DEFINE_INCREMENT(unwritable_partition_changes)
__METRIC_DEFINE_INCREMENT(writable_partition_changes)
__METRIC_DEFINE_INCREMENT(choose_primary_failed_operations)

#undef __METRIC_DEFINE_INCREMENT

Expand Down

0 comments on commit af3cad2

Please sign in to comment.