Skip to content

Commit

Permalink
feat(new_metrics): migrate metrics for replica_stub (part 2) (apache#…
Browse files Browse the repository at this point in the history
…1459)

apache#1454

This is the 2nd part of migrating metrics of `replica_stub` to new framework,
all of which are learn-related.

During this migration, 3 metrics remain server-level: the number of learning
replicas, and the max duration and the max size of files that are copied from
learnee among all learning replicas.

Another 11 metrics are changed from server-level to replica-level, since they
should be observed for each replica. All of them are observed from the view
of learners, namely the potential secondary replica. The learnee is the primary
replica. These metrics include: the number of learns launched by learner, the
number of learn rounds launched by learner (during a learn there might be
multiple rounds), the number of files that are copied from learnee, the size of
files that are copied from learnee, the size of data that are copied from learnee's
buffer, the number of learn responses of `LT_CACHE`, `LT_APP` and `LT_LOG`
type decided by learner with each learn response related to an `RPC_LEARN`
request, the number of times learner resets its local state (since its local state
is newer than learnee's) with each reset related to a learn response of an
`RPC_LEARN` request, the number of failed and successful learns launched by
learner.
  • Loading branch information
empiredan committed Dec 11, 2023
1 parent a5c841e commit 3022109
Show file tree
Hide file tree
Showing 6 changed files with 141 additions and 104 deletions.
75 changes: 74 additions & 1 deletion src/replica/replica.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,68 @@ METRIC_DEFINE_counter(replica,
dsn::metric_unit::kRequests,
"The number of rejected non-idempotent write requests by duplication");

// Replica-level counter of learns started by the learner. It is incremented in
// replica::init_learn (replica_learn.cpp) via METRIC_VAR_INCREMENT(learn_count).
METRIC_DEFINE_counter(
replica,
learn_count,
dsn::metric_unit::kLearns,
"The number of learns launched by learner (i.e. potential secondary replica)");

// Replica-level counter of learn rounds started by the learner. It is
// incremented in replica::init_learn (replica_learn.cpp) at each place where
// learning_round_is_running is set to true.
//
// NOTE: adjacent string literals are concatenated verbatim, so every fragment
// except the last must end with a space; otherwise the words run together
// ("mightbe") in the exported description.
METRIC_DEFINE_counter(replica,
learn_rounds,
dsn::metric_unit::kRounds,
"The number of learn rounds launched by learner (during a learn there might "
"be multiple rounds)");

// Replica-level counter of files copied from the learnee. It is incremented by
// resp.state.files.size() in replica::on_copy_remote_state_completed when
// err == ERR_OK.
METRIC_DEFINE_counter(replica,
learn_copy_files,
dsn::metric_unit::kFiles,
"The number of files that are copied from learnee (i.e. primary replica)");

// Replica-level counter of bytes of files copied from the learnee. It is
// incremented by the copied size in replica::on_copy_remote_state_completed
// when err == ERR_OK.
// The description uses the plural "files" to agree with the verb and with the
// sibling learn_copy_files metric description.
METRIC_DEFINE_counter(replica,
learn_copy_file_bytes,
dsn::metric_unit::kBytes,
"The size of files that are copied from learnee");

// Replica-level counter of bytes copied from the learnee's buffer. It is
// incremented by resp.state.meta.length() in replica::on_learn_reply.
METRIC_DEFINE_counter(replica,
learn_copy_buffer_bytes,
dsn::metric_unit::kBytes,
"The size of data that are copied from learnee's buffer");

// Replica-level counter incremented in replica::on_learn_reply when resp.type
// is learn_type::LT_CACHE.
METRIC_DEFINE_counter(replica,
learn_lt_cache_responses,
dsn::metric_unit::kResponses,
"The number of learn responses of LT_CACHE type decided by learner, with "
"each learn response related to an RPC_LEARN request");

// Replica-level counter incremented in replica::on_learn_reply when resp.type
// is learn_type::LT_APP.
METRIC_DEFINE_counter(replica,
learn_lt_app_responses,
dsn::metric_unit::kResponses,
"The number of learn responses of LT_APP type decided by learner, with each "
"learn response related to an RPC_LEARN request");

// Replica-level counter incremented in replica::on_learn_reply when resp.type
// is learn_type::LT_LOG.
METRIC_DEFINE_counter(replica,
learn_lt_log_responses,
dsn::metric_unit::kResponses,
"The number of learn responses of LT_LOG type decided by learner, with each "
"learn response related to an RPC_LEARN request");

// Replica-level counter of learner local-state resets. It is incremented in
// replica::on_learn_reply just before the app is closed for the reset.
// The description's article is corrected to "a learn response"; each literal
// fragment keeps its trailing space so the concatenated text stays readable.
METRIC_DEFINE_counter(replica,
learn_resets,
dsn::metric_unit::kResets,
"The number of times learner resets its local state (since its local state "
"is newer than learnee's), with each reset related to a learn response of "
"an RPC_LEARN request");

// Replica-level counter of failed learns. It is incremented in
// replica::handle_learning_error before the local configuration is updated.
METRIC_DEFINE_counter(replica,
learn_failed_count,
dsn::metric_unit::kLearns,
"The number of failed learns launched by learner");

// Replica-level counter of successful learns. It is incremented in
// replica::on_learn_completion_notification_reply.
METRIC_DEFINE_counter(replica,
learn_successful_count,
dsn::metric_unit::kLearns,
"The number of successful learns launched by learner");

namespace dsn {
namespace replication {

Expand Down Expand Up @@ -186,7 +248,18 @@ replica::replica(replica_stub *stub,
METRIC_VAR_INIT_replica(splitting_rejected_write_requests),
METRIC_VAR_INIT_replica(splitting_rejected_read_requests),
METRIC_VAR_INIT_replica(bulk_load_ingestion_rejected_write_requests),
METRIC_VAR_INIT_replica(dup_rejected_non_idempotent_write_requests)
METRIC_VAR_INIT_replica(dup_rejected_non_idempotent_write_requests),
METRIC_VAR_INIT_replica(learn_count),
METRIC_VAR_INIT_replica(learn_rounds),
METRIC_VAR_INIT_replica(learn_copy_files),
METRIC_VAR_INIT_replica(learn_copy_file_bytes),
METRIC_VAR_INIT_replica(learn_copy_buffer_bytes),
METRIC_VAR_INIT_replica(learn_lt_cache_responses),
METRIC_VAR_INIT_replica(learn_lt_app_responses),
METRIC_VAR_INIT_replica(learn_lt_log_responses),
METRIC_VAR_INIT_replica(learn_resets),
METRIC_VAR_INIT_replica(learn_failed_count),
METRIC_VAR_INIT_replica(learn_successful_count)
{
CHECK(!_app_info.app_type.empty(), "");
CHECK_NOTNULL(stub, "");
Expand Down
12 changes: 12 additions & 0 deletions src/replica/replica.h
Original file line number Diff line number Diff line change
Expand Up @@ -657,6 +657,18 @@ class replica : public serverlet<replica>, public ref_counter, public replica_ba
METRIC_VAR_DECLARE_counter(dup_rejected_non_idempotent_write_requests);
std::vector<perf_counter *> _counters_table_level_latency;

METRIC_VAR_DECLARE_counter(learn_count);
METRIC_VAR_DECLARE_counter(learn_rounds);
METRIC_VAR_DECLARE_counter(learn_copy_files);
METRIC_VAR_DECLARE_counter(learn_copy_file_bytes);
METRIC_VAR_DECLARE_counter(learn_copy_buffer_bytes);
METRIC_VAR_DECLARE_counter(learn_lt_cache_responses);
METRIC_VAR_DECLARE_counter(learn_lt_app_responses);
METRIC_VAR_DECLARE_counter(learn_lt_log_responses);
METRIC_VAR_DECLARE_counter(learn_resets);
METRIC_VAR_DECLARE_counter(learn_failed_count);
METRIC_VAR_DECLARE_counter(learn_successful_count);

dsn::task_tracker _tracker;
// the thread access checker
dsn::thread_access_checker _checker;
Expand Down
39 changes: 25 additions & 14 deletions src/replica/replica_learn.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,6 @@
#include "mutation.h"
#include "mutation_log.h"
#include "nfs/nfs_node.h"
#include "perf_counter/perf_counter.h"
#include "perf_counter/perf_counter_wrapper.h"
#include "replica.h"
#include "replica/duplication/replica_duplicator_manager.h"
#include "replica/prepare_list.h"
Expand All @@ -72,8 +70,21 @@
#include "utils/filesystem.h"
#include "utils/flags.h"
#include "utils/fmt_logging.h"
#include "utils/metrics.h"
#include "utils/thread_access_checker.h"

// Learn-related replica-level counters used by the METRIC_VAR_INCREMENT* calls
// in this file. Their METRIC_DEFINE_counter definitions live in replica.cpp.
// NOTE(review): presumably these declarations make the metric prototypes
// visible to this translation unit — confirm against utils/metrics.h.
METRIC_DECLARE_counter(learn_count);
METRIC_DECLARE_counter(learn_rounds);
METRIC_DECLARE_counter(learn_copy_files);
METRIC_DECLARE_counter(learn_copy_file_bytes);
METRIC_DECLARE_counter(learn_copy_buffer_bytes);
METRIC_DECLARE_counter(learn_lt_cache_responses);
METRIC_DECLARE_counter(learn_lt_app_responses);
METRIC_DECLARE_counter(learn_lt_log_responses);
METRIC_DECLARE_counter(learn_resets);
METRIC_DECLARE_counter(learn_failed_count);
METRIC_DECLARE_counter(learn_successful_count);

namespace dsn {
namespace replication {

Expand Down Expand Up @@ -131,7 +142,7 @@ void replica::init_learn(uint64_t signature)
return;
}

_stub->_counter_replicas_learning_recent_start_count->increment();
METRIC_VAR_INCREMENT(learn_count);

_potential_secondary_states.learning_version = signature;
_potential_secondary_states.learning_start_ts_ns = dsn_now_ns();
Expand Down Expand Up @@ -171,7 +182,7 @@ void replica::init_learn(uint64_t signature)

// missed ones need to be loaded via private logs
else {
_stub->_counter_replicas_learning_recent_round_start_count->increment();
METRIC_VAR_INCREMENT(learn_rounds);
_potential_secondary_states.learning_round_is_running = true;
_potential_secondary_states.catchup_with_private_log_task =
tasking::create_task(LPC_CATCHUP_WITH_PRIVATE_LOGS,
Expand Down Expand Up @@ -225,7 +236,7 @@ void replica::init_learn(uint64_t signature)
return;
}

_stub->_counter_replicas_learning_recent_round_start_count->increment();
METRIC_VAR_INCREMENT(learn_rounds);
_potential_secondary_states.learning_round_is_running = true;

learn_request request;
Expand Down Expand Up @@ -596,7 +607,7 @@ void replica::on_learn_reply(error_code err, learn_request &&req, learn_response
enum_to_string(_potential_secondary_states.learning_status));

_potential_secondary_states.learning_copy_buffer_size += resp.state.meta.length();
_stub->_counter_replicas_learning_recent_copy_buffer_size->add(resp.state.meta.length());
METRIC_VAR_INCREMENT_BY(learn_copy_buffer_bytes, resp.state.meta.length());

if (resp.err != ERR_OK) {
if (resp.err == ERR_INACTIVE_STATE || resp.err == ERR_INCONSISTENT_STATE) {
Expand Down Expand Up @@ -644,7 +655,7 @@ void replica::on_learn_reply(error_code err, learn_request &&req, learn_response
_app->last_committed_decree(),
resp.last_committed_decree);

_stub->_counter_replicas_learning_recent_learn_reset_count->increment();
METRIC_VAR_INCREMENT(learn_resets);

// close app
auto err = _app->close(true);
Expand Down Expand Up @@ -737,13 +748,13 @@ void replica::on_learn_reply(error_code err, learn_request &&req, learn_response

switch (resp.type) {
case learn_type::LT_CACHE:
_stub->_counter_replicas_learning_recent_learn_cache_count->increment();
METRIC_VAR_INCREMENT(learn_lt_cache_responses);
break;
case learn_type::LT_APP:
_stub->_counter_replicas_learning_recent_learn_app_count->increment();
METRIC_VAR_INCREMENT(learn_lt_app_responses);
break;
case learn_type::LT_LOG:
_stub->_counter_replicas_learning_recent_learn_log_count->increment();
METRIC_VAR_INCREMENT(learn_lt_log_responses);
break;
default:
// do nothing
Expand Down Expand Up @@ -1038,8 +1049,8 @@ void replica::on_copy_remote_state_completed(error_code err,
if (err == ERR_OK) {
_potential_secondary_states.learning_copy_file_count += resp.state.files.size();
_potential_secondary_states.learning_copy_file_size += size;
_stub->_counter_replicas_learning_recent_copy_file_count->add(resp.state.files.size());
_stub->_counter_replicas_learning_recent_copy_file_size->add(size);
METRIC_VAR_INCREMENT_BY(learn_copy_files, resp.state.files.size());
METRIC_VAR_INCREMENT_BY(learn_copy_file_bytes, size);
}

if (err != ERR_OK) {
Expand Down Expand Up @@ -1240,7 +1251,7 @@ void replica::handle_learning_error(error_code err, bool is_local_error)
}
}

_stub->_counter_replicas_learning_recent_learn_fail_count->increment();
METRIC_VAR_INCREMENT(learn_failed_count);

update_local_configuration_with_no_ballot_change(
is_local_error ? partition_status::PS_ERROR : partition_status::PS_INACTIVE);
Expand Down Expand Up @@ -1404,7 +1415,7 @@ void replica::on_learn_completion_notification_reply(error_code err,
handle_learning_error(resp.err, false);
}
} else {
_stub->_counter_replicas_learning_recent_learn_succ_count->increment();
METRIC_VAR_INCREMENT(learn_successful_count);
}
}

Expand Down
98 changes: 23 additions & 75 deletions src/replica/replica_stub.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,22 @@ METRIC_DEFINE_gauge_int64(server,
dsn::metric_unit::kReplicas,
"The number of closing replicas");

// Server-level gauges describing learning replicas. All three are refreshed in
// replica_stub::on_gc via METRIC_VAR_SET.

// Set from learning_count.
METRIC_DEFINE_gauge_int64(server,
learning_replicas,
dsn::metric_unit::kReplicas,
"The number of learning replicas");

// Set from learning_max_duration_time_ms.
METRIC_DEFINE_gauge_int64(server,
learning_replicas_max_duration_ms,
dsn::metric_unit::kMilliSeconds,
"The max duration among all learning replicas");

// Set from learning_max_copy_file_size.
METRIC_DEFINE_gauge_int64(
server,
learning_replicas_max_copy_file_bytes,
dsn::metric_unit::kBytes,
"The max size of files that are copied from learnee among all learning replicas");

namespace dsn {
namespace replication {
DSN_DEFINE_bool(replication,
Expand Down Expand Up @@ -221,7 +237,10 @@ replica_stub::replica_stub(replica_state_subscriber subscriber /*= nullptr*/,
_is_running(false),
METRIC_VAR_INIT_server(total_replicas),
METRIC_VAR_INIT_server(opening_replicas),
METRIC_VAR_INIT_server(closing_replicas)
METRIC_VAR_INIT_server(closing_replicas),
METRIC_VAR_INIT_server(learning_replicas),
METRIC_VAR_INIT_server(learning_replicas_max_duration_ms),
METRIC_VAR_INIT_server(learning_replicas_max_copy_file_bytes)
{
#ifdef DSN_ENABLE_GPERF
_is_releasing_memory = false;
Expand All @@ -239,77 +258,6 @@ replica_stub::~replica_stub(void) { close(); }

void replica_stub::install_perf_counters()
{
_counter_replicas_learning_count.init_app_counter("eon.replica_stub",
"replicas.learning.count",
COUNTER_TYPE_NUMBER,
"current learning count");
_counter_replicas_learning_max_duration_time_ms.init_app_counter(
"eon.replica_stub",
"replicas.learning.max.duration.time(ms)",
COUNTER_TYPE_NUMBER,
"current learning max duration time(ms)");
_counter_replicas_learning_max_copy_file_size.init_app_counter(
"eon.replica_stub",
"replicas.learning.max.copy.file.size",
COUNTER_TYPE_NUMBER,
"current learning max copy file size");
_counter_replicas_learning_recent_start_count.init_app_counter(
"eon.replica_stub",
"replicas.learning.recent.start.count",
COUNTER_TYPE_VOLATILE_NUMBER,
"current learning start count in the recent period");
_counter_replicas_learning_recent_round_start_count.init_app_counter(
"eon.replica_stub",
"replicas.learning.recent.round.start.count",
COUNTER_TYPE_VOLATILE_NUMBER,
"learning round start count in the recent period");
_counter_replicas_learning_recent_copy_file_count.init_app_counter(
"eon.replica_stub",
"replicas.learning.recent.copy.file.count",
COUNTER_TYPE_VOLATILE_NUMBER,
"learning copy file count in the recent period");
_counter_replicas_learning_recent_copy_file_size.init_app_counter(
"eon.replica_stub",
"replicas.learning.recent.copy.file.size",
COUNTER_TYPE_VOLATILE_NUMBER,
"learning copy file size in the recent period");
_counter_replicas_learning_recent_copy_buffer_size.init_app_counter(
"eon.replica_stub",
"replicas.learning.recent.copy.buffer.size",
COUNTER_TYPE_VOLATILE_NUMBER,
"learning copy buffer size in the recent period");
_counter_replicas_learning_recent_learn_cache_count.init_app_counter(
"eon.replica_stub",
"replicas.learning.recent.learn.cache.count",
COUNTER_TYPE_VOLATILE_NUMBER,
"learning LT_CACHE count in the recent period");
_counter_replicas_learning_recent_learn_app_count.init_app_counter(
"eon.replica_stub",
"replicas.learning.recent.learn.app.count",
COUNTER_TYPE_VOLATILE_NUMBER,
"learning LT_APP count in the recent period");
_counter_replicas_learning_recent_learn_log_count.init_app_counter(
"eon.replica_stub",
"replicas.learning.recent.learn.log.count",
COUNTER_TYPE_VOLATILE_NUMBER,
"learning LT_LOG count in the recent period");
_counter_replicas_learning_recent_learn_reset_count.init_app_counter(
"eon.replica_stub",
"replicas.learning.recent.learn.reset.count",
COUNTER_TYPE_VOLATILE_NUMBER,
"learning reset count in the recent period"
"for the reason of resp.last_committed_decree < _app->last_committed_decree()");
_counter_replicas_learning_recent_learn_fail_count.init_app_counter(
"eon.replica_stub",
"replicas.learning.recent.learn.fail.count",
COUNTER_TYPE_VOLATILE_NUMBER,
"learning fail count in the recent period");
_counter_replicas_learning_recent_learn_succ_count.init_app_counter(
"eon.replica_stub",
"replicas.learning.recent.learn.succ.count",
COUNTER_TYPE_VOLATILE_NUMBER,
"learning succeed count in the recent period");

_counter_replicas_recent_prepare_fail_count.init_app_counter(
"eon.replica_stub",
"replicas.recent.prepare.fail.count",
Expand Down Expand Up @@ -2016,9 +1964,9 @@ void replica_stub::on_gc()
}
}

_counter_replicas_learning_count->set(learning_count);
_counter_replicas_learning_max_duration_time_ms->set(learning_max_duration_time_ms);
_counter_replicas_learning_max_copy_file_size->set(learning_max_copy_file_size);
METRIC_VAR_SET(learning_replicas, learning_count);
METRIC_VAR_SET(learning_replicas_max_duration_ms, learning_max_duration_time_ms);
METRIC_VAR_SET(learning_replicas_max_copy_file_bytes, learning_max_copy_file_size);
_counter_cold_backup_running_count->set(cold_backup_running_count);
_counter_cold_backup_max_duration_time_ms->set(cold_backup_max_duration_time_ms);
_counter_cold_backup_max_upload_file_size->set(cold_backup_max_upload_file_size);
Expand Down
17 changes: 3 additions & 14 deletions src/replica/replica_stub.h
Original file line number Diff line number Diff line change
Expand Up @@ -534,20 +534,9 @@ class replica_stub : public serverlet<replica_stub>, public ref_counter
METRIC_VAR_DECLARE_gauge_int64(opening_replicas);
METRIC_VAR_DECLARE_gauge_int64(closing_replicas);

perf_counter_wrapper _counter_replicas_learning_count;
perf_counter_wrapper _counter_replicas_learning_max_duration_time_ms;
perf_counter_wrapper _counter_replicas_learning_max_copy_file_size;
perf_counter_wrapper _counter_replicas_learning_recent_start_count;
perf_counter_wrapper _counter_replicas_learning_recent_round_start_count;
perf_counter_wrapper _counter_replicas_learning_recent_copy_file_count;
perf_counter_wrapper _counter_replicas_learning_recent_copy_file_size;
perf_counter_wrapper _counter_replicas_learning_recent_copy_buffer_size;
perf_counter_wrapper _counter_replicas_learning_recent_learn_cache_count;
perf_counter_wrapper _counter_replicas_learning_recent_learn_app_count;
perf_counter_wrapper _counter_replicas_learning_recent_learn_log_count;
perf_counter_wrapper _counter_replicas_learning_recent_learn_reset_count;
perf_counter_wrapper _counter_replicas_learning_recent_learn_fail_count;
perf_counter_wrapper _counter_replicas_learning_recent_learn_succ_count;
METRIC_VAR_DECLARE_gauge_int64(learning_replicas);
METRIC_VAR_DECLARE_gauge_int64(learning_replicas_max_duration_ms);
METRIC_VAR_DECLARE_gauge_int64(learning_replicas_max_copy_file_bytes);

perf_counter_wrapper _counter_replicas_recent_prepare_fail_count;
perf_counter_wrapper _counter_replicas_recent_replica_move_error_count;
Expand Down
4 changes: 4 additions & 0 deletions src/utils/metrics.h
Original file line number Diff line number Diff line change
Expand Up @@ -657,6 +657,7 @@ enum class metric_unit : size_t
kReplicas,
kServers,
kRequests,
kResponses,
kSeeks,
kPointLookups,
kValues,
Expand All @@ -670,6 +671,9 @@ enum class metric_unit : size_t
kOperations,
kTasks,
kDisconnections,
kLearns,
kRounds,
kResets,
kInvalidUnit,
};

Expand Down

0 comments on commit 3022109

Please sign in to comment.