Skip to content

Commit

Permalink
feat(new_metrics): migrate built-in server-level metrics (#1418)
Browse files Browse the repository at this point in the history
#1321

Migrate the built-in server-level metrics to the new metrics framework, namely
the total amount of virtual and physical (resident) memory usage in MB. Both
metrics are wrapped in `builtin_metrics` and updated periodically by a timer,
which is started and stopped along with the meta/replica servers.

Since `dsn_perf_counter_test` depends on both metrics, which have been
removed from perf-counters, the test is disabled for now.
  • Loading branch information
empiredan committed Dec 11, 2023
1 parent b1730fb commit 2d362f1
Show file tree
Hide file tree
Showing 8 changed files with 169 additions and 98 deletions.
18 changes: 14 additions & 4 deletions .github/workflows/lint_and_test_cpp.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,9 @@ jobs:
- dsn_meta_state_tests
- dsn.meta.test
- dsn_nfs_test
- dsn_perf_counter_test
# TODO(wangdan): Since builtin_counters (memused.virt and memused.res) for perf-counters
# have been removed and dsn_perf_counter_test depends on them, disable it.
# - dsn_perf_counter_test
- dsn_replica_backup_test
- dsn_replica_bulk_load_test
- dsn_replica_dup_test
Expand Down Expand Up @@ -352,7 +354,9 @@ jobs:
- dsn_meta_state_tests
- dsn.meta.test
- dsn_nfs_test
- dsn_perf_counter_test
# TODO(wangdan): Since builtin_counters (memused.virt and memused.res) for perf-counters
# have been removed and dsn_perf_counter_test depends on them, disable it.
# - dsn_perf_counter_test
- dsn_replica_backup_test
- dsn_replica_bulk_load_test
- dsn_replica_dup_test
Expand Down Expand Up @@ -485,7 +489,11 @@ jobs:
# - base_api_test
# - base_test
# - bulk_load_test
# - detect_hotspot_test
# # TODO(wangdan): Since the hotspot detection depends on the perf-counters system which
# # is being replaced with the new metrics system, its test will fail. Temporarily disable
# # the test and re-enable it after the hotspot detection is migrated to the new metrics
# # system.
# # - detect_hotspot_test
# - dsn_aio_test
# - dsn_block_service_test
# - dsn_client_test
Expand All @@ -494,7 +502,9 @@ jobs:
# - dsn_meta_state_tests
# - dsn.meta.test
# - dsn_nfs_test
# - dsn_perf_counter_test
# # TODO(wangdan): Since builtin_counters (memused.virt and memused.res) for perf-counters
# # have been removed and dsn_perf_counter_test depends on them, disable it.
# # - dsn_perf_counter_test
# - dsn_replica_backup_test
# - dsn_replica_bulk_load_test
# - dsn_replica_dup_test
Expand Down
4 changes: 3 additions & 1 deletion run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,9 @@ function run_test()
dsn_meta_state_tests
dsn.meta.test
dsn_nfs_test
dsn_perf_counter_test
# TODO(wangdan): Since builtin_counters (memused.virt and memused.res) for perf-counters
# have been removed and dsn_perf_counter_test depends on them, disable it.
# dsn_perf_counter_test
dsn_replica_backup_test
dsn_replica_bulk_load_test
dsn_replica_dup_test
Expand Down
57 changes: 0 additions & 57 deletions src/perf_counter/builtin_counters.cpp

This file was deleted.

3 changes: 0 additions & 3 deletions src/perf_counter/perf_counters.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@
#include <sstream>
#include <utility>

#include "builtin_counters.h"
#include "perf_counter/perf_counter.h"
#include "perf_counter/perf_counter_atomic.h"
#include "perf_counter/perf_counter_utils.h"
Expand Down Expand Up @@ -367,8 +366,6 @@ std::string perf_counters::list_snapshot_by_literal(

void perf_counters::take_snapshot()
{
builtin_counters::instance().update_counters();

std::vector<perf_counter_ptr> all_counters;
get_all_counters(&all_counters);

Expand Down
58 changes: 35 additions & 23 deletions src/server/pegasus_service_app.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <pegasus/version.h>
#include <pegasus/git_commit.h>
#include "reporter/pegasus_counter_reporter.h"
#include "utils/builtin_metrics.h"

namespace pegasus {
namespace server {
Expand All @@ -32,8 +33,7 @@ class pegasus_replication_service_app : public ::dsn::replication::replication_s
{
public:
pegasus_replication_service_app(const dsn::service_app_info *info)
: ::dsn::replication::replication_service_app::replication_service_app(info),
_updater_started(false)
: ::dsn::replication::replication_service_app::replication_service_app(info)
{
}

Expand All @@ -43,33 +43,39 @@ class pegasus_replication_service_app : public ::dsn::replication::replication_s
std::vector<std::string> args_new(args);
args_new.emplace_back(PEGASUS_VERSION);
args_new.emplace_back(PEGASUS_GIT_COMMIT);
::dsn::error_code ret = ::dsn::replication::replication_service_app::start(args_new);

if (ret == ::dsn::ERR_OK) {
pegasus_counter_reporter::instance().start();
_updater_started = true;
}
return ret;
// Actually the root caller, start_app() in service_control_task::exec() will also do
// CHECK for ERR_OK. Do CHECK here to guarantee that all following services (such as
// built-in metrics) are started.
CHECK_EQ(::dsn::replication::replication_service_app::start(args_new), ::dsn::ERR_OK);

// TODO(wangdan): remove after all metrics have been migrated.
pegasus_counter_reporter::instance().start();

_builtin_metrics.start();
return ::dsn::ERR_OK;
}

virtual ::dsn::error_code stop(bool cleanup = false) override
{
::dsn::error_code ret = ::dsn::replication::replication_service_app::stop();
if (_updater_started) {
pegasus_counter_reporter::instance().stop();
}

// TODO(wangdan): remove after all metrics have been migrated.
pegasus_counter_reporter::instance().stop();

_builtin_metrics.stop();
return ret;
}

private:
bool _updater_started;
dsn::builtin_metrics _builtin_metrics;
};

class pegasus_meta_service_app : public ::dsn::service::meta_service_app
{
public:
pegasus_meta_service_app(const dsn::service_app_info *info)
: ::dsn::service::meta_service_app::meta_service_app(info), _updater_started(false)
: ::dsn::service::meta_service_app::meta_service_app(info)
{
}

Expand All @@ -79,26 +85,32 @@ class pegasus_meta_service_app : public ::dsn::service::meta_service_app
std::vector<std::string> args_new(args);
args_new.emplace_back(PEGASUS_VERSION);
args_new.emplace_back(PEGASUS_GIT_COMMIT);
::dsn::error_code ret = ::dsn::service::meta_service_app::start(args_new);

if (ret == ::dsn::ERR_OK) {
pegasus_counter_reporter::instance().start();
_updater_started = true;
}
return ret;
// Actually the root caller, start_app() in service_control_task::exec() will also do
// CHECK for ERR_OK. Do CHECK here to guarantee that all following services (such as
// built-in metrics) are started.
CHECK_EQ(::dsn::service::meta_service_app::start(args_new), ::dsn::ERR_OK);

// TODO(wangdan): remove after all metrics have been migrated.
pegasus_counter_reporter::instance().start();

_builtin_metrics.start();
return ::dsn::ERR_OK;
}

virtual ::dsn::error_code stop(bool cleanup = false) override
{
::dsn::error_code ret = ::dsn::service::meta_service_app::stop();
if (_updater_started) {
pegasus_counter_reporter::instance().stop();
}

// TODO(wangdan): remove after all metrics have been migrated.
pegasus_counter_reporter::instance().stop();

_builtin_metrics.stop();
return ret;
}

private:
bool _updater_started;
dsn::builtin_metrics _builtin_metrics;
};

} // namespace server
Expand Down
92 changes: 92 additions & 0 deletions src/utils/builtin_metrics.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "utils/builtin_metrics.h"

#include <stdint.h>
#include <functional>

#include "utils/autoref_ptr.h"
#include "utils/flags.h"
#include "utils/fmt_logging.h"
#include "utils/process_utils.h"
#include "utils/string_view.h"

// Defines the int64 gauge `virtual_mem_usage_mb` for the `server` entity, with megabytes
// as its unit. Its value is set by builtin_metrics::update() in this file.
METRIC_DEFINE_gauge_int64(server,
virtual_mem_usage_mb,
dsn::metric_unit::kMegaBytes,
"The total amount of virtual memory usage in MB");

// Defines the int64 gauge `resident_mem_usage_mb` for the `server` entity, with megabytes
// as its unit. Its value is set by builtin_metrics::update() in this file.
METRIC_DEFINE_gauge_int64(server,
resident_mem_usage_mb,
dsn::metric_unit::kMegaBytes,
"The total amount of physical memory usage in MB");

namespace dsn {

// Flag `builtin_metrics_update_interval_ms` in the `metrics` section; the default is
// 10 * 1000 ms (10 seconds). It is read as FLAGS_builtin_metrics_update_interval_ms in
// builtin_metrics::start() and passed to the timer as its interval.
DSN_DEFINE_uint64(metrics,
builtin_metrics_update_interval_ms,
10 * 1000,
"The interval (milliseconds) at which builtin metrics are updated.");

// Initializes both memory-usage metric variables via METRIC_VAR_INIT_server. The timer is
// not created here; `_timer` stays empty until start() is called.
builtin_metrics::builtin_metrics()
: METRIC_VAR_INIT_server(virtual_mem_usage_mb), METRIC_VAR_INIT_server(resident_mem_usage_mb)
{
}

// The destructor does not stop the timer itself: it only asserts that `_timer` is already
// empty, i.e. that every start() has been matched by a stop() before destruction.
builtin_metrics::~builtin_metrics()
{
CHECK(!_timer, "timer should have been destroyed by stop()");
}

// Close callback handed to the timer in start(); intentionally a no-op.
void builtin_metrics::on_close() {}

void builtin_metrics::start()
{
CHECK(!_timer, "timer should not have been initialized before start()");

_timer.reset(new metric_timer(FLAGS_builtin_metrics_update_interval_ms,
std::bind(&builtin_metrics::update, this),
std::bind(&builtin_metrics::on_close, this)));
}

// Shuts down the timer created by start() and releases it.
//
// Must only be called after start(): the CHECK below fails if no timer exists.
void builtin_metrics::stop()
{
    CHECK(_timer, "timer should have been initialized before stop()");

    // Ask the timer to close, then block until it has finished, so that the shutdown is
    // synchronous from the caller's point of view.
    _timer->close();
    _timer->wait();

    // Drop the timer; an empty `_timer` marks the stopped state, from which start() may be
    // called again.
    _timer = nullptr;
}

// Timer callback: samples the memory usage of the process via utils::process_mem_usage(),
// converts both values to MB and publishes them to the `virtual_mem_usage_mb` and
// `resident_mem_usage_mb` gauges, logging the result as well.
void builtin_metrics::update()
{
    // Zero-initialize the out-parameters. Should process_mem_usage() ever return without
    // writing to them (e.g. on a failure path), the casts below would otherwise read
    // indeterminate values, which is undefined behaviour; with this, 0 MB is reported.
    double vm_usage = 0.0;
    double resident_set = 0.0;
    utils::process_mem_usage(vm_usage, resident_set);

    // `>> 10` divides by 1024.
    // NOTE(review): this presumes process_mem_usage() reports KB, so that the results are
    // in MB as the metric names state -- confirm against utils/process_utils.h.
    const auto virt_mb = static_cast<uint64_t>(vm_usage) >> 10;
    const auto res_mb = static_cast<uint64_t>(resident_set) >> 10;

    METRIC_VAR_SET(virtual_mem_usage_mb, virt_mb);
    METRIC_VAR_SET(resident_mem_usage_mb, res_mb);
    LOG_INFO("virt = {} MB, res = {} MB", virt_mb, res_mb);
}

} // namespace dsn
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,34 @@
// specific language governing permissions and limitations
// under the License.

#include "perf_counter_wrapper.h"
#include "utils/singleton.h"
#pragma once

#include <memory>

#include "utils/metrics.h"
#include "utils/ports.h"

namespace dsn {
class builtin_counters : public dsn::utils::singleton<builtin_counters>

class builtin_metrics
{
public:
builtin_counters();
~builtin_counters();
void update_counters();
builtin_metrics();
~builtin_metrics();

void start();
void stop();

private:
dsn::perf_counter_wrapper _memused_virt;
dsn::perf_counter_wrapper _memused_res;
void on_close();
void update();

METRIC_VAR_DECLARE_gauge_int64(virtual_mem_usage_mb);
METRIC_VAR_DECLARE_gauge_int64(resident_mem_usage_mb);

std::unique_ptr<metric_timer> _timer;

DISALLOW_COPY_AND_ASSIGN(builtin_metrics);
};
}

} // namespace dsn
2 changes: 1 addition & 1 deletion src/utils/metrics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ namespace dsn {
DSN_DEFINE_uint64(metrics,
entity_retirement_delay_ms,
10 * 60 * 1000,
"The retention internal (milliseconds) for an entity after it becomes stale.");
"The retention interval (milliseconds) for an entity after it becomes stale.");

metric_entity::metric_entity(const metric_entity_prototype *prototype,
const std::string &id,
Expand Down

0 comments on commit 2d362f1

Please sign in to comment.