Skip to content
This repository has been archived by the owner on Jun 23, 2022. It is now read-only.

feat: restrict the replication factor while creating app #963

Merged
merged 9 commits into from
Dec 15, 2021
6 changes: 6 additions & 0 deletions src/meta/meta_service.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1186,5 +1186,11 @@ void meta_service::on_query_backup_status(query_backup_status_rpc rpc)
_backup_handler->query_backup_status(std::move(rpc));
}

// Reports how many entries _alive_set currently holds.
// The size is read while a zauto_lock on _failure_detector->_lock is in scope.
size_t meta_service::get_alive_node_count() const
{
    zauto_lock guard(_failure_detector->_lock);
    const size_t alive_count = _alive_set.size();
    return alive_count;
}

} // namespace replication
} // namespace dsn
2 changes: 2 additions & 0 deletions src/meta/meta_service.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@ class meta_service : public serverlet<meta_service>

dsn::task_tracker *tracker() { return &_tracker; }

size_t get_alive_node_count() const;

private:
void register_rpc_handlers();
void register_ctrl_commands();
Expand Down
69 changes: 68 additions & 1 deletion src/meta/server_state.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,17 @@ using namespace dsn;
namespace dsn {
namespace replication {

// Fixed upper bound for a requested replica count. It is a compile-time constant rather
// than a flag, and it also caps the values accepted for the min_allowed_replica_count flag
// (see that flag's validator).
const int32_t max_allowed_replica_count = 15;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why isn't the max a dynamic (configurable) variable like the min?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Initially max_allowed_replica_count was configurable. Then, as discussed with @hycdong above, it was changed to a const value rather than a configurable one, since in normal cases it will not be updated.

Copy link
Contributor

@Smityz Smityz Nov 26, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By the way, the const int should be named like k...; please refer to the Google code style.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed. I'd considered following the Google code style, such as the camel-case naming of constants that begins with 'k'. However, I found that many existing constants are named in upper or lower snake case, so I chose to follow the existing convention. I think we should gradually unify the naming style.


// Configurable lower bound for a requested replica count; it is compared against the
// request in validate_target_max_replica_count_internal.
DSN_DEFINE_int32("meta_server",
levy5307 marked this conversation as resolved.
Show resolved Hide resolved
min_allowed_replica_count,
1,
"min allowed replica count for arbitrary number of nodes in a cluster");

// Accept only flag values in the range (0, max_allowed_replica_count].
DSN_DEFINE_validator(min_allowed_replica_count, [](int32_t value) -> bool {
    const bool is_positive = value > 0;
    const bool within_max = value <= max_allowed_replica_count;
    return is_positive && within_max;
});

static const char *lock_state = "lock";
static const char *unlock_state = "unlock";

Expand Down Expand Up @@ -1072,7 +1083,13 @@ void server_state::create_app(dsn::message_ex *msg)
opt.replica_count == exist_app.max_replica_count;
};

if (request.options.partition_count <= 0 || request.options.replica_count <= 0) {
auto level = _meta_svc->get_function_level();
if (level <= meta_function_level::fl_freezed) {
derror_f("current meta function level is freezed since there are too few alive nodes");
response.err = ERR_STATE_FREEZED;
will_create_app = false;
} else if (request.options.partition_count <= 0 ||
!validate_target_max_replica_count(request.options.replica_count)) {
response.err = ERR_INVALID_PARAMETERS;
will_create_app = false;
} else {
Expand Down Expand Up @@ -2864,5 +2881,55 @@ void server_state::clear_app_envs(const app_env_rpc &env_rpc)
new_envs.c_str());
});
}

namespace {

// Checks a requested max replica count against the limits known to the caller.
//
// The checks run in the order below; the first one that fails decides the message:
//   1. max_replica_count must not exceed max_allowed_replica_count;
//   2. max_replica_count must not be less than FLAGS_min_allowed_replica_count;
//   3. max_replica_count must not exceed alive_node_count.
//
// Returns true when every check passes, in which case hint_message is left untouched.
// Otherwise returns false after writing a human-readable reason into hint_message.
bool validate_target_max_replica_count_internal(int32_t max_replica_count,
int32_t alive_node_count,
std::string &hint_message)
{
// check 1: upper bound given by the fixed constant
if (max_replica_count > max_allowed_replica_count) {
hint_message = fmt::format("requested replica count({}) exceeds the max "
"allowed replica count({})",
max_replica_count,
max_allowed_replica_count);
return false;
}

// check 2: lower bound given by the configurable flag
if (max_replica_count < FLAGS_min_allowed_replica_count) {
hint_message = fmt::format("requested replica count({}) is less than the min "
"allowed replica count({})",
max_replica_count,
FLAGS_min_allowed_replica_count);
return false;
}

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you may don't need to split two if to log the hit_message, consider the following whether more simple:

 if (max_replica_count > max_allowed_replica_count || max_replica_count < FLAGS_min_allowed_replica_count) {
        hint_message = fmt::format("requested replica count({}) must be range within [min={} max={}] ",
                                   max_replica_count,
                                   FLAGS_min_allowed_replica_count
                                   max_allowed_replica_count);
        return false;
    }

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, I'll merge the both conditions. A range seems more specific.

// check 3: the request cannot exceed the number of alive nodes passed in
if (max_replica_count > alive_node_count) {
hint_message = fmt::format("there are not enough alive replica servers({}) "
"for the requested replica count({})",
alive_node_count,
max_replica_count);
return false;
}

return true;
}

} // anonymous namespace

// Validates a requested max replica count against the current number of alive nodes
// reported by the meta service. Logs the reason through derror_f and returns false when
// the count is rejected; returns true otherwise.
bool server_state::validate_target_max_replica_count(int32_t max_replica_count)
{
    const auto alive_nodes = static_cast<int32_t>(_meta_svc->get_alive_node_count());

    std::string reason;
    if (validate_target_max_replica_count_internal(max_replica_count, alive_nodes, reason)) {
        return true;
    }

    derror_f("target max replica count is invalid: message={}", reason);
    return false;
}

} // namespace replication
} // namespace dsn
3 changes: 3 additions & 0 deletions src/meta/server_state.h
Original file line number Diff line number Diff line change
Expand Up @@ -292,6 +292,9 @@ class server_state
void process_one_partition(std::shared_ptr<app_state> &app);
void transition_staging_state(std::shared_ptr<app_state> &app);

// check whether a max replica count is valid especially for a new app
bool validate_target_max_replica_count(int32_t max_replica_count);
acelyc111 marked this conversation as resolved.
Show resolved Hide resolved

private:
friend class bulk_load_service;
friend class bulk_load_service_test;
Expand Down
131 changes: 112 additions & 19 deletions src/meta/test/meta_app_operation_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
// under the License.

#include <gtest/gtest.h>
#include <dsn/dist/fmt_logging.h>
#include <dsn/service_api_c.h>

#include "meta_service_test_app.h"
Expand All @@ -24,17 +25,22 @@

namespace dsn {
namespace replication {

DSN_DECLARE_int32(min_allowed_replica_count);

acelyc111 marked this conversation as resolved.
Show resolved Hide resolved
class meta_app_operation_test : public meta_test_base
{
public:
meta_app_operation_test() {}

error_code
create_app_test(int32_t partition_count, int32_t replica_count, bool success_if_exist)
error_code create_app_test(int32_t partition_count,
int32_t replica_count,
bool success_if_exist,
const std::string &app_name)
{
configuration_create_app_request create_request;
configuration_create_app_response create_response;
create_request.app_name = APP_NAME;
create_request.app_name = app_name;
create_request.options.app_type = "simple_kv";
create_request.options.partition_count = partition_count;
create_request.options.replica_count = replica_count;
Expand Down Expand Up @@ -94,18 +100,46 @@ class meta_app_operation_test : public meta_test_base
app->expire_second -= 604800;
}

void clear_nodes() { _ss->_nodes.clear(); }

const std::string APP_NAME = "app_operation_test";
const std::string OLD_APP_NAME = "old_app_operation";
const int32_t PARTITION_COUNT = 4;
const int32_t REPLICA_COUNT = 3;
};

TEST_F(meta_app_operation_test, create_app)
{
// Test cases:
// Test cases: (assert min_allowed_replica_count <= max_allowed_replica_count)
// - wrong partition_count (< 0)
// - wrong partition_count (= 0)
// - wrong replica_count (< 0)
// - wrong replica_count (= 0)
// - wrong replica_count (> max_allowed_replica_count > alive_node_count)
// - wrong replica_count (> alive_node_count > max_allowed_replica_count)
// - wrong replica_count (> alive_node_count = max_allowed_replica_count)
// - wrong replica_count (= max_allowed_replica_count, and > alive_node_count)
// - wrong replica_count (< max_allowed_replica_count, and > alive_node_count)
// - wrong replica_count (= alive_node_count, and > max_allowed_replica_count)
// - wrong replica_count (< alive_node_count, and > max_allowed_replica_count)
// - valid replica_count (= max_allowed_replica_count, and = alive_node_count)
// - valid replica_count (= max_allowed_replica_count, and < alive_node_count)
// - valid replica_count (< max_allowed_replica_count, and = alive_node_count)
// - valid replica_count (< max_allowed_replica_count < alive_node_count)
// - valid replica_count (< alive_node_count < max_allowed_replica_count)
// - valid replica_count (< alive_node_count = max_allowed_replica_count)
// - wrong replica_count (< min_allowed_replica_count < alive_node_count)
// - wrong replica_count (< alive_node_count < min_allowed_replica_count)
// - wrong replica_count (< min_allowed_replica_count = alive_node_count)
// - wrong replica_count (< min_allowed_replica_count, and > alive_node_count)
// - wrong replica_count (< min_allowed_replica_count, and = alive_node_count)
// - wrong replica_count (= min_allowed_replica_count, and > alive_node_count)
// - valid replica_count (= min_allowed_replica_count, and < alive_node_count)
// - cluster freezed (alive_node_count = 0)
// - cluster freezed (alive_node_count = 1 < min_live_node_count_for_unfreeze)
// - cluster freezed (alive_node_count = 2 < min_live_node_count_for_unfreeze)
// - cluster not freezed (alive_node_count = min_live_node_count_for_unfreeze)
// - create succeed with single-replica
// - create succeed with double-replica
// - create app succeed
// - wrong partition_count
// - wrong replica_count
// - create failed with table existed
// - wrong app_status creating
// - wrong app_status recalling
Expand All @@ -114,32 +148,91 @@ TEST_F(meta_app_operation_test, create_app)
// - create succeed with success_if_exist=true
hycdong marked this conversation as resolved.
Show resolved Hide resolved
struct create_test
{
std::string app_name;
int32_t partition_count;
int32_t replica_count;
uint64_t min_live_node_count_for_unfreeze;
int alive_node_count;
int32_t min_allowed_replica_count;
bool success_if_exist;
app_status::type before_status;
error_code expected_err;
} tests[] = {
{PARTITION_COUNT, REPLICA_COUNT, false, app_status::AS_INVALID, ERR_OK},
{0, REPLICA_COUNT, false, app_status::AS_INVALID, ERR_INVALID_PARAMETERS},
{PARTITION_COUNT, 0, false, app_status::AS_INVALID, ERR_INVALID_PARAMETERS},
{PARTITION_COUNT, REPLICA_COUNT, false, app_status::AS_INVALID, ERR_INVALID_PARAMETERS},
{PARTITION_COUNT, REPLICA_COUNT, false, app_status::AS_CREATING, ERR_BUSY_CREATING},
{PARTITION_COUNT, REPLICA_COUNT, false, app_status::AS_RECALLING, ERR_BUSY_CREATING},
{PARTITION_COUNT, REPLICA_COUNT, false, app_status::AS_DROPPING, ERR_BUSY_DROPPING},
{PARTITION_COUNT, REPLICA_COUNT, false, app_status::AS_DROPPED, ERR_OK},
{PARTITION_COUNT, REPLICA_COUNT, true, app_status::AS_INVALID, ERR_OK}};
} tests[] = {{APP_NAME, -1, 3, 2, 3, 1, false, app_status::AS_INVALID, ERR_INVALID_PARAMETERS},
{APP_NAME, 0, 3, 2, 3, 1, false, app_status::AS_INVALID, ERR_INVALID_PARAMETERS},
{APP_NAME, 4, -1, 1, 3, 1, false, app_status::AS_INVALID, ERR_INVALID_PARAMETERS},
{APP_NAME, 4, 0, 1, 3, 1, false, app_status::AS_INVALID, ERR_INVALID_PARAMETERS},
{APP_NAME, 4, 16, 2, 14, 1, false, app_status::AS_INVALID, ERR_INVALID_PARAMETERS},
{APP_NAME, 4, 17, 2, 16, 1, false, app_status::AS_INVALID, ERR_INVALID_PARAMETERS},
{APP_NAME, 4, 16, 2, 15, 1, false, app_status::AS_INVALID, ERR_INVALID_PARAMETERS},
{APP_NAME, 4, 15, 2, 14, 1, false, app_status::AS_INVALID, ERR_INVALID_PARAMETERS},
{APP_NAME, 4, 14, 2, 13, 1, false, app_status::AS_INVALID, ERR_INVALID_PARAMETERS},
{APP_NAME, 4, 16, 2, 16, 1, false, app_status::AS_INVALID, ERR_INVALID_PARAMETERS},
{APP_NAME, 4, 16, 2, 17, 1, false, app_status::AS_INVALID, ERR_INVALID_PARAMETERS},
{APP_NAME + "_1", 4, 15, 2, 15, 1, false, app_status::AS_INVALID, ERR_OK},
{APP_NAME + "_2", 4, 15, 2, 16, 1, false, app_status::AS_INVALID, ERR_OK},
{APP_NAME + "_3", 4, 14, 2, 14, 1, false, app_status::AS_INVALID, ERR_OK},
{APP_NAME + "_4", 4, 14, 2, 16, 1, false, app_status::AS_INVALID, ERR_OK},
{APP_NAME + "_5", 4, 13, 2, 14, 1, false, app_status::AS_INVALID, ERR_OK},
{APP_NAME + "_6", 4, 14, 2, 15, 1, false, app_status::AS_INVALID, ERR_OK},
{APP_NAME, 4, 3, 2, 5, 4, false, app_status::AS_INVALID, ERR_INVALID_PARAMETERS},
{APP_NAME, 4, 3, 2, 4, 5, false, app_status::AS_INVALID, ERR_INVALID_PARAMETERS},
{APP_NAME, 4, 3, 2, 4, 4, false, app_status::AS_INVALID, ERR_INVALID_PARAMETERS},
{APP_NAME, 4, 3, 2, 2, 4, false, app_status::AS_INVALID, ERR_INVALID_PARAMETERS},
{APP_NAME, 4, 3, 2, 3, 4, false, app_status::AS_INVALID, ERR_INVALID_PARAMETERS},
{APP_NAME, 4, 4, 2, 3, 4, false, app_status::AS_INVALID, ERR_INVALID_PARAMETERS},
{APP_NAME + "_7", 4, 3, 2, 4, 3, false, app_status::AS_INVALID, ERR_OK},
{APP_NAME, 4, 1, 1, 0, 1, false, app_status::AS_INVALID, ERR_STATE_FREEZED},
{APP_NAME, 4, 2, 2, 1, 1, false, app_status::AS_INVALID, ERR_STATE_FREEZED},
{APP_NAME, 4, 3, 3, 2, 1, false, app_status::AS_INVALID, ERR_STATE_FREEZED},
{APP_NAME + "_8", 4, 3, 3, 3, 1, false, app_status::AS_INVALID, ERR_OK},
{APP_NAME + "_9", 4, 1, 1, 1, 1, false, app_status::AS_INVALID, ERR_OK},
{APP_NAME + "_10", 4, 2, 1, 2, 2, false, app_status::AS_INVALID, ERR_OK},
{APP_NAME, 4, 3, 2, 3, 3, false, app_status::AS_INVALID, ERR_OK},
{APP_NAME, 4, 3, 2, 3, 3, false, app_status::AS_INVALID, ERR_INVALID_PARAMETERS},
{APP_NAME, 4, 3, 2, 3, 3, false, app_status::AS_CREATING, ERR_BUSY_CREATING},
{APP_NAME, 4, 3, 2, 3, 3, false, app_status::AS_RECALLING, ERR_BUSY_CREATING},
{APP_NAME, 4, 3, 2, 3, 3, false, app_status::AS_DROPPING, ERR_BUSY_DROPPING},
{APP_NAME, 4, 3, 2, 3, 3, false, app_status::AS_DROPPED, ERR_OK},
{APP_NAME, 4, 3, 2, 3, 3, true, app_status::AS_INVALID, ERR_OK}};

clear_nodes();

const int total_node_count = 20;
std::vector<rpc_address> nodes = ensure_enough_alive_nodes(total_node_count);

// the meta function level will become freezed once
// alive_nodes * 100 < total_nodes * node_live_percentage_threshold_for_update
// even if alive_nodes >= min_live_node_count_for_unfreeze
set_node_live_percentage_threshold_for_update(0);

auto reserved_min_allowed_replica_count = FLAGS_min_allowed_replica_count;

levy5307 marked this conversation as resolved.
Show resolved Hide resolved
for (auto test : tests) {
FLAGS_min_allowed_replica_count = test.min_allowed_replica_count;
set_min_live_node_count_for_unfreeze(test.min_live_node_count_for_unfreeze);

dassert_f(total_node_count >= test.alive_node_count,
"total_node_count({}) should be >= alive_node_count({})",
total_node_count,
test.alive_node_count);
for (int i = 0; i < total_node_count - test.alive_node_count; i++) {
_ms->set_node_state({nodes[i]}, false);
}

if (test.before_status == app_status::AS_DROPPED) {
update_app_status(app_status::AS_AVAILABLE);
drop_app(APP_NAME);
} else if (test.before_status != app_status::AS_INVALID) {
update_app_status(test.before_status);
}
auto err = create_app_test(test.partition_count, test.replica_count, test.success_if_exist);
auto err = create_app_test(
test.partition_count, test.replica_count, test.success_if_exist, test.app_name);
ASSERT_EQ(err, test.expected_err);

_ms->set_node_state(nodes, true);
}

FLAGS_min_allowed_replica_count = reserved_min_allowed_replica_count;
}

TEST_F(meta_app_operation_test, drop_app)
Expand Down
2 changes: 1 addition & 1 deletion src/meta/test/meta_duplication_service_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -507,7 +507,7 @@ TEST_F(meta_duplication_service_test, remove_dup)

TEST_F(meta_duplication_service_test, duplication_sync)
{
std::vector<rpc_address> server_nodes = generate_node_list(3);
std::vector<rpc_address> server_nodes = ensure_enough_alive_nodes(3);
rpc_address node = server_nodes[0];

std::string test_app = "test_app_0";
Expand Down
Loading