From 89cd12b5d472fd020b2d217f35e12411b77f1ad3 Mon Sep 17 00:00:00 2001 From: zhao liwei Date: Tue, 31 Mar 2020 12:21:50 +0800 Subject: [PATCH] refactor(backup): delay the removal of checkpoint files produced by cold backup (#430) --- .../replication/common/replication_common.cpp | 8 +++++ .../replication/common/replication_common.h | 1 + src/dist/replication/lib/replica.h | 1 + src/dist/replication/lib/replica_backup.cpp | 27 +++++++++++------ .../meta_server/meta_backup_service.cpp | 15 ++++++++++ .../test/meta_test/unit_test/backup_test.cpp | 29 +++++++++++++++++-- .../unit_test/meta_http_service_test.cpp | 10 +++---- 7 files changed, 74 insertions(+), 17 deletions(-) diff --git a/src/dist/replication/common/replication_common.cpp b/src/dist/replication/common/replication_common.cpp index a7d64eec2a..469486100f 100644 --- a/src/dist/replication/common/replication_common.cpp +++ b/src/dist/replication/common/replication_common.cpp @@ -108,6 +108,8 @@ replication_options::replication_options() learn_app_max_concurrent_count = 5; max_concurrent_uploading_file_count = 10; + + cold_backup_checkpoint_reserve_minutes = 10; } replication_options::~replication_options() {} @@ -506,6 +508,12 @@ void replication_options::initialize() max_concurrent_uploading_file_count, "concurrent uploading file count"); + cold_backup_checkpoint_reserve_minutes = + (int)dsn_config_get_value_uint64("replication", + "cold_backup_checkpoint_reserve_minutes", + cold_backup_checkpoint_reserve_minutes, + "reserve minutes of cold backup checkpoint"); + replica_helper::load_meta_servers(meta_servers); sanity_check(); diff --git a/src/dist/replication/common/replication_common.h b/src/dist/replication/common/replication_common.h index d29f5419ca..4b379be83d 100644 --- a/src/dist/replication/common/replication_common.h +++ b/src/dist/replication/common/replication_common.h @@ -112,6 +112,7 @@ class replication_options std::string cold_backup_root; int32_t max_concurrent_uploading_file_count; + int32_t cold_backup_checkpoint_reserve_minutes; public: replication_options(); diff --git a/src/dist/replication/lib/replica.h b/src/dist/replication/lib/replica.h index dc423db979..e23ee99b64 100644 --- a/src/dist/replication/lib/replica.h +++ b/src/dist/replication/lib/replica.h @@ -311,6 +311,7 @@ class replica : public serverlet, public ref_counter, public replica_ba ///////////////////////////////////////////////////////////////// // cold backup void clear_backup_checkpoint(const std::string &policy_name); + void background_clear_backup_checkpoint(const std::string &policy_name); void generate_backup_checkpoint(cold_backup_context_ptr backup_context); void trigger_async_checkpoint_for_backup(cold_backup_context_ptr backup_context); void wait_async_checkpoint_for_backup(cold_backup_context_ptr backup_context); diff --git a/src/dist/replication/lib/replica_backup.cpp b/src/dist/replication/lib/replica_backup.cpp index c7b1c29015..c58765408b 100644 --- a/src/dist/replication/lib/replica_backup.cpp +++ b/src/dist/replication/lib/replica_backup.cpp @@ -42,10 +42,9 @@ void replica::on_cold_backup(const backup_request &request, /*out*/ backup_respo // send clear request to secondaries send_backup_request_to_secondary(request); } + // clear local checkpoint dirs in background thread - tasking::enqueue(LPC_BACKGROUND_COLD_BACKUP, &_tracker, [this, policy_name]() { - clear_backup_checkpoint(policy_name); - }); + background_clear_backup_checkpoint(policy_name); return; } @@ -106,10 +105,9 @@ void replica::on_cold_backup(const backup_request &request, /*out*/ backup_respo // send clear request to secondaries send_backup_request_to_secondary(request); } + // clear local checkpoint dirs in background thread - tasking::enqueue(LPC_BACKGROUND_COLD_BACKUP, &_tracker, [this, policy_name]() { - clear_backup_checkpoint(policy_name); - }); + background_clear_backup_checkpoint(policy_name); } } return; @@ -194,10 +192,9 @@ void replica::on_cold_backup(const backup_request &request, /*out*/ backup_respo backup_request new_request = request; new_request.backup_id = 0; send_backup_request_to_secondary(new_request); + // clear local checkpoint dirs in background thread - tasking::enqueue(LPC_BACKGROUND_COLD_BACKUP, &_tracker, [this, policy_name]() { - clear_backup_checkpoint(policy_name); - }); + background_clear_backup_checkpoint(policy_name); response.err = ERR_OK; } else { dwarn( @@ -421,6 +418,18 @@ static bool backup_parse_dir_name(const char *name, } } +void replica::background_clear_backup_checkpoint(const std::string &policy_name) +{ + ddebug_replica("schedule to clear all checkpoint dirs of policy({}) in {} minutes", + policy_name, + options()->cold_backup_checkpoint_reserve_minutes); + tasking::enqueue(LPC_BACKGROUND_COLD_BACKUP, + &_tracker, + [this, policy_name]() { clear_backup_checkpoint(policy_name); }, + get_gpid().thread_hash(), + std::chrono::minutes(options()->cold_backup_checkpoint_reserve_minutes)); +} + // clear all checkpoint dirs of the policy void replica::clear_backup_checkpoint(const std::string &policy_name) { diff --git a/src/dist/replication/meta_server/meta_backup_service.cpp b/src/dist/replication/meta_server/meta_backup_service.cpp index 233d424652..e8d7878547 100644 --- a/src/dist/replication/meta_server/meta_backup_service.cpp +++ b/src/dist/replication/meta_server/meta_backup_service.cpp @@ -1204,6 +1204,7 @@ void backup_service::start() start_create_policy_meta_root(after_create_policy_meta_root); } +// TODO(zhaoliwei) refactor function add_backup_policy void backup_service::add_backup_policy(dsn::message_ex *msg) { configuration_add_backup_policy_request request; @@ -1212,6 +1213,20 @@ void backup_service::add_backup_policy(dsn::message_ex *msg) ::dsn::unmarshall(msg, request); std::set app_ids; std::map app_names; + + // The backup interval must be greater than checkpoint reserve time. + // Or the next cold backup checkpoint may be cleared by the clear operation. + if (request.backup_interval_seconds <= + _meta_svc->get_options().cold_backup_checkpoint_reserve_minutes * 60) { + response.err = ERR_INVALID_PARAMETERS; + response.hint_message = fmt::format( + "backup interval must be greater than cold_backup_checkpoint_reserve_minutes={}", + _meta_svc->get_options().cold_backup_checkpoint_reserve_minutes); + _meta_svc->reply_data(msg, response); + msg->release_ref(); + return; + } + { // check app status zauto_read_lock l; diff --git a/src/dist/replication/test/meta_test/unit_test/backup_test.cpp b/src/dist/replication/test/meta_test/unit_test/backup_test.cpp index 1f84dd7e21..6b603a760e 100644 --- a/src/dist/replication/test/meta_test/unit_test/backup_test.cpp +++ b/src/dist/replication/test/meta_test/unit_test/backup_test.cpp @@ -672,7 +672,7 @@ void meta_service_test_app::backup_service_test() req.backup_provider_type = std::string("local_service"); req.policy_name = test_policy_name; req.app_ids = {1, 2, 3}; - req.backup_interval_seconds = 10; + req.backup_interval_seconds = 24 * 60 * 60; // case1: backup policy don't contain invalid app_id // result: backup policy will not be added, and return ERR_INVALID_PARAMETERS @@ -687,7 +687,30 @@ void meta_service_test_app::backup_service_test() ASSERT_TRUE(resp.err == ERR_INVALID_PARAMETERS); } - // case2: backup policy contains valid app_id + // case2: backup policy interval time < checkpoint reserve time + // result: backup policy will not be added, and return ERR_INVALID_PARAMETERS + { + int64_t old_backup_interval_seconds = req.backup_interval_seconds; + req.backup_interval_seconds = 10; + configuration_add_backup_policy_response resp; + server_state *state = meta_svc->get_server_state(); + state->_all_apps.insert(std::make_pair(1, std::make_shared(app_info()))); + auto r = fake_rpc_call(RPC_CM_ADD_BACKUP_POLICY, + LPC_DEFAULT_CALLBACK, + backup_svc, + &backup_service::add_backup_policy, + req); + fake_wait_rpc(r, resp); + + std::string hint_message = fmt::format( + "backup interval must be greater than cold_backup_checkpoint_reserve_minutes={}", + meta_svc->get_options().cold_backup_checkpoint_reserve_minutes); + ASSERT_TRUE(resp.err == ERR_INVALID_PARAMETERS); + ASSERT_TRUE(resp.hint_message == hint_message); + req.backup_interval_seconds = old_backup_interval_seconds; + } + + // case3: backup policy contains valid app_id // result: add backup policy succeed { configuration_add_backup_policy_response resp; @@ -720,7 +743,7 @@ void meta_service_test_app::backup_service_test() const policy &p = backup_svc->_policy_states.at(test_policy_name)->get_policy(); ASSERT_TRUE(p.app_ids.size() == 1 && p.app_ids.count(1) == 1); ASSERT_TRUE(p.backup_provider_type == std::string("local_service")); - ASSERT_TRUE(p.backup_interval_seconds == 10); + ASSERT_TRUE(p.backup_interval_seconds == 24 * 60 * 60); ASSERT_TRUE(p.policy_name == test_policy_name); } } diff --git a/src/dist/replication/test/meta_test/unit_test/meta_http_service_test.cpp b/src/dist/replication/test/meta_test/unit_test/meta_http_service_test.cpp index d81427d3d8..213cd9bf05 100644 --- a/src/dist/replication/test/meta_test/unit_test/meta_http_service_test.cpp +++ b/src/dist/replication/test/meta_test/unit_test/meta_http_service_test.cpp @@ -100,7 +100,7 @@ class meta_backup_test_base : public meta_test_base request.policy_name = policy_name; request.backup_provider_type = "local_service"; - request.backup_interval_seconds = 1; + request.backup_interval_seconds = 24 * 60 * 60; request.backup_history_count_to_keep = 1; request.start_time = "12:00"; request.app_ids.clear(); @@ -148,19 +148,19 @@ TEST_F(meta_backup_test_base, get_backup_policy) {"", "{}\n", http_status_code::ok}, {"TEST1", "{\"TEST1\":{\"name\":\"TEST1\",\"backup_provider_type\":\"local_service\"," - "\"backup_interval\":\"1\",\"app_ids\":\"[2]\",\"start_time\":\"12:00\"," + "\"backup_interval\":\"86400\",\"app_ids\":\"[2]\",\"start_time\":\"12:00\"," "\"status\":\"enabled\",\"backup_history_count\":\"1\"}}\n", http_status_code::ok}, {"TEST2", "{\"TEST2\":{\"name\":\"TEST2\",\"backup_provider_type\":\"local_service\"," - "\"backup_interval\":\"1\",\"app_ids\":\"[2]\",\"start_time\":\"12:00\"," + "\"backup_interval\":\"86400\",\"app_ids\":\"[2]\",\"start_time\":\"12:00\"," "\"status\":\"enabled\",\"backup_history_count\":\"1\"}}\n", http_status_code::ok}, {"", "{\"TEST1\":{\"name\":\"TEST1\",\"backup_provider_type\":\"local_service\",\"backup_" - "interval\":\"1\",\"app_ids\":\"[2]\",\"start_time\":\"12:00\",\"status\":\"enabled\"," + "interval\":\"86400\",\"app_ids\":\"[2]\",\"start_time\":\"12:00\",\"status\":\"enabled\"," "\"backup_history_count\":\"1\"},\"TEST2\":{\"name\":\"TEST2\",\"backup_provider_" - "type\":\"local_service\",\"backup_interval\":\"1\",\"app_ids\":\"[2]\",\"start_" + "type\":\"local_service\",\"backup_interval\":\"86400\",\"app_ids\":\"[2]\",\"start_" "time\":\"12:00\",\"status\":\"enabled\",\"backup_history_count\":\"1\"}}\n", http_status_code::ok}, {"TEST3", "{}\n", http_status_code::ok},