From 3d4b8814b75c6a32927558b589e0e119cd84f7a6 Mon Sep 17 00:00:00 2001 From: Yingchun Lai Date: Mon, 24 Apr 2023 21:02:12 +0800 Subject: [PATCH 1/6] refactor: fs manager --- src/replica/replica.cpp | 7 +++++-- src/replica/replica_failover.cpp | 6 +++++- src/replica/replica_learn.cpp | 8 ++++++-- src/replica/replication_app_base.cpp | 6 ++++-- 4 files changed, 20 insertions(+), 7 deletions(-) diff --git a/src/replica/replica.cpp b/src/replica/replica.cpp index d2f9ccefbb..e23d389ed1 100644 --- a/src/replica/replica.cpp +++ b/src/replica/replica.cpp @@ -304,11 +304,14 @@ void replica::on_client_read(dsn::message_ex *request, bool ignore_throttling) auto storage_error = _app->on_request(request); if (dsn_unlikely(storage_error != ERR_OK)) { switch (storage_error) { - // TODO(yingchun): Now only kCorruption is dealt, consider to deal with more storage - // engine errors. + // TODO(yingchun): Now only kCorruption and kIOError is dealt, consider to deal with + // more storage engine errors. case rocksdb::Status::kCorruption: handle_local_failure(ERR_RDB_CORRUPTION); break; + case rocksdb::Status::kIOError: + handle_local_failure(ERR_DISK_IO_ERROR); + break; default: LOG_ERROR_PREFIX("client read encountered an unhandled error: {}", storage_error); } diff --git a/src/replica/replica_failover.cpp b/src/replica/replica_failover.cpp index 94edaab837..c465c221d7 100644 --- a/src/replica/replica_failover.cpp +++ b/src/replica/replica_failover.cpp @@ -33,8 +33,10 @@ * xxxx-xx-xx, author, fix bug about xxx */ +#include #include +#include "common/fs_manager.h" #include "common/replication_common.h" #include "common/replication_enums.h" #include "dsn.layer2_types.h" @@ -54,7 +56,9 @@ void replica::handle_local_failure(error_code error) { LOG_INFO_PREFIX("handle local failure error {}, status = {}", error, enum_to_string(status())); - if (error == ERR_RDB_CORRUPTION) { + if (error == ERR_DISK_IO_ERROR) { + _dir_node->status = disk_status::IO_ERROR; + } else if (error == ERR_RDB_CORRUPTION) { _data_corrupted = true; } diff --git a/src/replica/replica_learn.cpp b/src/replica/replica_learn.cpp index f65f775190..75fc9c5894 100644 --- a/src/replica/replica_learn.cpp +++ b/src/replica/replica_learn.cpp @@ -1238,8 +1238,12 @@ void replica::handle_learning_error(error_code err, bool is_local_error) err, is_local_error ? "local_error" : "remote error"); - if (is_local_error && err == ERR_RDB_CORRUPTION) { - _data_corrupted = true; + if (is_local_error) { + if (err == ERR_DISK_IO_ERROR) { + _dir_node->status = disk_status::IO_ERROR; + } else if (err == ERR_RDB_CORRUPTION) { + _data_corrupted = true; + } } _stub->_counter_replicas_learning_recent_learn_fail_count->increment(); diff --git a/src/replica/replication_app_base.cpp b/src/replica/replication_app_base.cpp index 5159fd757d..db92bdf17e 100644 --- a/src/replica/replication_app_base.cpp +++ b/src/replica/replication_app_base.cpp @@ -457,10 +457,12 @@ error_code replication_app_base::apply_mutation(const mutation *mu) // an error. if (!has_ingestion_request) { switch (storage_error) { - // TODO(yingchun): Now only kCorruption is dealt, consider to deal with more storage - // engine errors. + // TODO(yingchun): Now only kCorruption and kIOError are dealt, consider to deal with + // more storage engine errors. case rocksdb::Status::kCorruption: return ERR_RDB_CORRUPTION; + case rocksdb::Status::kIOError: + return ERR_DISK_IO_ERROR; default: return ERR_LOCAL_APP_FAILURE; } From 53f70eb0eea3efd61331f6b7f153a714ef62e2c0 Mon Sep 17 00:00:00 2001 From: Yingchun Lai Date: Thu, 8 Jun 2023 23:50:26 +0800 Subject: [PATCH 2/6] add test replica_error_test.test_auto_trash_of_corruption --- src/common/fs_manager.h | 2 +- src/replica/replica.h | 2 +- src/replica/replica_stub.cpp | 2 +- src/replica/replica_stub.h | 2 +- src/replica/test/replica_test.cpp | 46 ++++++++++++++++++++++--------- 5 files changed, 37 insertions(+), 17 deletions(-) diff --git a/src/common/fs_manager.h b/src/common/fs_manager.h index 1073669878..65fb0243ac 100644 --- a/src/common/fs_manager.h +++ b/src/common/fs_manager.h @@ -175,7 +175,7 @@ class fs_manager FRIEND_TEST(fs_manager, find_best_dir_for_new_replica); FRIEND_TEST(fs_manager, get_dir_node); FRIEND_TEST(open_replica_test, open_replica_add_decree_and_ballot_check); - FRIEND_TEST(replica_test, test_auto_trash); + FRIEND_TEST(replica_error_test, test_auto_trash_of_corruption); }; } // replication } // dsn diff --git a/src/replica/replica.h b/src/replica/replica.h index 3c1f1068b6..6cdb7fa9c4 100644 --- a/src/replica/replica.h +++ b/src/replica/replica.h @@ -554,7 +554,7 @@ class replica : public serverlet, public ref_counter, public replica_ba friend class replica_follower; friend class ::pegasus::server::pegasus_server_test_base; friend class ::pegasus::server::rocksdb_wrapper_test; - FRIEND_TEST(replica_test, test_auto_trash); + FRIEND_TEST(replica_error_test, test_auto_trash_of_corruption); // replica configuration, updated by update_local_configuration ONLY replica_configuration _config; diff --git a/src/replica/replica_stub.cpp b/src/replica/replica_stub.cpp index a3e16500a7..3bd061b838 100644 --- a/src/replica/replica_stub.cpp +++ b/src/replica/replica_stub.cpp @@ -2372,8 +2372,8 @@ void replica_stub::close_replica(replica_ptr r) _counter_replicas_closing_count->decrement(); } + _fs_manager.remove_replica(id); if (r->is_data_corrupted()) { - _fs_manager.remove_replica(id); move_to_err_path(r->dir(), "trash replica"); _counter_replicas_recent_replica_move_error_count->increment(); } diff --git a/src/replica/replica_stub.h b/src/replica/replica_stub.h index c6aa984f9b..b6f54c4850 100644 --- a/src/replica/replica_stub.h +++ b/src/replica/replica_stub.h @@ -417,8 +417,8 @@ class replica_stub : public serverlet, public ref_counter friend class replica_follower_test; friend class replica_http_service_test; FRIEND_TEST(open_replica_test, open_replica_add_decree_and_ballot_check); + FRIEND_TEST(replica_error_test, test_auto_trash_of_corruption); FRIEND_TEST(replica_test, test_clear_on_failure); - FRIEND_TEST(replica_test, test_auto_trash); typedef std::unordered_map opening_replicas; typedef std::unordered_map> diff --git a/src/replica/test/replica_test.cpp b/src/replica/test/replica_test.cpp index 45ca0eab3c..2bbff197ba 100644 --- a/src/replica/test/replica_test.cpp +++ b/src/replica/test/replica_test.cpp @@ -495,8 +495,20 @@ TEST_F(replica_test, test_clear_on_failure) ASSERT_FALSE(has_gpid(_pid)); } -TEST_F(replica_test, test_auto_trash) +class replica_error_test : public replica_test, public testing::WithParamInterface { +}; + +INSTANTIATE_TEST_CASE_P(, + replica_error_test, + ::testing::Values(ERR_RDB_CORRUPTION, ERR_DISK_IO_ERROR)); + +TEST_P(replica_error_test, test_auto_trash_of_corruption) +{ + const auto ec = GetParam(); + // The replica path will only be moved to error path when encounter ERR_RDB_CORRUPTION error. + bool moved_to_err_path = (ec == ERR_RDB_CORRUPTION); + // Clear up the remaining state. auto *dn = stub->get_fs_manager()->find_replica_dir(_app_info.app_type, _pid); if (dn != nullptr) { @@ -509,33 +521,41 @@ TEST_F(replica_test, test_auto_trash) replica *rep = stub->generate_replica(_app_info, _pid, partition_status::PS_PRIMARY, 1, false, true); - auto path = rep->dir(); + auto original_replica_path = rep->dir(); ASSERT_TRUE(has_gpid(_pid)); - rep->handle_local_failure(ERR_RDB_CORRUPTION); + rep->handle_local_failure(ec); stub->wait_closing_replicas_finished(); - ASSERT_FALSE(dsn::utils::filesystem::path_exists(path)); - dn = stub->get_fs_manager()->get_dir_node(path); + ASSERT_EQ(!moved_to_err_path, dsn::utils::filesystem::path_exists(original_replica_path)); + dn = stub->get_fs_manager()->get_dir_node(original_replica_path); ASSERT_NE(dn, nullptr); std::vector subs; ASSERT_TRUE(dsn::utils::filesystem::get_subdirectories(dn->full_dir, subs, false)); - bool found = false; + bool found_err_path = false; + std::string err_path; const int ts_length = 16; - size_t err_pos = path.size() + ts_length + 1; // Add 1 for dot in path. + size_t err_pos = original_replica_path.size() + ts_length + 1; // Add 1 for dot in path. for (const auto &sub : subs) { - if (sub.size() <= path.size()) { + if (sub.size() <= original_replica_path.size()) { continue; } uint64_t ts = 0; - if (sub.find(path) == 0 && sub.find(kFolderSuffixErr) == err_pos && - dsn::buf2uint64(sub.substr(path.size() + 1, ts_length), ts)) { - found = true; + if (sub.find(original_replica_path) == 0 && sub.find(kFolderSuffixErr) == err_pos && + dsn::buf2uint64(sub.substr(original_replica_path.size() + 1, ts_length), ts)) { + err_path = sub; + ASSERT_GT(ts, 0); + found_err_path = true; break; } } - ASSERT_TRUE(found); - ASSERT_FALSE(has_gpid(_pid)); + ASSERT_EQ(moved_to_err_path, found_err_path); + ASSERT_EQ(has_gpid(_pid)); + + // It's safe to cleanup the .err path after been found. + if (!err_path.empty()) { + dsn::utils::filesystem::remove_path(err_path); + } } TEST_F(replica_test, update_deny_client_test) From 887e5ccefc82caa09bafc63ed100734f6870a26a Mon Sep 17 00:00:00 2001 From: Yingchun Lai Date: Fri, 9 Jun 2023 00:32:01 +0800 Subject: [PATCH 3/6] fix --- src/replica/test/replica_test.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/replica/test/replica_test.cpp b/src/replica/test/replica_test.cpp index 2bbff197ba..d7c40fc386 100644 --- a/src/replica/test/replica_test.cpp +++ b/src/replica/test/replica_test.cpp @@ -16,6 +16,7 @@ // under the License. #include +#include // IWYU pragma: no_include // IWYU pragma: no_include #include @@ -550,7 +551,7 @@ TEST_P(replica_error_test, test_auto_trash_of_corruption) } } ASSERT_EQ(moved_to_err_path, found_err_path); - ASSERT_EQ(has_gpid(_pid)); + ASSERT_FALSE(has_gpid(_pid)); // It's safe to cleanup the .err path after been found. if (!err_path.empty()) { From 9a4e7e4347997f315b39ecdb4ede647194e54b77 Mon Sep 17 00:00:00 2001 From: Yingchun Lai Date: Sun, 11 Jun 2023 17:21:18 +0800 Subject: [PATCH 4/6] add ut --- src/replica/replica.h | 1 + src/replica/replica_stub.cpp | 2 +- src/replica/test/CMakeLists.txt | 3 ++- src/replica/test/replica_disk_test.cpp | 27 ++++++++++++++++++++++++++ src/replica/test/replica_test.cpp | 4 ++++ 5 files changed, 35 insertions(+), 2 deletions(-) diff --git a/src/replica/replica.h b/src/replica/replica.h index 6cdb7fa9c4..3933784da0 100644 --- a/src/replica/replica.h +++ b/src/replica/replica.h @@ -554,6 +554,7 @@ class replica : public serverlet, public ref_counter, public replica_ba friend class replica_follower; friend class ::pegasus::server::pegasus_server_test_base; friend class ::pegasus::server::rocksdb_wrapper_test; + FRIEND_TEST(replica_disk_test, disk_io_error_test); FRIEND_TEST(replica_error_test, test_auto_trash_of_corruption); // replica configuration, updated by update_local_configuration ONLY diff --git a/src/replica/replica_stub.cpp b/src/replica/replica_stub.cpp index 3bd061b838..f828585562 100644 --- a/src/replica/replica_stub.cpp +++ b/src/replica/replica_stub.cpp @@ -2140,7 +2140,7 @@ void replica_stub::open_replica( } if (rep == nullptr) { - LOG_INFO( + LOG_WARNING( "{}@{}: open replica failed, erase from opening replicas", id, _primary_address_str); zauto_write_lock l(_replicas_lock); CHECK_GT_MSG(_opening_replicas.erase(id), diff --git a/src/replica/test/CMakeLists.txt b/src/replica/test/CMakeLists.txt index 0a34a0c7c0..3b82d28e12 100644 --- a/src/replica/test/CMakeLists.txt +++ b/src/replica/test/CMakeLists.txt @@ -46,7 +46,8 @@ set(MY_PROJ_LIBS dsn_meta_server dsn_runtime zookeeper hashtable - gtest) + gtest + test_utils) set(MY_BOOST_LIBS Boost::system Boost::filesystem Boost::regex) diff --git a/src/replica/test/replica_disk_test.cpp b/src/replica/test/replica_disk_test.cpp index 11b1399ac6..a6f0bb9a66 100644 --- a/src/replica/test/replica_disk_test.cpp +++ b/src/replica/test/replica_disk_test.cpp @@ -46,9 +46,13 @@ #include "utils/error_code.h" #include "utils/filesystem.h" #include "utils/fmt_logging.h" +#include "test_util/test_util.h" + +using pegasus::AssertEventually; namespace dsn { namespace replication { +DSN_DECLARE_bool(fd_disabled); using query_disk_info_rpc = rpc_holder; @@ -286,5 +290,28 @@ TEST_F(replica_disk_test, add_new_disk_test) } } +TEST_F(replica_disk_test, disk_io_error_test) +{ + // Disable failure detector to avoid connecting with meta server which is not started. + FLAGS_fd_disabled = true; + + gpid test_pid(app_info_1.app_id, 0); + const auto rep = stub->get_replica(test_pid); + auto *old_dn = rep->get_dir_node(); + + rep->handle_local_failure(ERR_DISK_IO_ERROR); + ASSERT_EVENTUALLY([&] { ASSERT_TRUE(!old_dn->has(test_pid)); }); + + // The replica will not be located on the old dir_node. + auto *new_dn = stub->get_fs_manager()->find_best_dir_for_new_replica(test_pid); + ASSERT_NE(old_dn, new_dn); + + // The replicas will not be located on the old dir_node. + for (int i = 0; i < 16; i++) { + new_dn = stub->get_fs_manager()->find_best_dir_for_new_replica(gpid(3, i)); + ASSERT_NE(old_dn, new_dn); + } +} + } // namespace replication } // namespace dsn diff --git a/src/replica/test/replica_test.cpp b/src/replica/test/replica_test.cpp index d7c40fc386..d19e775625 100644 --- a/src/replica/test/replica_test.cpp +++ b/src/replica/test/replica_test.cpp @@ -552,6 +552,10 @@ TEST_P(replica_error_test, test_auto_trash_of_corruption) } ASSERT_EQ(moved_to_err_path, found_err_path); ASSERT_FALSE(has_gpid(_pid)); + ASSERT_EQ(moved_to_err_path, dn->status == disk_status::NORMAL) << moved_to_err_path << ", " + << enum_to_string(dn->status); + ASSERT_EQ(!moved_to_err_path, dn->status == disk_status::IO_ERROR) + << moved_to_err_path << ", " << enum_to_string(dn->status); // It's safe to cleanup the .err path after been found. if (!err_path.empty()) { From b67ded4b19c0930f071fc666a9eb5847a5dd05e0 Mon Sep 17 00:00:00 2001 From: Yingchun Lai Date: Sun, 11 Jun 2023 17:38:44 +0800 Subject: [PATCH 5/6] iwyu --- src/replica/test/replica_disk_test.cpp | 3 ++- src/replica/test/replica_test.cpp | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/replica/test/replica_disk_test.cpp b/src/replica/test/replica_disk_test.cpp index a6f0bb9a66..9793913013 100644 --- a/src/replica/test/replica_disk_test.cpp +++ b/src/replica/test/replica_disk_test.cpp @@ -42,11 +42,12 @@ #include "replica_admin_types.h" #include "replica_disk_test_base.h" #include "runtime/rpc/rpc_holder.h" +#include "test_util/test_util.h" #include "utils/autoref_ptr.h" #include "utils/error_code.h" #include "utils/filesystem.h" +#include "utils/flags.h" #include "utils/fmt_logging.h" -#include "test_util/test_util.h" using pegasus::AssertEventually; diff --git a/src/replica/test/replica_test.cpp b/src/replica/test/replica_test.cpp index d19e775625..f1ff0acc9d 100644 --- a/src/replica/test/replica_test.cpp +++ b/src/replica/test/replica_test.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -37,6 +38,7 @@ #include "common/gpid.h" #include "common/replica_envs.h" #include "common/replication.codes.h" +#include "common/replication_enums.h" #include "common/replication_other_types.h" #include "consensus_types.h" #include "dsn.layer2_types.h" From 7c8ebcc616ca1b811ad00bf2f38da01ba6f84f78 Mon Sep 17 00:00:00 2001 From: Yingchun Lai Date: Wed, 14 Jun 2023 12:31:36 +0800 Subject: [PATCH 6/6] cr1 --- src/replica/replica.cpp | 2 +- src/replica/test/replica_disk_test.cpp | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/replica/replica.cpp b/src/replica/replica.cpp index e23d389ed1..be2a0177b5 100644 --- a/src/replica/replica.cpp +++ b/src/replica/replica.cpp @@ -304,7 +304,7 @@ void replica::on_client_read(dsn::message_ex *request, bool ignore_throttling) auto storage_error = _app->on_request(request); if (dsn_unlikely(storage_error != ERR_OK)) { switch (storage_error) { - // TODO(yingchun): Now only kCorruption and kIOError is dealt, consider to deal with + // TODO(yingchun): Now only kCorruption and kIOError are dealt, consider to deal with // more storage engine errors. case rocksdb::Status::kCorruption: handle_local_failure(ERR_RDB_CORRUPTION); diff --git a/src/replica/test/replica_disk_test.cpp b/src/replica/test/replica_disk_test.cpp index 9793913013..2ecef5adc7 100644 --- a/src/replica/test/replica_disk_test.cpp +++ b/src/replica/test/replica_disk_test.cpp @@ -308,8 +308,11 @@ TEST_F(replica_disk_test, disk_io_error_test) ASSERT_NE(old_dn, new_dn); // The replicas will not be located on the old dir_node. + const int kNewAppId = 3; + // Make sure the app with id 'kNewAppId' is not existed. + ASSERT_EQ(nullptr, stub->get_replica(gpid(kNewAppId, 0))); for (int i = 0; i < 16; i++) { - new_dn = stub->get_fs_manager()->find_best_dir_for_new_replica(gpid(3, i)); + new_dn = stub->get_fs_manager()->find_best_dir_for_new_replica(gpid(kNewAppId, i)); ASSERT_NE(old_dn, new_dn); } }