From c5522852ef9605067897609314c944800ef0c6ab Mon Sep 17 00:00:00 2001 From: heyuchen Date: Sat, 8 May 2021 18:45:13 +0800 Subject: [PATCH 1/5] feat: add broken disk check while initialize --- include/dsn/utility/filesystem.h | 10 +++ src/common/fs_manager.cpp | 1 + src/common/fs_manager.h | 7 +++ src/replica/replica_stub.cpp | 76 ++++++++++++++--------- src/replica/replica_stub.h | 2 + src/replica/test/replica_disk_test.cpp | 18 ++++++ src/replica/test/replica_disk_test_base.h | 15 +++++ src/utils/filesystem.cpp | 73 ++++++++++++++++++++++ 8 files changed, 174 insertions(+), 28 deletions(-) diff --git a/include/dsn/utility/filesystem.h b/include/dsn/utility/filesystem.h index 66f735e80a..00b699f18b 100644 --- a/include/dsn/utility/filesystem.h +++ b/include/dsn/utility/filesystem.h @@ -136,6 +136,16 @@ bool verify_file(const std::string &fname, const std::string &expected_md5, const int64_t &expected_fsize); +// create driectory and get absolute path +bool create_directory(const std::string &path, + /*out*/ std::string &absolute_path, + /*out*/ std::string &err_msg); + +bool write_file(const std::string &fname, std::string &buf); + +// check if directory is readable and writable +bool check_dir_rw(const std::string &path, /*out*/ std::string &err_msg); + } // namespace filesystem } // namespace utils } // namespace dsn diff --git a/src/common/fs_manager.cpp b/src/common/fs_manager.cpp index 75114bdd36..56b45e38f2 100644 --- a/src/common/fs_manager.cpp +++ b/src/common/fs_manager.cpp @@ -156,6 +156,7 @@ dsn::error_code fs_manager::initialize(const std::vector &data_dirs norm_path.c_str(), tags[i].c_str()); } + _available_data_dirs = data_dirs; if (!for_test) { update_disk_stat(); diff --git a/src/common/fs_manager.h b/src/common/fs_manager.h index 9b698785a9..f23fdb9cca 100644 --- a/src/common/fs_manager.h +++ b/src/common/fs_manager.h @@ -79,6 +79,12 @@ class fs_manager bool for_each_dir_node(const std::function &func) const; void update_disk_stat(); + const std::vector &get_available_data_dirs() const + { + zauto_read_lock l(_lock); + return _available_data_dirs; + } + private: void reset_disk_stat() { @@ -102,6 +108,7 @@ class fs_manager int _max_available_ratio = 0; std::vector> _dir_nodes; + std::vector _available_data_dirs; perf_counter_wrapper _counter_total_capacity_mb; perf_counter_wrapper _counter_total_available_mb; diff --git a/src/replica/replica_stub.cpp b/src/replica/replica_stub.cpp index b847d16e81..5845670086 100644 --- a/src/replica/replica_stub.cpp +++ b/src/replica/replica_stub.cpp @@ -64,6 +64,12 @@ namespace dsn { namespace replication { +DSN_DEFINE_bool("replication", + ignore_broken_disk, + true, + "true means ignore broken data disk when initialize"); +DSN_TAG_VARIABLE(ignore_broken_disk, FT_MUTABLE); + bool replica_stub::s_not_exit_on_log_failure = false; replica_stub::replica_stub(replica_state_subscriber subscriber /*= nullptr*/, @@ -503,33 +509,13 @@ void replica_stub::initialize(const replication_options &opts, bool clear /* = f } // init dirs - if (!dsn::utils::filesystem::create_directory(_options.slog_dir)) { - dassert(false, "Fail to create directory %s.", _options.slog_dir.c_str()); - } std::string cdir; - if (!dsn::utils::filesystem::get_absolute_path(_options.slog_dir, cdir)) { - dassert(false, "Fail to get absolute path from %s.", _options.slog_dir.c_str()); + std::string err_msg; + if (!dsn::utils::filesystem::create_directory(_options.slog_dir, cdir, err_msg)) { + dassert(false, "{}", err_msg); } _options.slog_dir = cdir; - int count = 0; - for (auto &dir : _options.data_dirs) { - if (!dsn::utils::filesystem::create_directory(dir)) { - dassert(false, "Fail to create directory %s.", dir.c_str()); - } - std::string cdir; - if (!dsn::utils::filesystem::get_absolute_path(dir, cdir)) { - dassert(false, "Fail to get absolute path from %s.", dir.c_str()); - } - dir = cdir; - ddebug("data_dirs[%d] = %s", count, dir.c_str()); - count++; - } - - { - dsn::error_code err; - err = _fs_manager.initialize(_options.data_dirs, _options.data_dir_tags, false); - dassert(err == dsn::ERR_OK, "initialize fs manager failed, err(%s)", err.to_string()); - } + initialize_fs_manager(_options.data_dirs, _options.data_dir_tags); _log = new mutation_log_shared(_options.slog_dir, _options.log_shared_file_size_mb, @@ -541,7 +527,7 @@ void replica_stub::initialize(const replication_options &opts, bool clear /* = f ddebug("start to load replicas"); std::vector dir_list; - for (auto &dir : _options.data_dirs) { + for (auto &dir : _fs_manager.get_available_data_dirs()) { std::vector tmp_list; if (!dsn::utils::filesystem::get_subdirectories(dir, tmp_list, false)) { dassert(false, "Fail to get subdirectories in %s.", dir.c_str()); @@ -789,6 +775,40 @@ void replica_stub::initialize(const replication_options &opts, bool clear /* = f } } +void replica_stub::initialize_fs_manager(std::vector &data_dirs, + std::vector &data_dir_tags) +{ + std::string cdir; + std::string err_msg; + int count = 0; + std::vector available_dirs; + std::vector available_dir_tags; + for (int i = 0; i < data_dir_tags.size(); ++i) { + std::string &dir = data_dirs[i]; + bool flag = dsn::utils::filesystem::create_directory(dir, cdir, err_msg); + if (flag) { + flag = dsn::utils::filesystem::check_dir_rw(dir, err_msg); + } + if (!flag) { + if (FLAGS_ignore_broken_disk) { + dwarn_f("data dir[{}] is broken, ignore it, error:{}", dir, err_msg); + } else { + dassert_f(false, "{}", err_msg); + } + continue; + } + ddebug_f("data_dirs[{}] = {}", count, cdir); + available_dirs.emplace_back(cdir); + available_dir_tags.emplace_back(data_dir_tags[i]); + count++; + } + + dassert_f(available_dirs.size() > 0, + "initialize fs manager failed, no available data directory"); + error_code err = _fs_manager.initialize(available_dirs, available_dir_tags, false); + dassert_f(err == dsn::ERR_OK, "initialize fs manager failed, err({})", err); +} + void replica_stub::initialize_start() { // start timer for configuration sync @@ -1860,7 +1880,7 @@ void replica_stub::on_disk_stat() uint64_t start = dsn_now_ns(); disk_cleaning_report report{}; - dsn::replication::disk_remove_useless_dirs(_options.data_dirs, report); + dsn::replication::disk_remove_useless_dirs(_fs_manager.get_available_data_dirs(), report); _fs_manager.update_disk_stat(); update_disk_holding_replicas(); @@ -2555,7 +2575,7 @@ std::string replica_stub::get_replica_dir(const char *app_type, gpid id, bool cr std::string gpid_str = fmt::format("{}.{}", id, app_type); std::string replica_dir; bool is_dir_exist = false; - for (const std::string &data_dir : _options.data_dirs) { + for (const std::string &data_dir : _fs_manager.get_available_data_dirs()) { std::string dir = utils::filesystem::path_combine(data_dir, gpid_str); if (utils::filesystem::directory_exists(dir)) { if (is_dir_exist) { @@ -2577,7 +2597,7 @@ replica_stub::get_child_dir(const char *app_type, gpid child_pid, const std::str { std::string gpid_str = fmt::format("{}.{}", child_pid.to_string(), app_type); std::string child_dir; - for (const std::string &data_dir : _options.data_dirs) { + for (const std::string &data_dir : _fs_manager.get_available_data_dirs()) { std::string dir = utils::filesystem::path_combine(data_dir, gpid_str); // = /. // check if 's is equal to diff --git a/src/replica/replica_stub.h b/src/replica/replica_stub.h index ca0efcd25a..a48d68e0c4 100644 --- a/src/replica/replica_stub.h +++ b/src/replica/replica_stub.h @@ -97,6 +97,8 @@ class replica_stub : public serverlet, public ref_counter // void initialize(const replication_options &opts, bool clear = false); void initialize(bool clear = false); + void initialize_fs_manager(std::vector &data_dirs, + std::vector &data_dir_tags); void set_options(const replication_options &opts) { _options = opts; } void open_service(); void close(); diff --git a/src/replica/test/replica_disk_test.cpp b/src/replica/test/replica_disk_test.cpp index e76a089d53..11879d32d2 100644 --- a/src/replica/test/replica_disk_test.cpp +++ b/src/replica/test/replica_disk_test.cpp @@ -198,5 +198,23 @@ TEST_F(replica_disk_test, gc_disk_useless_dir) ASSERT_EQ(report.error_replica_count, 2); } +TEST_F(replica_disk_test, broken_disk_test) +{ + // Test cases: + // create: true, check_rw: true + // create: true, check_rw: false + // create: false + struct broken_disk_test + { + std::string mock_create_dir; + std::string mock_rw_flag; + int32_t data_dir_size; + } tests[]{{"true", "true", 3}, {"true", "false", 2}, {"false", "false", 2}}; + for (const auto &test : tests) { + ASSERT_EQ(test.data_dir_size, + ignore_broken_disk_test(test.mock_create_dir, test.mock_rw_flag)); + } +} + } // namespace replication } // namespace dsn diff --git a/src/replica/test/replica_disk_test_base.h b/src/replica/test/replica_disk_test_base.h index 2da3e1d4eb..237c32a3dd 100644 --- a/src/replica/test/replica_disk_test_base.h +++ b/src/replica/test/replica_disk_test_base.h @@ -95,6 +95,21 @@ class replica_disk_test_base : public replica_test_base } } + int32_t ignore_broken_disk_test(const std::string &mock_create_directory, + const std::string &mock_check_rw) + { + std::vector data_dirs = {"disk1", "disk2", "disk3"}; + std::vector data_dir_tags = {"tag1", "tag2", "tag3"}; + auto test_stub = make_unique(); + fail::cfg("filesystem_create_directory", "return(" + mock_create_directory + ")"); + fail::cfg("filesystem_check_dir_rw", "return(" + mock_check_rw + ")"); + fail::cfg("update_disk_stat", "return()"); + test_stub->initialize_fs_manager(data_dirs, data_dir_tags); + int32_t dir_size = test_stub->_fs_manager.get_available_data_dirs().size(); + test_stub.reset(); + return dir_size; + } + public: int empty_dir_nodes_count = 1; int dir_nodes_count = 5; diff --git a/src/utils/filesystem.cpp b/src/utils/filesystem.cpp index 14c9c5c87a..a6f6533768 100644 --- a/src/utils/filesystem.cpp +++ b/src/utils/filesystem.cpp @@ -37,6 +37,7 @@ #include #include +#include #include #include #include @@ -826,6 +827,78 @@ bool verify_file(const std::string &fname, return true; } +bool create_directory(const std::string &path, std::string &absolute_path, std::string &err_msg) +{ + FAIL_POINT_INJECT_F("filesystem_create_directory", [path](string_view str) { + // when str contains 'false', and path contains broken_disk_dir, mock create fail(return + // false) + std::string broken_disk_dir = "disk1"; + return str.find("false") == string_view::npos || + path.find(broken_disk_dir) == std::string::npos; + }); + + if (!create_directory(path)) { + err_msg = fmt::format("Fail to create directory {}.", path); + return false; + } + if (!get_absolute_path(path, absolute_path)) { + err_msg = fmt::format("Fail to get absolute path from {}.", path); + return false; + } + return true; +} + +bool write_file(const std::string &fname, std::string &buf) +{ + if (!file_exists(fname)) { + derror_f("file({}) doesn't exist", fname); + return false; + } + + std::ofstream fstream; + fstream.open(fname.c_str()); + fstream << buf; + fstream.close(); + return true; +} + +bool check_dir_rw(const std::string &path, std::string &err_msg) +{ + FAIL_POINT_INJECT_F("filesystem_check_dir_rw", [path](string_view str) { + // when str contains 'false', and path contains broken_disk_dir, mock check fail(return + // false) + std::string broken_disk_dir = "disk1"; + return str.find("false") == string_view::npos || + path.find(broken_disk_dir) == std::string::npos; + }); + + std::string fname = "read_write_test_file"; + std::string fpath = path_combine(path, fname); + if (!create_file(fpath)) { + err_msg = fmt::format("Fail to create test file {}.", fpath); + return false; + } + + std::string value = "test_value"; + if (!write_file(fpath, value)) { + err_msg = fmt::format("Fail to write file {}.", fpath); + return false; + } + + std::string buf; + if (read_file(fpath, buf) != ERR_OK) { + err_msg = fmt::format("Fail to read file {}.", fpath); + return false; + } + + if (!remove_path(fpath)) { + err_msg = fmt::format("Fail to remove test file {}.", fpath); + return false; + } + + return true; +} + } // namespace filesystem } // namespace utils } // namespace dsn From 71708caac7cfeae1ebd0ce1193c3bb942ea3b99c Mon Sep 17 00:00:00 2001 From: heyuchen Date: Mon, 17 May 2021 13:14:32 +0800 Subject: [PATCH 2/5] fix --- src/replica/test/replica_disk_test_base.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/replica/test/replica_disk_test_base.h b/src/replica/test/replica_disk_test_base.h index 237c32a3dd..db93fd7571 100644 --- a/src/replica/test/replica_disk_test_base.h +++ b/src/replica/test/replica_disk_test_base.h @@ -146,7 +146,7 @@ class replica_disk_test_base : public replica_test_base dir_node *node_disk = new dir_node(fmt::format("tag_empty_{}", num), fmt::format("./tag_empty_{}", num)); stub->_fs_manager._dir_nodes.emplace_back(node_disk); - stub->_options.data_dirs.push_back(node_disk->full_dir); + stub->_fs_manager._available_data_dirs.emplace_back(node_disk->full_dir); utils::filesystem::create_directory(node_disk->full_dir); num--; } @@ -175,7 +175,7 @@ class replica_disk_test_base : public replica_test_base disk_available_mb, disk_available_ratio); - stub->_options.data_dirs.push_back( + stub->_fs_manager._available_data_dirs.emplace_back( node_disk->full_dir); // open replica need the options utils::filesystem::create_directory(node_disk->full_dir); From 5ed2a197bfcd0c7e4b0ef92ba3cb6948723922f3 Mon Sep 17 00:00:00 2001 From: heyuchen Date: Mon, 24 May 2021 10:20:41 +0800 Subject: [PATCH 3/5] update by cr --- src/replica/replica_stub.cpp | 7 ++----- src/utils/filesystem.cpp | 4 ++-- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/replica/replica_stub.cpp b/src/replica/replica_stub.cpp index 5845670086..2efc387db8 100644 --- a/src/replica/replica_stub.cpp +++ b/src/replica/replica_stub.cpp @@ -785,11 +785,8 @@ void replica_stub::initialize_fs_manager(std::vector &data_dirs, std::vector available_dir_tags; for (int i = 0; i < data_dir_tags.size(); ++i) { std::string &dir = data_dirs[i]; - bool flag = dsn::utils::filesystem::create_directory(dir, cdir, err_msg); - if (flag) { - flag = dsn::utils::filesystem::check_dir_rw(dir, err_msg); - } - if (!flag) { + if (!utils::filesystem::create_directory(dir, cdir, err_msg) || + !utils::filesystem::check_dir_rw(dir, err_msg)) { if (FLAGS_ignore_broken_disk) { dwarn_f("data dir[{}] is broken, ignore it, error:{}", dir, err_msg); } else { diff --git a/src/utils/filesystem.cpp b/src/utils/filesystem.cpp index a6f6533768..6f5a0a2b1b 100644 --- a/src/utils/filesystem.cpp +++ b/src/utils/filesystem.cpp @@ -886,8 +886,8 @@ bool check_dir_rw(const std::string &path, std::string &err_msg) } std::string buf; - if (read_file(fpath, buf) != ERR_OK) { - err_msg = fmt::format("Fail to read file {}.", fpath); + if (read_file(fpath, buf) != ERR_OK || buf != value) { + err_msg = fmt::format("Fail to read file {} or get wrong value({}).", fpath, buf); return false; } From 85fba405c0ff603095d58d6cf5c424d2f25f9077 Mon Sep 17 00:00:00 2001 From: heyuchen Date: Wed, 2 Jun 2021 17:23:58 +0800 Subject: [PATCH 4/5] update by cr --- include/dsn/utility/filesystem.h | 1 + src/replica/replica_stub.cpp | 4 ++-- src/utils/filesystem.cpp | 2 ++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/include/dsn/utility/filesystem.h b/include/dsn/utility/filesystem.h index 00b699f18b..b2cb0cc6c8 100644 --- a/include/dsn/utility/filesystem.h +++ b/include/dsn/utility/filesystem.h @@ -144,6 +144,7 @@ bool create_directory(const std::string &path, bool write_file(const std::string &fname, std::string &buf); // check if directory is readable and writable +// call `create_directory` before to make `path` exist bool check_dir_rw(const std::string &path, /*out*/ std::string &err_msg); } // namespace filesystem diff --git a/src/replica/replica_stub.cpp b/src/replica/replica_stub.cpp index 6096fa3a0f..93032bd5c1 100644 --- a/src/replica/replica_stub.cpp +++ b/src/replica/replica_stub.cpp @@ -786,8 +786,8 @@ void replica_stub::initialize_fs_manager(std::vector &data_dirs, std::vector available_dir_tags; for (int i = 0; i < data_dir_tags.size(); ++i) { std::string &dir = data_dirs[i]; - if (!utils::filesystem::create_directory(dir, cdir, err_msg) || - !utils::filesystem::check_dir_rw(dir, err_msg)) { + if (dsn_unlikely(!utils::filesystem::create_directory(dir, cdir, err_msg)) || + dsn_unlikely(!utils::filesystem::check_dir_rw(dir, err_msg))) { if (FLAGS_ignore_broken_disk) { dwarn_f("data dir[{}] is broken, ignore it, error:{}", dir, err_msg); } else { diff --git a/src/utils/filesystem.cpp b/src/utils/filesystem.cpp index 6f5a0a2b1b..21fc233be8 100644 --- a/src/utils/filesystem.cpp +++ b/src/utils/filesystem.cpp @@ -882,12 +882,14 @@ bool check_dir_rw(const std::string &path, std::string &err_msg) std::string value = "test_value"; if (!write_file(fpath, value)) { err_msg = fmt::format("Fail to write file {}.", fpath); + remove_path(fpath); return false; } std::string buf; if (read_file(fpath, buf) != ERR_OK || buf != value) { err_msg = fmt::format("Fail to read file {} or get wrong value({}).", fpath, buf); + remove_path(fpath); return false; } From 5cdf258d8230e59963e74a12f81a7464cf72ee4d Mon Sep 17 00:00:00 2001 From: heyuchen Date: Fri, 4 Jun 2021 10:12:57 +0800 Subject: [PATCH 5/5] update by cr --- src/replica/replica_stub.cpp | 6 +++--- src/utils/filesystem.cpp | 9 ++------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/src/replica/replica_stub.cpp b/src/replica/replica_stub.cpp index 93032bd5c1..5cac5549f5 100644 --- a/src/replica/replica_stub.cpp +++ b/src/replica/replica_stub.cpp @@ -784,10 +784,10 @@ void replica_stub::initialize_fs_manager(std::vector &data_dirs, int count = 0; std::vector available_dirs; std::vector available_dir_tags; - for (int i = 0; i < data_dir_tags.size(); ++i) { + for (auto i = 0; i < data_dir_tags.size(); ++i) { std::string &dir = data_dirs[i]; - if (dsn_unlikely(!utils::filesystem::create_directory(dir, cdir, err_msg)) || - dsn_unlikely(!utils::filesystem::check_dir_rw(dir, err_msg))) { + if (dsn_unlikely(!utils::filesystem::create_directory(dir, cdir, err_msg) || + !utils::filesystem::check_dir_rw(dir, err_msg))) { if (FLAGS_ignore_broken_disk) { dwarn_f("data dir[{}] is broken, ignore it, error:{}", dir, err_msg); } else { diff --git a/src/utils/filesystem.cpp b/src/utils/filesystem.cpp index 21fc233be8..f4252f90b7 100644 --- a/src/utils/filesystem.cpp +++ b/src/utils/filesystem.cpp @@ -37,6 +37,7 @@ #include #include +#include #include #include #include @@ -879,22 +880,16 @@ bool check_dir_rw(const std::string &path, std::string &err_msg) return false; } + auto cleanup = defer([&fpath]() { remove_path(fpath); }); std::string value = "test_value"; if (!write_file(fpath, value)) { err_msg = fmt::format("Fail to write file {}.", fpath); - remove_path(fpath); return false; } std::string buf; if (read_file(fpath, buf) != ERR_OK || buf != value) { err_msg = fmt::format("Fail to read file {} or get wrong value({}).", fpath, buf); - remove_path(fpath); - return false; - } - - if (!remove_path(fpath)) { - err_msg = fmt::format("Fail to remove test file {}.", fpath); return false; }