Skip to content

Commit

Permalink
feat(disk): add broken disk check while initialize (#834)
Browse files Browse the repository at this point in the history
  • Loading branch information
hycdong authored Jun 7, 2021
1 parent 78aefe1 commit f1827ec
Show file tree
Hide file tree
Showing 8 changed files with 172 additions and 30 deletions.
11 changes: 11 additions & 0 deletions include/dsn/utility/filesystem.h
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,17 @@ bool verify_file(const std::string &fname,
const std::string &expected_md5,
const int64_t &expected_fsize);

// Create directory `path` and get its absolute path.
// Returns true and stores the absolute path in `absolute_path` on success.
// Returns false and describes the problem in `err_msg` when either creating the
// directory or resolving its absolute path fails.
bool create_directory(const std::string &path,
/*out*/ std::string &absolute_path,
/*out*/ std::string &err_msg);

// Write `buf` into the file `fname`.
// The file must already exist: returns false if `fname` does not exist.
bool write_file(const std::string &fname, std::string &buf);

// Check if directory is readable and writable by creating a probe file named
// "read_write_test_file" inside `path`, writing a value to it and reading it back.
// The probe file is removed before returning once it has been created.
// Returns false and fills `err_msg` when any of those steps fails.
// Call `create_directory` before to make `path` exist.
bool check_dir_rw(const std::string &path, /*out*/ std::string &err_msg);

} // namespace filesystem
} // namespace utils
} // namespace dsn
1 change: 1 addition & 0 deletions src/common/fs_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@ dsn::error_code fs_manager::initialize(const std::vector<std::string> &data_dirs
norm_path.c_str(),
tags[i].c_str());
}
_available_data_dirs = data_dirs;

if (!for_test) {
update_disk_stat();
Expand Down
8 changes: 8 additions & 0 deletions src/common/fs_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,12 @@ class fs_manager
bool for_each_dir_node(const std::function<bool(const dir_node &)> &func) const;
void update_disk_stat();

// Returns the data directories recorded by `initialize()`.
// NOTE(review): the read lock is only held while the reference is formed; the
// returned reference is used by callers after the lock has been released, so it
// is only safe if `_available_data_dirs` is not modified concurrently - confirm.
const std::vector<std::string> &get_available_data_dirs() const
{
    zauto_read_lock guard(_lock);
    return _available_data_dirs;
}

private:
void reset_disk_stat()
{
Expand All @@ -112,6 +118,8 @@ class fs_manager
int _max_available_ratio = 0;

std::vector<std::shared_ptr<dir_node>> _dir_nodes;
std::vector<std::string> _available_data_dirs;

// Used for disk available space check
// disk status will be updated periodically, this vector record nodes whose disk_status changed
// in this round
Expand Down
73 changes: 45 additions & 28 deletions src/replica/replica_stub.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,12 @@
namespace dsn {
namespace replication {

// Flag `ignore_broken_disk` in section "replication", default true.
// Read by replica_stub::initialize_fs_manager: when true, a data dir that cannot be
// created or fails the read/write check is logged and skipped; when false, the
// failure triggers an assertion instead.
// NOTE(review): the FT_MUTABLE tag presumably allows changing the flag at runtime -
// confirm against the flag framework.
DSN_DEFINE_bool("replication",
ignore_broken_disk,
true,
"true means ignore broken data disk when initialize");
DSN_TAG_VARIABLE(ignore_broken_disk, FT_MUTABLE);

bool replica_stub::s_not_exit_on_log_failure = false;

replica_stub::replica_stub(replica_state_subscriber subscriber /*= nullptr*/,
Expand Down Expand Up @@ -504,33 +510,13 @@ void replica_stub::initialize(const replication_options &opts, bool clear /* = f
}

// init dirs
if (!dsn::utils::filesystem::create_directory(_options.slog_dir)) {
dassert(false, "Fail to create directory %s.", _options.slog_dir.c_str());
}
std::string cdir;
if (!dsn::utils::filesystem::get_absolute_path(_options.slog_dir, cdir)) {
dassert(false, "Fail to get absolute path from %s.", _options.slog_dir.c_str());
std::string err_msg;
if (!dsn::utils::filesystem::create_directory(_options.slog_dir, cdir, err_msg)) {
dassert(false, "{}", err_msg);
}
_options.slog_dir = cdir;
int count = 0;
for (auto &dir : _options.data_dirs) {
if (!dsn::utils::filesystem::create_directory(dir)) {
dassert(false, "Fail to create directory %s.", dir.c_str());
}
std::string cdir;
if (!dsn::utils::filesystem::get_absolute_path(dir, cdir)) {
dassert(false, "Fail to get absolute path from %s.", dir.c_str());
}
dir = cdir;
ddebug("data_dirs[%d] = %s", count, dir.c_str());
count++;
}

{
dsn::error_code err;
err = _fs_manager.initialize(_options.data_dirs, _options.data_dir_tags, false);
dassert(err == dsn::ERR_OK, "initialize fs manager failed, err(%s)", err.to_string());
}
initialize_fs_manager(_options.data_dirs, _options.data_dir_tags);

_log = new mutation_log_shared(_options.slog_dir,
_options.log_shared_file_size_mb,
Expand All @@ -542,7 +528,7 @@ void replica_stub::initialize(const replication_options &opts, bool clear /* = f
ddebug("start to load replicas");

std::vector<std::string> dir_list;
for (auto &dir : _options.data_dirs) {
for (auto &dir : _fs_manager.get_available_data_dirs()) {
std::vector<std::string> tmp_list;
if (!dsn::utils::filesystem::get_subdirectories(dir, tmp_list, false)) {
dassert(false, "Fail to get subdirectories in %s.", dir.c_str());
Expand Down Expand Up @@ -790,6 +776,37 @@ void replica_stub::initialize(const replication_options &opts, bool clear /* = f
}
}

// Probe every configured data directory and initialize `_fs_manager` with the
// usable ones.
//
// For each (data_dirs[i], data_dir_tags[i]) pair the directory is created,
// resolved to its absolute path and checked for read/write access. A directory
// that fails either step is considered broken:
//   - if FLAGS_ignore_broken_disk is true, a warning is logged and it is skipped;
//   - otherwise an assertion is raised with the error message.
// Asserts if no directory is usable or if `_fs_manager.initialize` fails.
//
// `data_dirs` and `data_dir_tags` are expected to be parallel vectors.
void replica_stub::initialize_fs_manager(std::vector<std::string> &data_dirs,
                                         std::vector<std::string> &data_dir_tags)
{
    std::vector<std::string> available_dirs;
    std::vector<std::string> available_dir_tags;

    // Only walk indices that are valid in BOTH vectors: the loop reads data_dirs[i]
    // and data_dir_tags[i], so bounding it by just one of the sizes could index
    // past the end of the other.
    for (size_t i = 0; i < data_dirs.size() && i < data_dir_tags.size(); ++i) {
        const std::string &dir = data_dirs[i];
        // Fresh outputs for every directory, so a failure never reports a message
        // or path left over from a previous iteration.
        std::string cdir;
        std::string err_msg;
        if (dsn_unlikely(!utils::filesystem::create_directory(dir, cdir, err_msg) ||
                         !utils::filesystem::check_dir_rw(dir, err_msg))) {
            if (FLAGS_ignore_broken_disk) {
                dwarn_f("data dir[{}] is broken, ignore it, error:{}", dir, err_msg);
            } else {
                dassert_f(false, "{}", err_msg);
            }
            continue;
        }
        // The index logged is the position among the available dirs, not `i`.
        ddebug_f("data_dirs[{}] = {}", available_dirs.size(), cdir);
        available_dirs.emplace_back(cdir);
        available_dir_tags.emplace_back(data_dir_tags[i]);
    }

    dassert_f(available_dirs.size() > 0,
              "initialize fs manager failed, no available data directory");
    error_code err = _fs_manager.initialize(available_dirs, available_dir_tags, false);
    dassert_f(err == dsn::ERR_OK, "initialize fs manager failed, err({})", err);
}

void replica_stub::initialize_start()
{
// start timer for configuration sync
Expand Down Expand Up @@ -1861,7 +1878,7 @@ void replica_stub::on_disk_stat()
uint64_t start = dsn_now_ns();
disk_cleaning_report report{};

dsn::replication::disk_remove_useless_dirs(_options.data_dirs, report);
dsn::replication::disk_remove_useless_dirs(_fs_manager.get_available_data_dirs(), report);
_fs_manager.update_disk_stat();
update_disk_holding_replicas();
update_disks_status();
Expand Down Expand Up @@ -2569,7 +2586,7 @@ std::string replica_stub::get_replica_dir(const char *app_type, gpid id, bool cr
std::string gpid_str = fmt::format("{}.{}", id, app_type);
std::string replica_dir;
bool is_dir_exist = false;
for (const std::string &data_dir : _options.data_dirs) {
for (const std::string &data_dir : _fs_manager.get_available_data_dirs()) {
std::string dir = utils::filesystem::path_combine(data_dir, gpid_str);
if (utils::filesystem::directory_exists(dir)) {
if (is_dir_exist) {
Expand All @@ -2591,7 +2608,7 @@ replica_stub::get_child_dir(const char *app_type, gpid child_pid, const std::str
{
std::string gpid_str = fmt::format("{}.{}", child_pid.to_string(), app_type);
std::string child_dir;
for (const std::string &data_dir : _options.data_dirs) {
for (const std::string &data_dir : _fs_manager.get_available_data_dirs()) {
std::string dir = utils::filesystem::path_combine(data_dir, gpid_str);
// <parent_dir> = <prefix>/<gpid>.<app_type>
// check if <parent_dir>'s <prefix> is equal to <data_dir>
Expand Down
2 changes: 2 additions & 0 deletions src/replica/replica_stub.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ class replica_stub : public serverlet<replica_stub>, public ref_counter
//
void initialize(const replication_options &opts, bool clear = false);
void initialize(bool clear = false);
// Creates and read/write-checks each dir in `data_dirs`, then initializes
// `_fs_manager` with the dirs (and matching tags) that passed the check.
void initialize_fs_manager(std::vector<std::string> &data_dirs,
std::vector<std::string> &data_dir_tags);
void set_options(const replication_options &opts) { _options = opts; }
void open_service();
void close();
Expand Down
18 changes: 18 additions & 0 deletions src/replica/test/replica_disk_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,5 +224,23 @@ TEST_F(replica_disk_test, disk_status_test)
mock_node_status(node_index, disk_status::NORMAL, disk_status::NORMAL);
}

TEST_F(replica_disk_test, broken_disk_test)
{
    // Each case configures the mocked results of directory creation and of the
    // read/write check, and states how many data dirs must remain available:
    //   create: true,  check_rw: true
    //   create: true,  check_rw: false
    //   create: false
    struct broken_disk_case
    {
        std::string create_dir_result;
        std::string check_rw_result;
        int32_t expected_dir_count;
    };
    const broken_disk_case cases[] = {
        {"true", "true", 3},
        {"true", "false", 2},
        {"false", "false", 2},
    };

    for (const auto &c : cases) {
        const int32_t actual_dir_count =
            ignore_broken_disk_test(c.create_dir_result, c.check_rw_result);
        ASSERT_EQ(c.expected_dir_count, actual_dir_count);
    }
}

} // namespace replication
} // namespace dsn
19 changes: 17 additions & 2 deletions src/replica/test/replica_disk_test_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,21 @@ class replica_disk_test_base : public replica_test_base
return ERR_OK;
}

// Runs `initialize_fs_manager` on a fresh mock stub with three data dirs
// ("disk1".."disk3") while the filesystem fail points are configured with the
// given mock results, and returns how many data dirs ended up available.
int32_t ignore_broken_disk_test(const std::string &mock_create_directory,
                                const std::string &mock_check_rw)
{
    std::vector<std::string> dirs = {"disk1", "disk2", "disk3"};
    std::vector<std::string> tags = {"tag1", "tag2", "tag3"};
    auto stub_under_test = make_unique<mock_replica_stub>();

    // Configure the fail points only after the stub exists.
    const std::string create_action = "return(" + mock_create_directory + ")";
    const std::string check_rw_action = "return(" + mock_check_rw + ")";
    fail::cfg("filesystem_create_directory", create_action);
    fail::cfg("filesystem_check_dir_rw", check_rw_action);
    fail::cfg("update_disk_stat", "return()");

    stub_under_test->initialize_fs_manager(dirs, tags);
    const auto &available = stub_under_test->_fs_manager.get_available_data_dirs();
    int32_t available_count = static_cast<int32_t>(available.size());

    // Destroy the stub before returning; `available` must not be used past here.
    stub_under_test.reset();
    return available_count;
}

public:
int empty_dir_nodes_count = 1;
int dir_nodes_count = 5;
Expand Down Expand Up @@ -159,7 +174,7 @@ class replica_disk_test_base : public replica_test_base
dir_node *node_disk =
new dir_node(fmt::format("tag_empty_{}", num), fmt::format("./tag_empty_{}", num));
stub->_fs_manager._dir_nodes.emplace_back(node_disk);
stub->_options.data_dirs.push_back(node_disk->full_dir);
stub->_fs_manager._available_data_dirs.emplace_back(node_disk->full_dir);
utils::filesystem::create_directory(node_disk->full_dir);
num--;
}
Expand Down Expand Up @@ -188,7 +203,7 @@ class replica_disk_test_base : public replica_test_base
disk_available_mb,
disk_available_ratio);

stub->_options.data_dirs.push_back(
stub->_fs_manager._available_data_dirs.emplace_back(
node_disk->full_dir); // open replica need the options
utils::filesystem::create_directory(node_disk->full_dir);

Expand Down
70 changes: 70 additions & 0 deletions src/utils/filesystem.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@

#include <dsn/c/api_utilities.h>
#include <dsn/dist/fmt_logging.h>
#include <dsn/utility/defer.h>
#include <dsn/utility/fail_point.h>
#include <dsn/utility/filesystem.h>
#include <dsn/utility/utils.h>
#include <dsn/utility/safe_strerror_posix.h>
Expand Down Expand Up @@ -826,6 +828,74 @@ bool verify_file(const std::string &fname,
return true;
}

// Creates directory `path` and stores its absolute path in `absolute_path`.
// Returns true on success; on failure returns false and sets `err_msg` to a
// description of the step that failed.
bool create_directory(const std::string &path, std::string &absolute_path, std::string &err_msg)
{
    FAIL_POINT_INJECT_F("filesystem_create_directory", [path](string_view str) {
        // Mock a failure (return false) only when the configured value contains
        // 'false' AND the path refers to the designated broken disk "disk1".
        const bool mock_failure = str.find("false") != string_view::npos;
        const bool on_broken_disk = path.find("disk1") != std::string::npos;
        return !(mock_failure && on_broken_disk);
    });

    if (create_directory(path)) {
        if (get_absolute_path(path, absolute_path)) {
            return true;
        }
        err_msg = fmt::format("Fail to get absolute path from {}.", path);
        return false;
    }

    err_msg = fmt::format("Fail to create directory {}.", path);
    return false;
}

// Writes `buf` into the existing file `fname` (opened with the default
// std::ofstream mode).
// Returns false and logs an error if the file does not exist, cannot be opened,
// or if writing/closing the stream reports a failure; returns true only when
// the stream stayed healthy through the close.
bool write_file(const std::string &fname, std::string &buf)
{
    if (!file_exists(fname)) {
        derror_f("file({}) doesn't exist", fname);
        return false;
    }

    std::ofstream fstream;
    fstream.open(fname.c_str());
    if (!fstream.is_open()) {
        derror_f("fail to open file({}) for write", fname);
        return false;
    }

    fstream << buf;
    // close() flushes buffered data; a failed write or flush leaves badbit/failbit
    // set, so the stream state must be inspected after closing.
    fstream.close();
    if (fstream.fail()) {
        derror_f("fail to write file({})", fname);
        return false;
    }
    return true;
}

// Verifies that directory `path` is readable and writable by creating a probe
// file in it, writing a known value and reading it back. The probe file is
// removed on every exit path once it has been created.
// Returns true if the round trip succeeds; otherwise returns false and sets
// `err_msg` to a description of the step that failed.
bool check_dir_rw(const std::string &path, std::string &err_msg)
{
    FAIL_POINT_INJECT_F("filesystem_check_dir_rw", [path](string_view str) {
        // Mock a failure (return false) only when the configured value contains
        // 'false' AND the path refers to the designated broken disk "disk1".
        const bool mock_failure = str.find("false") != string_view::npos;
        const bool on_broken_disk = path.find("disk1") != std::string::npos;
        return !(mock_failure && on_broken_disk);
    });

    std::string probe_name = "read_write_test_file";
    std::string probe_path = path_combine(path, probe_name);
    if (!create_file(probe_path)) {
        err_msg = fmt::format("Fail to create test file {}.", probe_path);
        return false;
    }

    // From here on the probe file exists, so make sure it is always removed.
    auto cleanup = defer([&probe_path]() { remove_path(probe_path); });

    std::string expected = "test_value";
    if (!write_file(probe_path, expected)) {
        err_msg = fmt::format("Fail to write file {}.", probe_path);
        return false;
    }

    std::string actual;
    const bool round_trip_ok = read_file(probe_path, actual) == ERR_OK && actual == expected;
    if (!round_trip_ok) {
        err_msg =
            fmt::format("Fail to read file {} or get wrong value({}).", probe_path, actual);
        return false;
    }

    return true;
}

} // namespace filesystem
} // namespace utils
} // namespace dsn

0 comments on commit f1827ec

Please sign in to comment.