Skip to content
This repository has been archived by the owner on Jun 23, 2022. It is now read-only.

feat(disk): add broken disk check while initialize #834

Merged
merged 7 commits into from
Jun 7, 2021
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions include/dsn/utility/filesystem.h
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,16 @@ bool verify_file(const std::string &fname,
const std::string &expected_md5,
const int64_t &expected_fsize);

// Create the directory `path` and resolve its absolute path.
// On success returns true and stores the absolute path in `absolute_path`.
// On failure (creation failed, or the absolute path could not be resolved)
// returns false and stores a human-readable description in `err_msg`.
bool create_directory(const std::string &path,
/*out*/ std::string &absolute_path,
/*out*/ std::string &err_msg);

// Write `buf` into the already existing file `fname`.
// Returns false if `fname` does not exist.
bool write_file(const std::string &fname, std::string &buf);

// Check whether the directory `path` is readable and writable by creating a
// test file in it, writing a value, reading it back and removing the file.
// Returns false and fills `err_msg` with the failing step when any step fails.
bool check_dir_rw(const std::string &path, /*out*/ std::string &err_msg);

} // namespace filesystem
} // namespace utils
} // namespace dsn
1 change: 1 addition & 0 deletions src/common/fs_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,7 @@ dsn::error_code fs_manager::initialize(const std::vector<std::string> &data_dirs
norm_path.c_str(),
tags[i].c_str());
}
_available_data_dirs = data_dirs;

if (!for_test) {
update_disk_stat();
Expand Down
7 changes: 7 additions & 0 deletions src/common/fs_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,12 @@ class fs_manager
bool for_each_dir_node(const std::function<bool(const dir_node &)> &func) const;
void update_disk_stat();

// Returns the data directories recorded as available by fs_manager::initialize().
// The read lock `_lock` is taken while the member is accessed.
// NOTE(review): `l` is a local guard, so the lock is presumably released when this
// accessor returns, while the caller keeps using the returned reference without it.
// This looks safe only if `_available_data_dirs` is not modified after
// initialization -- confirm, or return a copy instead.
const std::vector<std::string> &get_available_data_dirs() const
{
zauto_read_lock l(_lock);
return _available_data_dirs;
}

private:
void reset_disk_stat()
{
Expand All @@ -102,6 +108,7 @@ class fs_manager
int _max_available_ratio = 0;

std::vector<std::shared_ptr<dir_node>> _dir_nodes;
std::vector<std::string> _available_data_dirs;

perf_counter_wrapper _counter_total_capacity_mb;
perf_counter_wrapper _counter_total_available_mb;
Expand Down
73 changes: 45 additions & 28 deletions src/replica/replica_stub.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,12 @@
namespace dsn {
namespace replication {

// Flag `ignore_broken_disk` in the "replication" section, default true.
// replica_stub::initialize_fs_manager() reads it as FLAGS_ignore_broken_disk: when true,
// a data dir that cannot be created or fails the read/write check is logged and skipped;
// when false, such a dir triggers an assertion failure.
// NOTE(review): FT_MUTABLE presumably marks the flag as changeable at runtime -- confirm.
DSN_DEFINE_bool("replication",
ignore_broken_disk,
true,
"true means ignore broken data disk when initialize");
DSN_TAG_VARIABLE(ignore_broken_disk, FT_MUTABLE);

bool replica_stub::s_not_exit_on_log_failure = false;

replica_stub::replica_stub(replica_state_subscriber subscriber /*= nullptr*/,
Expand Down Expand Up @@ -503,33 +509,13 @@ void replica_stub::initialize(const replication_options &opts, bool clear /* = f
}

// init dirs
if (!dsn::utils::filesystem::create_directory(_options.slog_dir)) {
dassert(false, "Fail to create directory %s.", _options.slog_dir.c_str());
}
std::string cdir;
if (!dsn::utils::filesystem::get_absolute_path(_options.slog_dir, cdir)) {
dassert(false, "Fail to get absolute path from %s.", _options.slog_dir.c_str());
std::string err_msg;
if (!dsn::utils::filesystem::create_directory(_options.slog_dir, cdir, err_msg)) {
dassert(false, "{}", err_msg);
}
_options.slog_dir = cdir;
int count = 0;
for (auto &dir : _options.data_dirs) {
if (!dsn::utils::filesystem::create_directory(dir)) {
dassert(false, "Fail to create directory %s.", dir.c_str());
}
std::string cdir;
if (!dsn::utils::filesystem::get_absolute_path(dir, cdir)) {
dassert(false, "Fail to get absolute path from %s.", dir.c_str());
}
dir = cdir;
ddebug("data_dirs[%d] = %s", count, dir.c_str());
count++;
}

{
dsn::error_code err;
err = _fs_manager.initialize(_options.data_dirs, _options.data_dir_tags, false);
dassert(err == dsn::ERR_OK, "initialize fs manager failed, err(%s)", err.to_string());
}
initialize_fs_manager(_options.data_dirs, _options.data_dir_tags);

_log = new mutation_log_shared(_options.slog_dir,
_options.log_shared_file_size_mb,
Expand All @@ -541,7 +527,7 @@ void replica_stub::initialize(const replication_options &opts, bool clear /* = f
ddebug("start to load replicas");

std::vector<std::string> dir_list;
for (auto &dir : _options.data_dirs) {
for (auto &dir : _fs_manager.get_available_data_dirs()) {
std::vector<std::string> tmp_list;
if (!dsn::utils::filesystem::get_subdirectories(dir, tmp_list, false)) {
dassert(false, "Fail to get subdirectories in %s.", dir.c_str());
Expand Down Expand Up @@ -789,6 +775,37 @@ void replica_stub::initialize(const replication_options &opts, bool clear /* = f
}
}

void replica_stub::initialize_fs_manager(std::vector<std::string> &data_dirs,
hycdong marked this conversation as resolved.
Show resolved Hide resolved
std::vector<std::string> &data_dir_tags)
{
std::string cdir;
std::string err_msg;
int count = 0;
std::vector<std::string> available_dirs;
std::vector<std::string> available_dir_tags;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe you should assert data_dirs.size() == data_dir_tags.size() here

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

data_dirs and data_dir_tags are _options.data_dirs and _options.data_dir_tags; you can see how they are set in the initialize function in replication_common.cpp. I haven't added a link here because that code will be refactored by pull request 831.

for (int i = 0; i < data_dir_tags.size(); ++i) {
std::string &dir = data_dirs[i];
if (!utils::filesystem::create_directory(dir, cdir, err_msg) ||
!utils::filesystem::check_dir_rw(dir, err_msg)) {
hycdong marked this conversation as resolved.
Show resolved Hide resolved
if (FLAGS_ignore_broken_disk) {
dwarn_f("data dir[{}] is broken, ignore it, error:{}", dir, err_msg);
} else {
dassert_f(false, "{}", err_msg);
}
continue;
}
ddebug_f("data_dirs[{}] = {}", count, cdir);
available_dirs.emplace_back(cdir);
available_dir_tags.emplace_back(data_dir_tags[i]);
count++;
}

dassert_f(available_dirs.size() > 0,
"initialize fs manager failed, no available data directory");
error_code err = _fs_manager.initialize(available_dirs, available_dir_tags, false);
dassert_f(err == dsn::ERR_OK, "initialize fs manager failed, err({})", err);
}

void replica_stub::initialize_start()
{
// start timer for configuration sync
Expand Down Expand Up @@ -1860,7 +1877,7 @@ void replica_stub::on_disk_stat()
uint64_t start = dsn_now_ns();
disk_cleaning_report report{};

dsn::replication::disk_remove_useless_dirs(_options.data_dirs, report);
dsn::replication::disk_remove_useless_dirs(_fs_manager.get_available_data_dirs(), report);
_fs_manager.update_disk_stat();
update_disk_holding_replicas();

Expand Down Expand Up @@ -2555,7 +2572,7 @@ std::string replica_stub::get_replica_dir(const char *app_type, gpid id, bool cr
std::string gpid_str = fmt::format("{}.{}", id, app_type);
std::string replica_dir;
bool is_dir_exist = false;
for (const std::string &data_dir : _options.data_dirs) {
for (const std::string &data_dir : _fs_manager.get_available_data_dirs()) {
std::string dir = utils::filesystem::path_combine(data_dir, gpid_str);
if (utils::filesystem::directory_exists(dir)) {
if (is_dir_exist) {
Expand All @@ -2577,7 +2594,7 @@ replica_stub::get_child_dir(const char *app_type, gpid child_pid, const std::str
{
std::string gpid_str = fmt::format("{}.{}", child_pid.to_string(), app_type);
std::string child_dir;
for (const std::string &data_dir : _options.data_dirs) {
for (const std::string &data_dir : _fs_manager.get_available_data_dirs()) {
std::string dir = utils::filesystem::path_combine(data_dir, gpid_str);
// <parent_dir> = <prefix>/<gpid>.<app_type>
// check if <parent_dir>'s <prefix> is equal to <data_dir>
Expand Down
2 changes: 2 additions & 0 deletions src/replica/replica_stub.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ class replica_stub : public serverlet<replica_stub>, public ref_counter
//
void initialize(const replication_options &opts, bool clear = false);
void initialize(bool clear = false);
void initialize_fs_manager(std::vector<std::string> &data_dirs,
std::vector<std::string> &data_dir_tags);
void set_options(const replication_options &opts) { _options = opts; }
void open_service();
void close();
Expand Down
18 changes: 18 additions & 0 deletions src/replica/test/replica_disk_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -198,5 +198,23 @@ TEST_F(replica_disk_test, gc_disk_useless_dir)
ASSERT_EQ(report.error_replica_count, 2);
}

TEST_F(replica_disk_test, broken_disk_test)
{
    // Each case supplies the mocked results for the two filesystem fail points and the
    // number of data dirs ignore_broken_disk_test() is expected to report as available:
    //   directory creation succeeds, read/write check succeeds
    //   directory creation succeeds, read/write check fails
    //   directory creation fails
    struct disk_case
    {
        std::string create_dir_result;
        std::string check_rw_result;
        int32_t expected_available_dirs;
    };
    const disk_case cases[] = {{"true", "true", 3}, {"true", "false", 2}, {"false", "false", 2}};

    for (const disk_case &c : cases) {
        ASSERT_EQ(c.expected_available_dirs,
                  ignore_broken_disk_test(c.create_dir_result, c.check_rw_result));
    }
}

} // namespace replication
} // namespace dsn
19 changes: 17 additions & 2 deletions src/replica/test/replica_disk_test_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,21 @@ class replica_disk_test_base : public replica_test_base
}
}

// Runs replica_stub::initialize_fs_manager() on a fresh mock stub with three data dirs
// ("disk1".."disk3") while the filesystem fail points are configured from the arguments,
// and returns how many data dirs the fs manager reports as available afterwards.
//   mock_create_directory: argument for the "filesystem_create_directory" fail point
//   mock_check_rw:         argument for the "filesystem_check_dir_rw" fail point
int32_t ignore_broken_disk_test(const std::string &mock_create_directory,
                                const std::string &mock_check_rw)
{
    std::vector<std::string> dirs = {"disk1", "disk2", "disk3"};
    std::vector<std::string> tags = {"tag1", "tag2", "tag3"};
    auto stub_under_test = make_unique<mock_replica_stub>();

    // Fail points are configured after the stub is constructed and before it is initialized.
    fail::cfg("filesystem_create_directory", "return(" + mock_create_directory + ")");
    fail::cfg("filesystem_check_dir_rw", "return(" + mock_check_rw + ")");
    fail::cfg("update_disk_stat", "return()");

    stub_under_test->initialize_fs_manager(dirs, tags);
    int32_t available_count = stub_under_test->_fs_manager.get_available_data_dirs().size();

    // Destroy the stub explicitly before handing back the result.
    stub_under_test.reset();
    return available_count;
}

public:
int empty_dir_nodes_count = 1;
int dir_nodes_count = 5;
Expand Down Expand Up @@ -131,7 +146,7 @@ class replica_disk_test_base : public replica_test_base
dir_node *node_disk =
new dir_node(fmt::format("tag_empty_{}", num), fmt::format("./tag_empty_{}", num));
stub->_fs_manager._dir_nodes.emplace_back(node_disk);
stub->_options.data_dirs.push_back(node_disk->full_dir);
stub->_fs_manager._available_data_dirs.emplace_back(node_disk->full_dir);
utils::filesystem::create_directory(node_disk->full_dir);
num--;
}
Expand Down Expand Up @@ -160,7 +175,7 @@ class replica_disk_test_base : public replica_test_base
disk_available_mb,
disk_available_ratio);

stub->_options.data_dirs.push_back(
stub->_fs_manager._available_data_dirs.emplace_back(
node_disk->full_dir); // open replica need the options
utils::filesystem::create_directory(node_disk->full_dir);

Expand Down
73 changes: 73 additions & 0 deletions src/utils/filesystem.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@

#include <dsn/c/api_utilities.h>
#include <dsn/dist/fmt_logging.h>
#include <dsn/utility/fail_point.h>
#include <dsn/utility/filesystem.h>
#include <dsn/utility/utils.h>
#include <dsn/utility/safe_strerror_posix.h>
Expand Down Expand Up @@ -826,6 +827,78 @@ bool verify_file(const std::string &fname,
return true;
}

// Create the directory `path` and store its absolute form in `absolute_path`.
// Returns true on success. On failure returns false and describes the failing step
// in `err_msg`.
bool create_directory(const std::string &path, std::string &absolute_path, std::string &err_msg)
{
    FAIL_POINT_INJECT_F("filesystem_create_directory", [path](string_view str) {
        // Mocked result: report failure (false) only when the injected argument contains
        // 'false' and `path` contains the broken disk name; otherwise report success.
        const std::string broken_disk_dir = "disk1";
        const bool mock_failure = str.find("false") != string_view::npos;
        const bool on_broken_disk = path.find(broken_disk_dir) != std::string::npos;
        return !(mock_failure && on_broken_disk);
    });

    if (create_directory(path)) {
        if (get_absolute_path(path, absolute_path)) {
            return true;
        }
        err_msg = fmt::format("Fail to get absolute path from {}.", path);
        return false;
    }

    err_msg = fmt::format("Fail to create directory {}.", path);
    return false;
}

// Write `buf` into the already existing file `fname`, replacing its previous content
// (the stream is opened with the default std::ofstream mode).
// Returns true only if the file exists, could be opened, and the data was written and
// the stream closed without error; every failure is logged and reported as false.
bool write_file(const std::string &fname, std::string &buf)
{
    if (!file_exists(fname)) {
        derror_f("file({}) doesn't exist", fname);
        return false;
    }

    std::ofstream fstream(fname.c_str());
    if (!fstream.is_open()) {
        // e.g. read-only file system or missing permission
        derror_f("open file({}) for write failed", fname);
        return false;
    }

    fstream << buf;
    // close() flushes the buffered data; it sets failbit when the flush or close fails,
    // so the state check below covers both the insertion and the final flush.
    fstream.close();
    if (fstream.fail()) {
        derror_f("write file({}) failed", fname);
        return false;
    }
    return true;
}

bool check_dir_rw(const std::string &path, std::string &err_msg)
{
FAIL_POINT_INJECT_F("filesystem_check_dir_rw", [path](string_view str) {
// when str contains 'false', and path contains broken_disk_dir, mock check fail(return
// false)
std::string broken_disk_dir = "disk1";
return str.find("false") == string_view::npos ||
path.find(broken_disk_dir) == std::string::npos;
});

std::string fname = "read_write_test_file";
std::string fpath = path_combine(path, fname);
hycdong marked this conversation as resolved.
Show resolved Hide resolved
if (!create_file(fpath)) {
err_msg = fmt::format("Fail to create test file {}.", fpath);
return false;
}

std::string value = "test_value";
if (!write_file(fpath, value)) {
err_msg = fmt::format("Fail to write file {}.", fpath);
return false;
}

std::string buf;
if (read_file(fpath, buf) != ERR_OK || buf != value) {
err_msg = fmt::format("Fail to read file {} or get wrong value({}).", fpath, buf);
return false;
}

if (!remove_path(fpath)) {
err_msg = fmt::format("Fail to remove test file {}.", fpath);
return false;
}

return true;
}

} // namespace filesystem
} // namespace utils
} // namespace dsn