diff --git a/README.md b/README.md
index 99412f1..4b47c7f 100644
--- a/README.md
+++ b/README.md
@@ -45,6 +45,11 @@ Start all services, disturb (cat /dev/zero until disk is full) while write a cir
 #### [random_slow_disk](conf/random_slow_disk.json)
 Start all services, disturb (simulate slow disk io) while write a circle, then check data integrity. We use [SysytemTap](https://sourceware.org/systemtap/wiki) to simulate slow disk io. The `major` and `minor` field is the MAJOR/MINOR device id of disk where storage serveice's data path mounted.
 
+#### [random_fiu_injection](conf/random_fiu_injection.json)
+Start all services, inject faults with [libfiu](https://blitiri.com.ar/p/libfiu/) while writing a circle, then check data integrity.
+The following failure points are known to make storage crash: `posix/io/oc/open`, `posix/io/oc/close`, `posix/io/rw/write`, `posix/io/sync/fsync`.
+> Be sure to compile nebula with the flag `-DENABLE_JEMALLOC=OFF`; otherwise libfiu cannot be preloaded.
+
 ```
 yum install systemtap
 ```
diff --git a/conf/random_fiu_injection.json b/conf/random_fiu_injection.json
new file mode 100644
index 0000000..b56924b
--- /dev/null
+++ b/conf/random_fiu_injection.json
@@ -0,0 +1,186 @@
+{
+    "name": "Nebula random-fiu-injection plan",
+    "concurrency": 10,
+    "rolling_table": true,
+    "instances": [
+        {
+            "host":"",
+            "install_dir":"",
+            "conf_dir":"",
+            "type": "graphd",
+            "user": ""
+        },
+        {
+            "host":"",
+            "install_dir":"",
+            "conf_dir":"",
+            "type": "metad",
+            "user": ""
+        },
+        {
+            "host":"",
+            "install_dir":"",
+            "conf_dir":"",
+            "type": "storaged",
+            "user": ""
+        },
+        {
+            "host":"",
+            "install_dir":"",
+            "conf_dir":"",
+            "type": "storaged",
+            "user": ""
+        },
+        {
+            "host":"",
+            "install_dir":"",
+            "conf_dir":"",
+            "type": "storaged",
+            "user": ""
+        }
+    ],
+    "actions" : [
+        {
+            "type": "StartAction",
+            "inst_index": 0,
+            "depends": [2, 3, 4]
+        },
+        {
+            "type": "StartAction",
+            "inst_index": 1,
+            "depends": []
+        },
+        {
+            "type": "StartAction",
+            "inst_index": 2,
+            "load_fiu": true,
+            "depends": [1]
+        },
+        {
+            "type": "StartAction",
+            "inst_index": 3,
+            "load_fiu": true,
+            "depends": [1]
+        },
+        {
+            "type": "StartAction",
+            "inst_index": 4,
+            "load_fiu": true,
+            "depends": [1]
+        },
+        {
+            "type": "WaitAction",
+            "wait_time_ms": 15000,
+            "depends": [0]
+        },
+        {
+            "type": "ClientConnectAction",
+            "depends": [5]
+        },
+        {
+            "type": "CreateSpaceAction",
+            "space_name": "random_kill_clean",
+            "replica": 3,
+            "parts": 10,
+            "depends": [6]
+        },
+        {
+            "type": "UseSpaceAction",
+            "space_name": "random_kill_clean",
+            "depends": [7]
+        },
+        {
+            "type": "CreateSchemaAction",
+            "name": "circle",
+            "props": [
+                {"name": "nextId", "type": "int"}
+            ],
+            "edge_or_tag": false,
+            "depends": [8]
+        },
+        {
+            "type": "WaitAction",
+            "wait_time_ms": 10000,
+            "depends": [9]
+        },
+        {
+            "type": "BalanceLeaderAction",
+            "depends": [10]
+        },
+        {
+            "type": "CheckLeadersAction",
+            "expected_num": 10,
+            "space": "random_kill_clean",
+            "depends": [11]
+        },
+        {
+            "type": "RandomFiuAction",
+            "storages": [2,3,4],
+            "loop_times": 3,
+            "restart_interval": 60,
+            "next_loop_interval": 60,
+            "name": "posix/io/rw/write",
+            "probability": 1,
+            "depends": [12]
+        },
+        {
+            "type": "WriteCircleAction",
+            "tag": "circle",
+            "col": "nextId",
+            "total_rows": 400000,
+            "depends": [12]
+        },
+        {
+            "type": "WaitAction",
+            "wait_time_ms": 30000,
+            "depends": [13, 14]
+        },
+        {
+            "type": "BalanceLeaderAction",
+            "depends": [15]
+        },
+        {
+            "type": "CheckLeadersAction",
+            "expected_num": 10,
+            "space": "random_kill_clean",
+            "depends": [16]
+        },
+        {
+            "type": "WalkThroughAction",
+            "tag": "circle",
+            "col": "nextId",
+            "total_rows": 400000,
+            "depends": [17]
+        },
+        {
+            "type": "EmptyAction",
+            "name": "JoinNode",
+            "depends": [18]
+        },
+        {
+            "type": "StopAction",
+            "inst_index": 0,
+            "depends": [19]
+        },
+        {
+            "type": "StopAction",
+            "inst_index": 1,
+            "depends": [19]
+        },
+        {
+            "type": "StopAction",
+            "inst_index": 2,
+            "depends": [19]
+        },
+        {
+            "type": "StopAction",
+            "inst_index": 3,
+            "depends": [19]
+        },
+        {
+            "type": "StopAction",
+            "inst_index": 4,
+            "depends": [19]
+        }
+    ]
+}
diff --git a/src/nebula/NebulaAction.cpp b/src/nebula/NebulaAction.cpp
index e35e3e0..a50e130 100644
--- a/src/nebula/NebulaAction.cpp
+++ b/src/nebula/NebulaAction.cpp
@@ -46,6 +46,9 @@ ResultCode CrashAction::doRun() {
 ResultCode StartAction::doRun() {
     CHECK_NOTNULL(inst_);
     auto startCommand = inst_->startCommand();
+    if (loadFiu_) {
+        startCommand = "fiu-run -x " + startCommand;
+    }
     LOG(INFO) << startCommand << " on " << inst_->toString() << " as " << inst_->owner();
     auto ret = utils::SshHelper::run(
                 startCommand,
@@ -875,5 +878,74 @@ ResultCode RestoreFromDataDirAction::doRun() {
     }
     return ResultCode::OK;
 }
+
+// Pick a running storage at random and enable the fiu failure point on it.
+// Returns ERR_FAILED when the pid of the picked storage cannot be obtained.
+ResultCode RandomFiuAction::disturb() {
+    picked_ = Utils::randomInstance(storages_, NebulaInstance::State::RUNNING);
+    CHECK_NOTNULL(picked_);
+    auto pid = picked_->getPid();
+    if (!pid.hasValue()) {
+        LOG(ERROR) << "Failed to get pid of " << picked_->toString();
+        return ResultCode::ERR_FAILED;
+    }
+    pid_ = pid.value();
+    LOG(INFO) << "Begin to fiu injection of " << picked_->toString();
+    std::string inject;
+    if (probability_ < 1) {
+        inject = folly::stringPrintf("fiu-ctrl -c 'enable_random name=%s,probability=%lf' %d",
+                                     name_.c_str(), probability_, pid_);
+    } else {
+        inject = folly::stringPrintf("fiu-ctrl -c 'enable name=%s' %d", name_.c_str(), pid_);
+    }
+    LOG(INFO) << "Fiu cmd: " << inject << " on " << picked_->toString();
+
+    auto ret = utils::SshHelper::run(
+                inject,
+                picked_->getHost(),
+                [this] (const std::string& outMsg) {
+                    VLOG(1) << "The output is " << outMsg;
+                },
+                [] (const std::string& errMsg) {
+                    LOG(ERROR) << "The error is " << errMsg;
+                },
+                picked_->owner());
+    CHECK_EQ(0, ret.exitStatus());
+    return ResultCode::OK;
+}
+
+// Disable the failure point on the storage picked by disturb(), then start that
+// storage again. Returns ERR_FAILED if the storage cannot be started after 32 tries.
+ResultCode RandomFiuAction::recover() {
+    auto recover = folly::stringPrintf("fiu-ctrl -c 'disable name=%s' %d", name_.c_str(), pid_);
+    auto ret = utils::SshHelper::run(
+                recover,
+                picked_->getHost(),
+                [this] (const std::string& outMsg) {
+                    VLOG(1) << "The output is " << outMsg;
+                },
+                [] (const std::string& errMsg) {
+                    LOG(ERROR) << "The error is " << errMsg;
+                },
+                picked_->owner());
+    // The injected fault may already have crashed the storage, in which case fiu-ctrl
+    // fails. Don't abort the whole plan here, the restart below brings it back.
+    if (ret.exitStatus() != 0) {
+        LOG(WARNING) << "Failed to disable fiu injection on " << picked_->toString();
+    }
+
+    // storage could have been crashed because of fault injection, try reboot it.
+    StartAction start(picked_, true);
+    for (int32_t retry = 0; retry != 32; retry++) {
+        auto rc = start.doRun();
+        if (rc == ResultCode::OK) {
+            return rc;
+        }
+        LOG(ERROR) << "Reboot failed, retry " << retry;
+        sleep(retry);
+    }
+    return ResultCode::ERR_FAILED;
+}
+
 }  // namespace nebula
 }  // namespace nebula_chaos
diff --git a/src/nebula/NebulaAction.h b/src/nebula/NebulaAction.h
index 78dd520..6360d71 100644
--- a/src/nebula/NebulaAction.h
+++ b/src/nebula/NebulaAction.h
@@ -37,8 +37,10 @@ class CrashAction : public core::Action {
 
 class StartAction : public core::Action {
 public:
-    StartAction(NebulaInstance* inst)
-        : inst_(inst) {
+    StartAction(NebulaInstance* inst,
+                bool loadFiu = false)
+        : inst_(inst)
+        , loadFiu_(loadFiu) {
         VLOG(1) << "Construct StartAction for " << inst_->toString();
     }
 
@@ -54,6 +56,7 @@ class StartAction : public core::Action {
 
 private:
     NebulaInstance* inst_ = nullptr;
+    bool loadFiu_ = false;
 };
 
 class StopAction : public core::Action {
@@ -660,6 +663,52 @@ class RestoreFromDataDirAction : public core::Action {
     NebulaInstance* inst_;
     std::string srcDataPaths_;
 };
+
+/**
+ * Randomly pick one running storage and inject a libfiu fault into it.
+ * If probability is less than 1, the fault is injected like
+ * "fiu-ctrl -c 'enable_random name=posix/io/rw/write,probability=0.5' pid".
+ * Otherwise it is injected like "fiu-ctrl -c 'enable name=posix/io/rw/write' pid".
+ * On recover the fault is disabled and the storage is started again, since the
+ * injected fault may have crashed it.
+ * */
+class RandomFiuAction : public core::DisturbAction {
+public:
+    RandomFiuAction(const std::vector<NebulaInstance*>& storages,
+                    int32_t loopTimes,
+                    int32_t timeToDisurb,
+                    int32_t timeToRecover,
+                    const std::string& name,
+                    double probability = 1)
+        : DisturbAction(loopTimes, timeToDisurb, timeToRecover)
+        , storages_(storages)
+        , name_(name)
+        , probability_(probability) {
+        CHECK_GE(probability_, 0);
+        CHECK_LE(probability_, 1);
+    }
+
+    ~RandomFiuAction() = default;
+
+    std::string toString() const override {
+        return folly::stringPrintf("Fiu injection %s: loop %d", name_.c_str(), loopTimes_);
+    }
+
+private:
+    ResultCode disturb() override;
+    ResultCode recover() override;
+
+private:
+    std::vector<NebulaInstance*> storages_;
+    // Name of the libfiu failure point, e.g. posix/io/rw/write
+    std::string name_;
+    // Probability in [0, 1] that the failure point fires
+    double probability_;
+    // Storage picked by the latest disturb() and its pid at that time
+    NebulaInstance* picked_ = nullptr;
+    int32_t pid_ = -1;
+};
+
 }  // namespace nebula
 }  // namespace nebula_chaos
 
diff --git a/src/nebula/NebulaUtils.h b/src/nebula/NebulaUtils.h
index 30cb192..46d6a5a 100644
--- a/src/nebula/NebulaUtils.h
+++ b/src/nebula/NebulaUtils.h
@@ -51,7 +51,8 @@ class Utils {
             auto instIndex = obj.at("inst_index").asInt();
             CHECK_GE(instIndex, 0);
             CHECK_LT(instIndex, ctx.insts.size());
-            return std::make_unique<StartAction>(ctx.insts[instIndex]);
+            auto loadFiu = obj.getDefault("load_fiu", false).asBool();
+            return std::make_unique<StartAction>(ctx.insts[instIndex], loadFiu);
         } else if (type == "StopAction") {
             auto instIndex = obj.at("inst_index").asInt();
             CHECK_GE(instIndex, 0);
@@ -318,6 +319,27 @@ class Utils {
             return std::make_unique(&ctx.planCtx->actionCtx,
                                     varName,
                                     valExpr);
+        } else if (type == "RandomFiuAction") {
+            // "storages" holds indexes into ctx.insts, validate them like inst_index above
+            auto storageIdxs = obj.at("storages");
+            std::vector<NebulaInstance*> storages;
+            for (auto iter = storageIdxs.begin(); iter != storageIdxs.end(); iter++) {
+                auto index = iter->asInt();
+                CHECK_GE(index, 0);
+                CHECK_LT(index, ctx.insts.size());
+                storages.emplace_back(ctx.insts[index]);
+            }
+            auto loopTimes = obj.getDefault("loop_times", 1).asInt();
+            auto nextDisturbInterval = obj.getDefault("next_loop_interval", 30).asInt();
+            auto recoverInterval = obj.getDefault("restart_interval", 30).asInt();
+            auto name = obj.at("name").asString();
+            auto probability = obj.getDefault("probability", 1).asDouble();
+            return std::make_unique<RandomFiuAction>(storages,
+                                                     loopTimes,
+                                                     nextDisturbInterval,
+                                                     recoverInterval,
+                                                     name,
+                                                     probability);
         }
         LOG(FATAL) << "Unknown type " << type;
         return nullptr;