Skip to content
This repository has been archived by the owner on Dec 12, 2022. It is now read-only.

support fiu injection #16

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ Start all services, disturb (cat /dev/zero until disk is full) while write a cir
#### [random_slow_disk](conf/random_slow_disk.json)
Start all services, disturb (simulate slow disk I/O) while writing a circle, then check data integrity. We use [SystemTap](https://sourceware.org/systemtap/wiki) to simulate slow disk I/O. The `major` and `minor` fields are the MAJOR/MINOR device IDs of the disk where the storage service's data path is mounted.

#### [random_fiu_injection](conf/random_fiu_injection.json)
Start all services, inject faults with [libfiu](https://blitiri.com.ar/p/libfiu/) while writing a circle, then check data integrity.
The following injection points can make storage crash: `posix/io/oc/open`, `posix/io/oc/close`, `posix/io/rw/write`, `posix/io/sync/fsync`.
> Be sure to compile nebula with the flag `-DENABLE_JEMALLOC=OFF`; otherwise libfiu cannot be preloaded.

```
yum install systemtap
```
Expand Down
186 changes: 186 additions & 0 deletions conf/random_fiu_injection.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
{
"name": "Nebula random-kill-clean plan",
"concurrency": 10,
"rolling_table": true,
"instances": [
{
"host":"",
"install_dir":"",
"conf_dir":"",
"type": "graphd",
"user": ""
},
{
"host":"",
"install_dir":"",
"conf_dir":"",
"type": "metad",
"user": ""
},
{
"host":"",
"install_dir":"",
"conf_dir":"",
"type": "storaged",
"user": ""
},
{
"host":"",
"install_dir":"",
"conf_dir":"",
"type": "storaged",
"user": ""
},
{
"host":"",
"install_dir":"",
"conf_dir":"",
"type": "storaged",
"user": ""
}
],
"actions" : [
{
"type": "StartAction",
"inst_index": 0,
"depends": [2, 3, 4]
},
{
"type": "StartAction",
"inst_index": 1,
"depends": []
},
{
"type": "StartAction",
"inst_index": 2,
"load_fiu": true,
"depends": [1]
},
{
"type": "StartAction",
"inst_index": 3,
"load_fiu": true,
"depends": [1]
},
{
"type": "StartAction",
"inst_index": 4,
"load_fiu": true,
"depends": [1]
},
{
"type": "WaitAction",
"wait_time_ms": 15000,
"depends": [0]
},
{
"type": "ClientConnectAction",
"depends": [5]
},
{
"type": "CreateSpaceAction",
"space_name": "random_kill_clean",
"replica": 3,
"parts": 10,
"depends": [6]
},
{
"type": "UseSpaceAction",
"space_name": "random_kill_clean",
"depends": [7]
},
{
"type": "CreateSchemaAction",
"name": "circle",
"props": [
{"name": "nextId", "type": "int"}
],
"edge_or_tag": false,
"depends": [8]
},
{
"type": "WaitAction",
"wait_time_ms": 10000,
"depends": [9]
},
{
"type": "BalanceLeaderAction",
"depends": [10]
},
{
"type": "CheckLeadersAction",
"expected_num": 10,
"space": "random_kill_clean",
"depends": [11]
},
{
"type": "RandomFiuAction",
"storages": [2,3,4],
"loop_times": 3,
"restart_interval": 60,
"next_loop_interval": 60,
"name": "posix/io/rw/write",
"probability": 1,
"depends": [12]
},
{
"type": "WriteCircleAction",
"tag": "circle",
"col": "nextId",
"total_rows": 400000,
"depends": [12]
},
{
"type": "WaitAction",
"wait_time_ms": 30000,
"depends": [13, 14]
},
{
"type": "BalanceLeaderAction",
"depends": [15]
},
{
"type": "CheckLeadersAction",
"expected_num": 10,
"space": "random_kill_clean",
"depends": [16]
},
{
"type": "WalkThroughAction",
"tag": "circle",
"col": "nextId",
"total_rows": 400000,
"depends": [17]
},
{
"type": "EmptyAction",
"name": "JoinNode",
"depends": [18]
},
{
"type": "StopAction",
"inst_index": 0,
"depends": [19]
},
{
"type": "StopAction",
"inst_index": 1,
"depends": [19]
},
{
"type": "StopAction",
"inst_index": 2,
"depends": [19]
},
{
"type": "StopAction",
"inst_index": 3,
"depends": [19]
},
{
"type": "StopAction",
"inst_index": 4,
"depends": [19]
}
]
}
64 changes: 64 additions & 0 deletions src/nebula/NebulaAction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ ResultCode CrashAction::doRun() {
ResultCode StartAction::doRun() {
CHECK_NOTNULL(inst_);
auto startCommand = inst_->startCommand();
if (loadFiu_) {
startCommand = "fiu-run -x " + startCommand;
}
LOG(INFO) << startCommand << " on " << inst_->toString() << " as " << inst_->owner();
auto ret = utils::SshHelper::run(
startCommand,
Expand Down Expand Up @@ -875,5 +878,66 @@ ResultCode RestoreFromDataDirAction::doRun() {
}
return ResultCode::OK;
}

/**
 * Inject a libfiu fault into one randomly chosen storage instance.
 *
 * Remembers the chosen instance in picked_ and its process id in pid_, then runs a
 * "fiu-ctrl" command on the instance's host through utils::SshHelper::run.
 *
 * @return ResultCode::ERR_FAILED if the pid of the picked instance is unavailable,
 *         ResultCode::OK after the fiu-ctrl command exited with status 0.
 *         A null pick or a non-zero exit status trips a CHECK instead of returning.
 */
ResultCode RandomFiuAction::disturb() {
// Candidates are requested with State::RUNNING; presumably only running storages are returned.
picked_ = Utils::randomInstance(storages_, NebulaInstance::State::RUNNING);
CHECK_NOTNULL(picked_);
auto pid = picked_->getPid();
if (!pid.hasValue()) {
LOG(ERROR) << "Failed to get pid of " << picked_->toString();
return ResultCode::ERR_FAILED;
}
// Keep the pid so that recover() can address the same process later.
pid_ = pid.value();
LOG(INFO) << "Begin to fiu injection of " << picked_->toString();
std::string inject;
// probability_ < 1: enable the fault point with the given probability ("enable_random");
// otherwise enable it unconditionally ("enable").
if (probability_ < 1) {
inject = folly::stringPrintf("fiu-ctrl -c 'enable_random name=%s,probability=%lf' %d",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Where is fiu-ctrl? Install it by myself?

name_.c_str(), probability_, pid_);
} else {
inject = folly::stringPrintf("fiu-ctrl -c 'enable name=%s' %d", name_.c_str(), pid_);
}
LOG(INFO) << "Fiu cmd: " << inject << " on " << picked_->toString();

// Run the command on the picked instance's host as the instance owner; the two callbacks
// only log the command's output and error text.
auto ret = utils::SshHelper::run(
inject,
picked_->getHost(),
[this] (const std::string& outMsg) {
VLOG(1) << "The output is " << outMsg;
},
[] (const std::string& errMsg) {
LOG(ERROR) << "The error is " << errMsg;
},
picked_->owner());
// A failed injection is treated as fatal for the whole run.
CHECK_EQ(0, ret.exitStatus());
return ResultCode::OK;
}

/**
 * Undo the fault injected by disturb() and bring the picked storage back up.
 *
 * Runs "fiu-ctrl -c 'disable name=<name_>' <pid_>" on the picked instance's host, then
 * starts the instance again (with fiu loading enabled), retrying up to 32 times with a
 * growing pause between attempts.
 *
 * @return ResultCode::OK as soon as one start attempt succeeds,
 *         ResultCode::ERR_FAILED when every attempt failed.
 */
ResultCode RandomFiuAction::recover() {
    auto disableCmd = folly::stringPrintf("fiu-ctrl -c 'disable name=%s' %d",
                                          name_.c_str(),
                                          pid_);
    auto ret = utils::SshHelper::run(
        disableCmd,
        picked_->getHost(),
        [] (const std::string& outMsg) {
            VLOG(1) << "The output is " << outMsg;
        },
        [] (const std::string& errMsg) {
            LOG(ERROR) << "The error is " << errMsg;
        },
        picked_->owner());
    if (ret.exitStatus() != 0) {
        // Do not abort here: the injected fault may already have crashed the storage, in
        // which case fiu-ctrl presumably cannot reach the process any more. Aborting would
        // skip the reboot below, which is exactly what is needed in that situation.
        LOG(WARNING) << "Failed to disable fiu injection " << name_ << " on "
                     << picked_->toString() << ", exit status " << ret.exitStatus()
                     << ", the process may have crashed";
    }

    // storage could have been crashed because of fault injection, try reboot it.
    constexpr int32_t kMaxRetry = 32;
    StartAction start(picked_, true);
    for (int32_t retry = 0; retry != kMaxRetry; retry++) {
        auto rc = start.doRun();
        if (rc == ResultCode::OK) {
            return rc;
        }
        LOG(ERROR) << "Reboot failed, retry " << retry;
        // Linear back-off: wait `retry` seconds before the next attempt.
        sleep(retry);
    }
    return ResultCode::ERR_FAILED;
}

} // namespace nebula
} // namespace nebula_chaos
47 changes: 45 additions & 2 deletions src/nebula/NebulaAction.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,10 @@ class CrashAction : public core::Action {

class StartAction : public core::Action {
public:
StartAction(NebulaInstance* inst)
: inst_(inst) {
StartAction(NebulaInstance* inst,
bool loadFiu = false)
: inst_(inst)
, loadFiu_(loadFiu) {
VLOG(1) << "Construct StartAction for " << inst_->toString();
}

Expand All @@ -54,6 +56,7 @@ class StartAction : public core::Action {

private:
NebulaInstance* inst_ = nullptr;
bool loadFiu_ = false;
};

class StopAction : public core::Action {
Expand Down Expand Up @@ -660,6 +663,46 @@ class RestoreFromDataDirAction : public core::Action {
NebulaInstance* inst_;
std::string srcDataPaths_;
};

/**
 * Randomly pick one storage and inject a libfiu fault into it.
 *
 * If probability is less than 1, the fault is injected like
 *   "fiu-ctrl -c 'enable_random name=posix/io/rw/write,probability=0.5' pid".
 * Otherwise it is injected like
 *   "fiu-ctrl -c 'enable name=posix/io/rw/write' pid".
 * recover() disables the fault again and tries to restart the picked storage.
 */
class RandomFiuAction : public core::DisturbAction {
public:
    /**
     * @param storages      candidate storage instances to pick from
     * @param loopTimes     forwarded to core::DisturbAction
     * @param timeToDisurb  forwarded to core::DisturbAction
     * @param timeToRecover forwarded to core::DisturbAction
     * @param name          name of the fault point, e.g. "posix/io/rw/write"
     * @param probability   must be within [0, 1]; checked with CHECK_GE / CHECK_LE
     */
    RandomFiuAction(const std::vector<NebulaInstance*>& storages,
                    int32_t loopTimes,
                    int32_t timeToDisurb,
                    int32_t timeToRecover,
                    const std::string& name,
                    double probability = 1)
        : DisturbAction(loopTimes, timeToDisurb, timeToRecover)
        , storages_(storages)
        , name_(name)
        , probability_(probability) {
        CHECK_GE(probability_, 0);
        CHECK_LE(probability_, 1);
    }

    ~RandomFiuAction() = default;

    std::string toString() const override {
        return folly::stringPrintf("Fiu injection %s: loop %d", name_.c_str(), loopTimes_);
    }

private:
    ResultCode disturb() override;
    ResultCode recover() override;

private:
    std::vector<NebulaInstance*> storages_;
    std::string name_;
    double probability_;
    // Set by disturb(); initialized here so they never hold indeterminate values.
    NebulaInstance* picked_ = nullptr;
    int32_t pid_ = -1;
};

} // namespace nebula
} // namespace nebula_chaos

Expand Down
21 changes: 20 additions & 1 deletion src/nebula/NebulaUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ class Utils {
auto instIndex = obj.at("inst_index").asInt();
CHECK_GE(instIndex, 0);
CHECK_LT(instIndex, ctx.insts.size());
return std::make_unique<StartAction>(ctx.insts[instIndex]);
auto loadFiu = obj.getDefault("load_fiu", false).asBool();
return std::make_unique<StartAction>(ctx.insts[instIndex], loadFiu);
} else if (type == "StopAction") {
auto instIndex = obj.at("inst_index").asInt();
CHECK_GE(instIndex, 0);
Expand Down Expand Up @@ -318,6 +319,24 @@ class Utils {
return std::make_unique<core::AssignAction>(&ctx.planCtx->actionCtx,
varName,
valExpr);
} else if (type == "RandomFiuAction") {
auto storageIdxs = obj.at("storages");
std::vector<NebulaInstance*> storages;
for (auto iter = storageIdxs.begin(); iter != storageIdxs.end(); iter++) {
auto index = iter->asInt();
storages.emplace_back(ctx.insts[index]);
}
auto loopTimes = obj.getDefault("loop_times", 1).asInt();
auto nextDistubInterval = obj.getDefault("next_loop_interval", 30).asInt();
auto recoverInterval = obj.getDefault("restart_interval", 30).asInt();
auto name = obj.at("name").asString();
auto probability = obj.getDefault("probability", 1).asDouble();
return std::make_unique<RandomFiuAction>(storages,
loopTimes,
nextDistubInterval,
recoverInterval,
name,
probability);
}
LOG(FATAL) << "Unknown type " << type;
return nullptr;
Expand Down