-
Notifications
You must be signed in to change notification settings - Fork 979
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Clean work abort #1729
Clean work abort #1729
Changes from 1 commit
ff5e3ec
b76e667
815da92
a1ee684
51e5c16
9fa2102
0f909e6
84e5c0f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -75,6 +75,8 @@ Work::getStatus() const | |
case WORK_FAILURE_RAISE: | ||
case WORK_FAILURE_FATAL: | ||
return fmt::format("Failed: {:s}", getUniqueName()); | ||
case WORK_FAILURE_ABORTED: | ||
return fmt::format("Aborted: {:s}", getUniqueName()); | ||
default: | ||
assert(false); | ||
return ""; | ||
|
@@ -121,6 +123,8 @@ Work::stateName(State st) | |
return "WORK_FAILURE_RAISE"; | ||
case WORK_FAILURE_FATAL: | ||
return "WORK_FAILURE_FATAL"; | ||
case WORK_FAILURE_ABORTED: | ||
return "WORK_FAILURE_ABORTED"; | ||
default: | ||
throw std::runtime_error("Unknown Work::State"); | ||
} | ||
|
@@ -141,10 +145,22 @@ Work::callComplete() | |
}; | ||
} | ||
|
||
// Schedule completion of this work with a failure-type result. | ||
// Only WORK_COMPLETE_FATAL, WORK_COMPLETE_FAILURE and WORK_COMPLETE_ABORTED | ||
// are accepted; any other result is logged as an error and ignored. | ||
void | ||
Work::scheduleAbort(CompleteResult result) | ||
{ | ||
// Reject any result that is not one of the three failure/abort results. | ||
if (result != WORK_COMPLETE_FATAL && result != WORK_COMPLETE_FAILURE && | ||
result != WORK_COMPLETE_ABORTED) | ||
{ | ||
CLOG(ERROR, "Work") << "Cannot schedule abort with non-failure state"; | ||
return; | ||
} | ||
// Accepted result: hand it to scheduleComplete unchanged. | ||
scheduleComplete(result); | ||
} | ||
|
||
void | ||
Work::scheduleRun() | ||
{ | ||
if (mScheduled) | ||
if (mScheduled || mAborting) | ||
{ | ||
return; | ||
} | ||
|
@@ -183,14 +199,15 @@ Work::scheduleComplete(CompleteResult result) | |
return; | ||
} | ||
self->mScheduled = false; | ||
self->mAborting = false; | ||
self->complete(result); | ||
}); | ||
} | ||
|
||
void | ||
Work::scheduleRetry() | ||
{ | ||
if (mScheduled) | ||
if (mScheduled || mAborting) | ||
{ | ||
return; | ||
} | ||
|
@@ -249,25 +266,45 @@ Work::advance() | |
} | ||
|
||
CLOG(DEBUG, "Work") << "advancing " << getUniqueName(); | ||
advanceChildren(); | ||
if (allChildrenSuccessful()) | ||
|
||
// If necessary, propagate abort signal before advancing children | ||
// This is to prevent scheduling any children to run if they are about | ||
// to be in WORK_ABORTING state (such children are scheduled to abort | ||
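There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. there is no […] [inline code lost in extraction — presumably `WORK_ABORTING`, the state named in the code comment above, which the State enum shown in this diff does not define] |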
||
// properly instead) | ||
if (anyChildFatalFailure()) | ||
{ | ||
CLOG(DEBUG, "Work") << "all " << mChildren.size() << " children of " | ||
<< getUniqueName() << " successful, scheduling run"; | ||
scheduleRun(); | ||
CLOG(DEBUG, "Work") | ||
<< "some of " << mChildren.size() << " children of " | ||
<< getUniqueName() << " FATALLY failed, propagating " | ||
<< "abort"; | ||
abort(WORK_COMPLETE_FATAL); | ||
} | ||
else if (anyChildFatalFailure()) | ||
else if (anyChildRaiseFailure()) | ||
{ | ||
CLOG(DEBUG, "Work") << "some of " << mChildren.size() << " children of " | ||
<< getUniqueName() << " fatally failed, scheduling " | ||
<< "fatal failure"; | ||
scheduleFatalFailure(); | ||
<< getUniqueName() << " failed, propagating " | ||
<< "abort"; | ||
abort(WORK_COMPLETE_FAILURE); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. this is basically 'abort all children and then retry'; I'm not sure the naming is good here |
||
} | ||
else if (anyChildRaiseFailure()) | ||
else if (anyChildAborted()) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. not sure I understand: why would a child aborting cause parents to abort as well? |
||
{ | ||
CLOG(DEBUG, "Work") << "some of " << mChildren.size() << " children of " | ||
<< getUniqueName() << " failed, scheduling failure"; | ||
scheduleFailure(); | ||
<< getUniqueName() << " aborted, propagating " | ||
<< "abort"; | ||
abort(WORK_COMPLETE_ABORTED); | ||
} | ||
|
||
advanceChildren(); | ||
if (allChildrenSuccessful()) | ||
{ | ||
if (mAborting) | ||
{ | ||
scheduleAbort(WORK_COMPLETE_ABORTED); | ||
} | ||
else | ||
{ | ||
scheduleRun(); | ||
} | ||
} | ||
} | ||
|
||
|
@@ -276,6 +313,15 @@ Work::run() | |
{ | ||
if (getState() == WORK_PENDING) | ||
{ | ||
if (mAborting) | ||
{ | ||
CLOG(DEBUG, "Work") << "aborting " << getUniqueName(); | ||
mApp.getMetrics() | ||
.NewMeter({"work", "unit", "abort"}, "unit") | ||
.Mark(); | ||
onAbort(); | ||
return; | ||
} | ||
CLOG(DEBUG, "Work") << "starting " << getUniqueName(); | ||
mApp.getMetrics().NewMeter({"work", "unit", "start"}, "unit").Mark(); | ||
onStart(); | ||
|
@@ -294,6 +340,8 @@ Work::complete(CompleteResult result) | |
mApp.getMetrics().NewMeter({"work", "unit", "success"}, "unit"); | ||
auto& fail = | ||
mApp.getMetrics().NewMeter({"work", "unit", "failure"}, "unit"); | ||
auto& aborted = | ||
mApp.getMetrics().NewMeter({"work", "unit", "abort"}, "unit"); | ||
|
||
switch (result) | ||
{ | ||
|
@@ -306,6 +354,9 @@ Work::complete(CompleteResult result) | |
case WORK_COMPLETE_FATAL: | ||
setState(WORK_FAILURE_FATAL); | ||
break; | ||
case WORK_COMPLETE_ABORTED: | ||
setState(WORK_FAILURE_ABORTED); | ||
break; | ||
} | ||
|
||
switch (getState()) | ||
|
@@ -331,6 +382,13 @@ Work::complete(CompleteResult result) | |
notifyParent(); | ||
break; | ||
|
||
case WORK_FAILURE_ABORTED: | ||
aborted.Mark(); | ||
CLOG(DEBUG, "Work") | ||
<< "notifying parent of completed abort " << getUniqueName(); | ||
notifyParent(); | ||
break; | ||
|
||
case WORK_PENDING: | ||
succ.Mark(); | ||
advance(); | ||
|
@@ -379,6 +437,12 @@ Work::onFailureRaise() | |
{ | ||
} | ||
|
||
// Abort hook, called from Work::run() when the work is in WORK_PENDING and | ||
// mAborting is set. This implementation does no cleanup of its own: it only | ||
// schedules completion with WORK_COMPLETE_ABORTED via scheduleAbort. | ||
void | ||
Work::onAbort() | ||
{ | ||
scheduleAbort(WORK_COMPLETE_ABORTED); | ||
} | ||
|
||
Work::State | ||
Work::getState() const | ||
{ | ||
|
@@ -389,7 +453,7 @@ bool | |
Work::isDone() const | ||
{ | ||
return mState == WORK_SUCCESS || mState == WORK_FAILURE_RAISE || | ||
mState == WORK_FAILURE_FATAL; | ||
mState == WORK_FAILURE_FATAL || mState == WORK_FAILURE_ABORTED; | ||
} | ||
|
||
void | ||
|
@@ -434,4 +498,43 @@ Work::notify(std::string const& child) | |
<< " of completed child " << child; | ||
advance(); | ||
} | ||
|
||
void | ||
Work::abort(CompleteResult result) | ||
{ | ||
// When `abort` signal is issued, pending work is in either | ||
// one of two states: | ||
// 1. It hasn't been scheduled to run yet. If some children are still | ||
// running, this is handled in advance where work is scheduled to abort. | ||
// Otherwise, work is scheduled to abort right away. | ||
// 2. Work is already in IO service queue, but hasn't started running yet. | ||
// This scenario is handled in `run` method, where abort is scheduled | ||
// instead of success. | ||
|
||
assert(getState() == WORK_PENDING); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. error handling is wrong: I would expect an exception to be thrown if abort is called at the wrong time. That said, I am not sure there should ever be a bad time to call abort: if Work is already complete or aborting, can it safely return (no-op)? |
||
mAborting = true; | ||
bool allDone = true; | ||
|
||
for (auto const& c : mChildren) | ||
{ | ||
if (!c.second->isDone()) | ||
{ | ||
allDone = false; | ||
} | ||
|
||
// Only abort when work is pending. Wait if it's running, as it will be | ||
// handled in `advance`. If work has finished with success or fail, | ||
// nothing to abort either | ||
if (c.second->getState() == Work::WORK_PENDING) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. why isn't the logic simply:
    if (!c.second->isDone())
    {
        allDone = false;
    }
    else
    {
        c.second->abort();
    } |
||
{ | ||
c.second->abort(); | ||
} | ||
} | ||
|
||
if (allDone) | ||
{ | ||
// Children are ready, schedule abort for work itself. | ||
scheduleAbort(result); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It seems it would be better to just […] [remainder of comment, an inline code reference, lost in extraction] |
||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -47,14 +47,16 @@ class Work : public WorkParent | |
WORK_SUCCESS, | ||
WORK_FAILURE_RETRY, | ||
WORK_FAILURE_RAISE, | ||
WORK_FAILURE_FATAL | ||
WORK_FAILURE_FATAL, | ||
WORK_FAILURE_ABORTED | ||
}; | ||
|
||
enum CompleteResult | ||
{ | ||
WORK_COMPLETE_OK, | ||
WORK_COMPLETE_FAILURE, | ||
WORK_COMPLETE_FATAL | ||
WORK_COMPLETE_FATAL, | ||
WORK_COMPLETE_ABORTED | ||
}; | ||
|
||
Work(Application& app, WorkParent& parent, std::string uniqueName, | ||
|
@@ -78,6 +80,7 @@ class Work : public WorkParent | |
virtual void onRun(); | ||
virtual void onFailureRetry(); | ||
virtual void onFailureRaise(); | ||
virtual void onAbort(); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. you need to add documentation on abort semantics |
||
|
||
// onSuccess is a little different than the others: it's called on | ||
// WORK_SUCCESS, but it also returns the next state desired: if you want | ||
|
@@ -94,6 +97,7 @@ class Work : public WorkParent | |
static std::string stateName(State st); | ||
State getState() const; | ||
bool isDone() const; | ||
void abort(CompleteResult result = WORK_COMPLETE_ABORTED); | ||
void advance(); | ||
void reset(); | ||
|
||
|
@@ -104,6 +108,7 @@ class Work : public WorkParent | |
size_t mRetries{0}; | ||
State mState{WORK_PENDING}; | ||
bool mScheduled{false}; | ||
bool mAborting{false}; | ||
|
||
std::unique_ptr<VirtualTimer> mRetryTimer; | ||
|
||
|
@@ -129,6 +134,7 @@ class Work : public WorkParent | |
scheduleComplete(WORK_COMPLETE_FATAL); | ||
} | ||
|
||
void scheduleAbort(CompleteResult result = WORK_COMPLETE_ABORTED); | ||
void setState(State s); | ||
|
||
void notifyParent(); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -61,6 +61,13 @@ WorkManagerImpl::notify(std::string const& child) | |
mApp.getMetrics().NewMeter({"work", "root", "failure"}, "unit").Mark(); | ||
mChildren.erase(child); | ||
} | ||
else if (i->second->getState() == Work::WORK_FAILURE_ABORTED) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Before reviewing this PR, I opened #1755, as I thought the semantics were already not super clean and were error-prone. Now that we have abort(ing), we really need to formalize what is going on; otherwise we're going to run into very strange bugs. Also, the semantics implied here from […] [inline code and the rest of this sentence lost in extraction]. I would recommend going back to basics: describe a state machine, its transitions, and when certain callbacks ([…]) [callback names and the rest of this sentence lost in extraction]. The two ways to abort are: [list lost in extraction]
|
||
{ | ||
CLOG(WARNING, "Work") | ||
<< "WorkManager got FAILURE_ABORTED from " << child; | ||
mApp.getMetrics().NewMeter({"work", "root", "abort"}, "unit").Mark(); | ||
mChildren.erase(child); | ||
} | ||
advanceChildren(); | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is strange: I would expect `scheduleAbort` to just schedule a call to `abort`