Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Clean work abort #1729

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
133 changes: 118 additions & 15 deletions src/work/Work.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ Work::getStatus() const
case WORK_FAILURE_RAISE:
case WORK_FAILURE_FATAL:
return fmt::format("Failed: {:s}", getUniqueName());
case WORK_FAILURE_ABORTED:
return fmt::format("Aborted: {:s}", getUniqueName());
default:
assert(false);
return "";
Expand Down Expand Up @@ -121,6 +123,8 @@ Work::stateName(State st)
return "WORK_FAILURE_RAISE";
case WORK_FAILURE_FATAL:
return "WORK_FAILURE_FATAL";
case WORK_FAILURE_ABORTED:
return "WORK_FAILURE_ABORTED";
default:
throw std::runtime_error("Unknown Work::State");
}
Expand All @@ -141,10 +145,22 @@ Work::callComplete()
};
}

void
Work::scheduleAbort(CompleteResult result)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is strange: I would expect scheduleAbort to just schedule a call to abort

{
if (result != WORK_COMPLETE_FATAL && result != WORK_COMPLETE_FAILURE &&
result != WORK_COMPLETE_ABORTED)
{
CLOG(ERROR, "Work") << "Cannot schedule abort with non-failure state";
return;
}
scheduleComplete(result);
}

void
Work::scheduleRun()
{
if (mScheduled)
if (mScheduled || mAborting)
{
return;
}
Expand Down Expand Up @@ -183,14 +199,15 @@ Work::scheduleComplete(CompleteResult result)
return;
}
self->mScheduled = false;
self->mAborting = false;
self->complete(result);
});
}

void
Work::scheduleRetry()
{
if (mScheduled)
if (mScheduled || mAborting)
{
return;
}
Expand Down Expand Up @@ -249,25 +266,45 @@ Work::advance()
}

CLOG(DEBUG, "Work") << "advancing " << getUniqueName();
advanceChildren();
if (allChildrenSuccessful())

// If necessary, propagate abort signal before advancing children
// This is to prevent scheduling any children to run if they are about
// to be in WORK_ABORTING state (such children are scheduled to abort
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

there is no WORK_ABORTING state

// properly instead)
if (anyChildFatalFailure())
{
CLOG(DEBUG, "Work") << "all " << mChildren.size() << " children of "
<< getUniqueName() << " successful, scheduling run";
scheduleRun();
CLOG(DEBUG, "Work")
<< "some of " << mChildren.size() << " children of "
<< getUniqueName() << " FATALLY failed, propagating "
<< "abort";
abort(WORK_COMPLETE_FATAL);
}
else if (anyChildFatalFailure())
else if (anyChildRaiseFailure())
{
CLOG(DEBUG, "Work") << "some of " << mChildren.size() << " children of "
<< getUniqueName() << " fatally failed, scheduling "
<< "fatal failure";
scheduleFatalFailure();
<< getUniqueName() << " failed, propagating "
<< "abort";
abort(WORK_COMPLETE_FAILURE);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is basically 'abort all children and then retry', I'm not sure if naming is good here

}
else if (anyChildRaiseFailure())
else if (anyChildAborted())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

not sure I understand: why would a child aborting cause parents to abort as well?

{
CLOG(DEBUG, "Work") << "some of " << mChildren.size() << " children of "
<< getUniqueName() << " failed, scheduling failure";
scheduleFailure();
<< getUniqueName() << " aborted, propagating "
<< "abort";
abort(WORK_COMPLETE_ABORTED);
}

advanceChildren();
if (allChildrenSuccessful())
{
if (mAborting)
{
scheduleAbort(WORK_COMPLETE_ABORTED);
}
else
{
scheduleRun();
}
}
}

Expand All @@ -276,6 +313,15 @@ Work::run()
{
if (getState() == WORK_PENDING)
{
if (mAborting)
{
CLOG(DEBUG, "Work") << "aborting " << getUniqueName();
mApp.getMetrics()
.NewMeter({"work", "unit", "abort"}, "unit")
.Mark();
onAbort();
return;
}
CLOG(DEBUG, "Work") << "starting " << getUniqueName();
mApp.getMetrics().NewMeter({"work", "unit", "start"}, "unit").Mark();
onStart();
Expand All @@ -294,6 +340,8 @@ Work::complete(CompleteResult result)
mApp.getMetrics().NewMeter({"work", "unit", "success"}, "unit");
auto& fail =
mApp.getMetrics().NewMeter({"work", "unit", "failure"}, "unit");
auto& aborted =
mApp.getMetrics().NewMeter({"work", "unit", "abort"}, "unit");

switch (result)
{
Expand All @@ -306,6 +354,9 @@ Work::complete(CompleteResult result)
case WORK_COMPLETE_FATAL:
setState(WORK_FAILURE_FATAL);
break;
case WORK_COMPLETE_ABORTED:
setState(WORK_FAILURE_ABORTED);
break;
}

switch (getState())
Expand All @@ -331,6 +382,13 @@ Work::complete(CompleteResult result)
notifyParent();
break;

case WORK_FAILURE_ABORTED:
aborted.Mark();
CLOG(DEBUG, "Work")
<< "notifying parent of completed abort " << getUniqueName();
notifyParent();
break;

case WORK_PENDING:
succ.Mark();
advance();
Expand Down Expand Up @@ -379,6 +437,12 @@ Work::onFailureRaise()
{
}

void
Work::onAbort()
{
scheduleAbort(WORK_COMPLETE_ABORTED);
}

Work::State
Work::getState() const
{
Expand All @@ -389,7 +453,7 @@ bool
Work::isDone() const
{
return mState == WORK_SUCCESS || mState == WORK_FAILURE_RAISE ||
mState == WORK_FAILURE_FATAL;
mState == WORK_FAILURE_FATAL || mState == WORK_FAILURE_ABORTED;
}

void
Expand Down Expand Up @@ -434,4 +498,43 @@ Work::notify(std::string const& child)
<< " of completed child " << child;
advance();
}

void
Work::abort(CompleteResult result)
{
// When `abort` signal is issued, pending work is in either
// one of two states:
// 1. It hasn't been scheduled to run yet. If some children are still
// running, this is handled in advance where work is scheduled to abort.
// Otherwise, work is scheduled to abort right away.
// 2. Work is already in IO service queue, but hasn't started running yet.
// This scenario is handled in `run` method, where abort is scheduled
// instead of success.

assert(getState() == WORK_PENDING);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

error handling is wrong: I would expect an exception to be thrown if abort is called at the wrong time.

That said: I am not sure there should ever be a bad time to call abort, if Work is already complete or aborting, it can safely return (no-op)?

mAborting = true;
bool allDone = true;

for (auto const& c : mChildren)
{
if (!c.second->isDone())
{
allDone = false;
}

// Only abort when work is pending. Wait if it's running, as it will be
// handled in `advance`. If work has finished with success or fail,
// nothing to abort either
if (c.second->getState() == Work::WORK_PENDING)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why isn't the logic simply

if (!c.second->isDone())
{
    allDone = false;
}
else
{
     c.second->abort();
}

{
c.second->abort();
}
}

if (allDone)
{
// Children are ready, schedule abort for work itself.
scheduleAbort(result);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems it would be better to just scheduleRun here

}
}
}
10 changes: 8 additions & 2 deletions src/work/Work.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,14 +47,16 @@ class Work : public WorkParent
WORK_SUCCESS,
WORK_FAILURE_RETRY,
WORK_FAILURE_RAISE,
WORK_FAILURE_FATAL
WORK_FAILURE_FATAL,
WORK_FAILURE_ABORTED
};

enum CompleteResult
{
WORK_COMPLETE_OK,
WORK_COMPLETE_FAILURE,
WORK_COMPLETE_FATAL
WORK_COMPLETE_FATAL,
WORK_COMPLETE_ABORTED
};

Work(Application& app, WorkParent& parent, std::string uniqueName,
Expand All @@ -78,6 +80,7 @@ class Work : public WorkParent
virtual void onRun();
virtual void onFailureRetry();
virtual void onFailureRaise();
virtual void onAbort();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you need to add documentation on abort semantics


// onSuccess is a little different than the others: it's called on
// WORK_SUCCESS, but it also returns the next sate desired: if you want
Expand All @@ -94,6 +97,7 @@ class Work : public WorkParent
static std::string stateName(State st);
State getState() const;
bool isDone() const;
void abort(CompleteResult result = WORK_COMPLETE_ABORTED);
void advance();
void reset();

Expand All @@ -104,6 +108,7 @@ class Work : public WorkParent
size_t mRetries{0};
State mState{WORK_PENDING};
bool mScheduled{false};
bool mAborting{false};

std::unique_ptr<VirtualTimer> mRetryTimer;

Expand All @@ -129,6 +134,7 @@ class Work : public WorkParent
scheduleComplete(WORK_COMPLETE_FATAL);
}

void scheduleAbort(CompleteResult result = WORK_COMPLETE_ABORTED);
void setState(State s);

void notifyParent();
Expand Down
7 changes: 7 additions & 0 deletions src/work/WorkManagerImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,13 @@ WorkManagerImpl::notify(std::string const& child)
mApp.getMetrics().NewMeter({"work", "root", "failure"}, "unit").Mark();
mChildren.erase(child);
}
else if (i->second->getState() == Work::WORK_FAILURE_ABORTED)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Before reviewing this PR, I opened #1755 as I thought semantics were already not super clean and error prone, now that we have abort(ing), we really need to formalize well what is going on, otherwise we're going to run into very strange bugs.

Also, the semantics implied here from onAbort don't really seem to follow what was described in #1706 (or at least it's unclear that it can work if onAbort only triggers some work).

I would recommend going back to basics: describe a state machine, its transitions and when certain callbacks (onXYZ) get called. Right now the mAborting flag makes it hard to tell which state transitions are valid vs invalid (and what is supposed to happen).

The two ways to abort are:

  • somebody wants to abort some work, this causes transitions to WORK_ABORTING to WORK_ABORTED ; any work that was in flight is now complete (and was aborted if needed).
  • abort in preparation for a retry, something like "decision to retry" -> WORK_ABORTING_FOR_RETRY (aborting work) -> WORK_PENDING (reset) -> WORK_RUNNING (run) ...

{
CLOG(WARNING, "Work")
<< "WorkManager got FAILURE_ABORTED from " << child;
mApp.getMetrics().NewMeter({"work", "root", "abort"}, "unit").Mark();
mChildren.erase(child);
}
advanceChildren();
}

Expand Down
17 changes: 14 additions & 3 deletions src/work/WorkParent.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,14 +87,25 @@ WorkParent::allChildrenSuccessful() const
return true;
}

bool
WorkParent::anyChildAborted() const
{
for (auto const& c : mChildren)
{
if (c.second->getState() == Work::WORK_FAILURE_ABORTED)
{
return true;
}
}
return false;
}

bool
WorkParent::allChildrenDone() const
{
for (auto& c : mChildren)
{
if (c.second->getState() != Work::WORK_SUCCESS &&
c.second->getState() != Work::WORK_FAILURE_RAISE &&
c.second->getState() != Work::WORK_FAILURE_FATAL)
if (!c.second->isDone())
{
return false;
}
Expand Down
1 change: 1 addition & 0 deletions src/work/WorkParent.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ class WorkParent : public std::enable_shared_from_this<WorkParent>,
bool anyChildFatalFailure() const;
bool allChildrenSuccessful() const;
bool allChildrenDone() const;
bool anyChildAborted() const;

Application& app() const;

Expand Down