Skip to content

Commit

Permalink
Merge pull request #658 from lukemartinlogan/master
Browse files Browse the repository at this point in the history
Fix the deadlock caused in stop that occurred for single-node cases
  • Loading branch information
lukemartinlogan authored Dec 30, 2023
2 parents 603ad66 + 491cf74 commit fd6b399
Show file tree
Hide file tree
Showing 7 changed files with 10 additions and 33 deletions.
7 changes: 4 additions & 3 deletions hrun/include/hrun/hrun_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,12 +114,13 @@ struct DomainId {
/** Domain has the local node */
HSHM_ALWAYS_INLINE
bool IsRemote(size_t num_hosts, u32 this_node) const {
if (num_hosts == 1) {
if (num_hosts == 1 && !flags_.Any(kNoLocal)) {
return false;
} else {
return (flags_.Any(kGlobal | kSet) || (flags_.Any(kNode) && id_ != this_node));
return
(flags_.Any(kGlobal | kSet | kNoLocal) ||
(flags_.Any(kNode) && id_ != this_node));
}
// return flags_.Any(kGlobal | kSet | kNode);
}

/** DomainId representing the local node */
Expand Down
15 changes: 0 additions & 15 deletions hrun/include/hrun/task_registry/task.h
Original file line number Diff line number Diff line change
Expand Up @@ -402,21 +402,6 @@ struct Task : public hipc::ShmContainer {
task_flags_.UnsetBits(TASK_LANE_ALL);
}

/** This task is a root task */
HSHM_ALWAYS_INLINE bool IsRoot() {
return task_flags_.Any(TASK_IS_ROOT);
}

/** Set this task as a root task */
HSHM_ALWAYS_INLINE void SetRoot() {
task_flags_.SetBits(TASK_IS_ROOT);
}

/** Unset this task as a root task */
HSHM_ALWAYS_INLINE void UnsetRoot() {
task_flags_.UnsetBits(TASK_IS_ROOT);
}

/** Set period in nanoseconds */
HSHM_ALWAYS_INLINE void SetPeriodNs(double ns) {
period_ns_ = ns;
Expand Down
9 changes: 1 addition & 8 deletions hrun/include/hrun/work_orchestrator/worker.h
Original file line number Diff line number Diff line change
Expand Up @@ -487,14 +487,7 @@ class Worker {
task->SetStarted();
}
} else {
try {
exec->Run(task->method_, task, rctx);
} catch (std::exception &e) {
HELOG(kError, "(node {}) Worker {} caught an exception: {}", HRUN_CLIENT->node_id_, id_, e.what());
} catch (...) {
HELOG(kError, "(node {}) Worker {} caught an unknown exception", HRUN_CLIENT->node_id_, id_);

}
exec->Run(task->method_, task, rctx);
task->SetStarted();
}
task->DidRun(work_entry.cur_time_);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -178,9 +178,14 @@ class Client : public TaskLibClient {
}
HRUN_TASK_NODE_ADMIN_ROOT(StopRuntime);
/**
 * Flush the runtime and then request that it stop.
 *
 * Steps, in order:
 *   1. Log a notice that flushing is starting.
 *   2. Call FlushRoot over DomainId::GetGlobal().
 *   3. Issue AsyncStopRuntimeRoot for DomainId::GetGlobalMinusLocal().
 *   4. Issue AsyncStopRuntimeRoot for DomainId::GetLocal().
 *   5. Log completion.
 */
void StopRuntimeRoot() {
HILOG(kInfo, "Beginning to flush the runtime.\n"
"If you did async I/O, this may take some time.\n"
"All unflushed data will be written to the PFS.")
// Flush across the global domain before any stop request is issued.
FlushRoot(DomainId::GetGlobal());
HILOG(kInfo, "Stopping the runtime")
// NOTE(review): the stop for the other nodes is issued before the stop for
// the local node; presumably the local runtime must still be alive to
// forward the remote request -- confirm before reordering these two calls.
AsyncStopRuntimeRoot(DomainId::GetGlobalMinusLocal());
AsyncStopRuntimeRoot(DomainId::GetLocal());
// NOTE(review): neither Async* call's result is waited on here, so this
// message appears to mean "stop requests submitted", not "runtime has
// exited" -- confirm.
HILOG(kInfo, "All done!")
}

/** Set work orchestrator queue policy */
Expand Down
3 changes: 0 additions & 3 deletions hrun/tasks_required/proc_queue/src/proc_queue.cc
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,6 @@ class Server : public TaskLib {
ptr->UnsetFireAndForget();
task->is_fire_forget_ = true;
}
if (ptr->task_node_.IsRoot() || task->task_node_.IsRoot()) {
ptr->SetRoot();
}
MultiQueue *real_queue = HRUN_CLIENT->GetQueue(QueueId(ptr->task_state_));
bool ret = real_queue->EmplaceFrac(
ptr->prio_, ptr->lane_hash_, task->sub_run_.shm_);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,6 @@ class Client : public TaskLibClient {
LPointer<PushTask> push_task = HRUN_CLIENT->NewTask<PushTask>(
orig_task->task_node_ + 1, DomainId::GetLocal(), id_,
domain_ids, orig_task, exec, orig_task->method_, xfer);
if (orig_task->IsRoot()) {
push_task->SetRoot();
}
MultiQueue *queue = HRUN_CLIENT->GetQueue(queue_id_);
queue->Emplace(push_task->prio_, 0, push_task.shm_);
}
Expand Down
1 change: 0 additions & 1 deletion hrun/tasks_required/remote_queue/src/remote_queue.cc
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,6 @@ class Server : public TaskLib {
orig_task->UnsetStarted();
orig_task->UnsetDataOwner();
orig_task->UnsetLongRunning();
orig_task->UnsetRoot();
orig_task->task_flags_.SetBits(TASK_REMOTE_DEBUG_MARK);

// Execute task
Expand Down

0 comments on commit fd6b399

Please sign in to comment.