Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Resource: support partial cancel of resources external to broker ranks #1292

Merged
merged 5 commits on Nov 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 27 additions & 20 deletions qmanager/policies/base/queue_policy_base.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -646,14 +646,28 @@
case job_state_kind_t::ALLOC_RUNNING:
// deliberately fall through
case job_state_kind_t::RUNNING:
if (cancel (h, job_it->second->id, R, true, full_removal) != 0) {
flux_log_error (flux_h,
"%s: .free RPC partial cancel failed for jobid "
"%jd",
__FUNCTION__,
static_cast<intmax_t> (id));
errno = EINVAL;
goto out;
if (!final) {
if (cancel (h, job_it->second->id, R, true, full_removal) != 0) {
flux_log_error (flux_h,

Check warning on line 651 in qmanager/policies/base/queue_policy_base.hpp

View check run for this annotation

Codecov / codecov/patch

qmanager/policies/base/queue_policy_base.hpp#L651

Added line #L651 was not covered by tests
"%s: .free RPC partial cancel failed for jobid "
"%jd",
__FUNCTION__,
static_cast<intmax_t> (id));
errno = EINVAL;
goto out;

Check warning on line 657 in qmanager/policies/base/queue_policy_base.hpp

View check run for this annotation

Codecov / codecov/patch

qmanager/policies/base/queue_policy_base.hpp#L656-L657

Added lines #L656 - L657 were not covered by tests
}
} else {
// Run a full cancel to clean up all remaining allocated resources
if (cancel (h, job_it->second->id, true) != 0) {
flux_log_error (flux_h,

Check warning on line 662 in qmanager/policies/base/queue_policy_base.hpp

View check run for this annotation

Codecov / codecov/patch

qmanager/policies/base/queue_policy_base.hpp#L662

Added line #L662 was not covered by tests
"%s: .free RPC full cancel failed for jobid "
"%jd",
__FUNCTION__,
static_cast<intmax_t> (id));
errno = EPROTO;
goto out;

Check warning on line 668 in qmanager/policies/base/queue_policy_base.hpp

View check run for this annotation

Codecov / codecov/patch

qmanager/policies/base/queue_policy_base.hpp#L667-L668

Added lines #L667 - L668 were not covered by tests
}
full_removal = true;
}
// We still want to run the sched loop even if there's an inconsistent state
set_schedulability (true);
Expand All @@ -664,24 +678,17 @@
job_it->second->state = job_state_kind_t::COMPLETE;
// hold a reference to the shared_ptr to keep it alive
// during cancel
auto job_sp = job_it->second;
m_jobs.erase (job_it);
if (final && !full_removal) {
// This error condition indicates a discrepancy between core and sched.
if (full_removal && !final) {
// This error condition can indicate a discrepancy between core and sched,
// specifically that a partial cancel removed an allocation prior to
// receiving the final .free RPC from core.
flux_log_error (flux_h,
"%s: Final .free RPC failed to remove all resources for "
"%s: removed allocation before final .free RPC for "
"jobid "
"%jd",
__FUNCTION__,
static_cast<intmax_t> (id));
// Run a full cancel to clean up all remaining allocated resources
if (cancel (h, job_sp->id, true) != 0) {
flux_log_error (flux_h,
"%s: .free RPC full cancel failed for jobid "
"%jd",
__FUNCTION__,
static_cast<intmax_t> (id));
}
errno = EPROTO;
goto out;
}
Expand Down
9 changes: 7 additions & 2 deletions resource/planner/c/planner_multi_c_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -423,8 +423,13 @@
goto done;
}
for (i = 0; i < it->second.size (); ++i) {
if (planner_rem_span (ctx->plan_multi->get_planner_at (i), it->second[i]) == -1)
goto done;
// If executed after partial cancel, depending on pruning filter settings
// some spans may no longer exist. In that case the span_lookup value for
// the resource type will be -1.
if (it->second[i] != -1) {
if (planner_rem_span (ctx->plan_multi->get_planner_at (i), it->second[i]) == -1)
goto done;

Check warning on line 431 in resource/planner/c/planner_multi_c_interface.cpp

View check run for this annotation

Codecov / codecov/patch

resource/planner/c/planner_multi_c_interface.cpp#L431

Added line #L431 was not covered by tests
}
}
ctx->plan_multi->get_span_lookup ().erase (it);
rc = 0;
Expand Down
6 changes: 6 additions & 0 deletions resource/traversers/dfu_impl_update.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -535,6 +535,12 @@ int dfu_impl_t::mod_plan (vtx_t u, int64_t jobid, modify_data_t &mod_data)
span = alloc_span->second;
if (mod_data.mod_type != job_modify_t::PARTIAL_CANCEL) {
(*m_graph)[u].schedule.allocations.erase (alloc_span);
} else {
// This condition is encountered when the vertex is
// not associated with a broker rank. We may need
// extra logic here to handle more advanced partial
// cancel in the future.
goto done;
}
} else if ((res_span = (*m_graph)[u].schedule.reservations.find (jobid))
!= (*m_graph)[u].schedule.reservations.end ()) {
Expand Down
1 change: 1 addition & 0 deletions t/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ set(ALL_TESTS
t1024-alloc-check.t
t1025-rv1-reload.t
t1026-rv1-partial-release.t
t1027-rv1-partial-release-brokerless-resources.t
t3000-jobspec.t
t3001-resource-basic.t
t3002-resource-prefix.t
Expand Down
Loading
Loading