-
-
Notifications
You must be signed in to change notification settings - Fork 720
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Refactor missing-data command #6332
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -4683,53 +4683,48 @@ def handle_task_erred(self, key: str, stimulus_id: str, **msg) -> None: | |||||||||||||||||||||||||||||
self.send_all(client_msgs, worker_msgs) | ||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||
def handle_missing_data( | ||||||||||||||||||||||||||||||
self, key: str, errant_worker: str, stimulus_id: str, **kwargs | ||||||||||||||||||||||||||||||
self, key: str, worker: str, errant_worker: str, stimulus_id: str | ||||||||||||||||||||||||||||||
[Review comment 1] For my clarification: how does […] (the inline code and the remainder of this question were lost in extraction; the question appears to concern the newly added `worker` parameter)
[Review comment 2] It's automatically added by […] (inline code lost in extraction).
[Review comment 3] I don't see logic for that anywhere in distributed/distributed/batched.py, lines 131 to 143 in b8b45c6.
[Review comment 4] It's here, on the receiving end: distributed/distributed/scheduler.py, line 4817 in 6e0fe58.
|
||||||||||||||||||||||||||||||
) -> None: | ||||||||||||||||||||||||||||||
"""Signal that `errant_worker` does not hold `key` | ||||||||||||||||||||||||||||||
"""Signal that `errant_worker` does not hold `key`. | ||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||
This may either indicate that `errant_worker` is dead or that we may be | ||||||||||||||||||||||||||||||
working with stale data and need to remove `key` from the workers | ||||||||||||||||||||||||||||||
`has_what`. | ||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||
If no replica of a task is available anymore, the task is transitioned | ||||||||||||||||||||||||||||||
back to released and rescheduled, if possible. | ||||||||||||||||||||||||||||||
This may either indicate that `errant_worker` is dead or that we may be working | ||||||||||||||||||||||||||||||
with stale data and need to remove `key` from the workers `has_what`. If no | ||||||||||||||||||||||||||||||
replica of a task is available anymore, the task is transitioned back to | ||||||||||||||||||||||||||||||
released and rescheduled, if possible. | ||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||
Parameters | ||||||||||||||||||||||||||||||
---------- | ||||||||||||||||||||||||||||||
key : str, optional | ||||||||||||||||||||||||||||||
Task key that could not be found, by default None | ||||||||||||||||||||||||||||||
errant_worker : str, optional | ||||||||||||||||||||||||||||||
Address of the worker supposed to hold a replica, by default None | ||||||||||||||||||||||||||||||
key : str | ||||||||||||||||||||||||||||||
Task key that could not be found | ||||||||||||||||||||||||||||||
worker : str | ||||||||||||||||||||||||||||||
Address of the worker informing the scheduler | ||||||||||||||||||||||||||||||
errant_worker : str | ||||||||||||||||||||||||||||||
Address of the worker supposed to hold a replica | ||||||||||||||||||||||||||||||
""" | ||||||||||||||||||||||||||||||
logger.debug("handle missing data key=%s worker=%s", key, errant_worker) | ||||||||||||||||||||||||||||||
logger.debug(f"handle missing data {key=} {worker=} {errant_worker=}") | ||||||||||||||||||||||||||||||
self.log_event(errant_worker, {"action": "missing-data", "key": key}) | ||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||
if key not in self.tasks: | ||||||||||||||||||||||||||||||
ts = self.tasks.get(key) | ||||||||||||||||||||||||||||||
ws = self.workers.get(errant_worker) | ||||||||||||||||||||||||||||||
if not ts or not ws or ws not in ts.who_has: | ||||||||||||||||||||||||||||||
return | ||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||
ts: TaskState = self.tasks[key] | ||||||||||||||||||||||||||||||
ws: WorkerState = self.workers.get(errant_worker) | ||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||
if ws is not None and ws in ts.who_has: | ||||||||||||||||||||||||||||||
self.remove_replica(ts, ws) | ||||||||||||||||||||||||||||||
self.remove_replica(ts, ws) | ||||||||||||||||||||||||||||||
if ts.state == "memory" and not ts.who_has: | ||||||||||||||||||||||||||||||
if ts.run_spec: | ||||||||||||||||||||||||||||||
self.transitions({key: "released"}, stimulus_id) | ||||||||||||||||||||||||||||||
else: | ||||||||||||||||||||||||||||||
self.transitions({key: "forgotten"}, stimulus_id) | ||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||
def release_worker_data(self, key, worker, stimulus_id): | ||||||||||||||||||||||||||||||
ws: WorkerState = self.workers.get(worker) | ||||||||||||||||||||||||||||||
ts: TaskState = self.tasks.get(key) | ||||||||||||||||||||||||||||||
if not ws or not ts: | ||||||||||||||||||||||||||||||
def release_worker_data(self, key: str, worker: str, stimulus_id: str) -> None: | ||||||||||||||||||||||||||||||
ts = self.tasks.get(key) | ||||||||||||||||||||||||||||||
ws = self.workers.get(worker) | ||||||||||||||||||||||||||||||
if not ts or not ws or ws not in ts.who_has: | ||||||||||||||||||||||||||||||
return | ||||||||||||||||||||||||||||||
recommendations: dict = {} | ||||||||||||||||||||||||||||||
if ws in ts.who_has: | ||||||||||||||||||||||||||||||
self.remove_replica(ts, ws) | ||||||||||||||||||||||||||||||
if not ts.who_has: | ||||||||||||||||||||||||||||||
recommendations[ts.key] = "released" | ||||||||||||||||||||||||||||||
if recommendations: | ||||||||||||||||||||||||||||||
self.transitions(recommendations, stimulus_id) | ||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||
self.remove_replica(ts, ws) | ||||||||||||||||||||||||||||||
if not ts.who_has: | ||||||||||||||||||||||||||||||
self.transitions({key: "released"}, stimulus_id) | ||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||
def handle_long_running(self, key=None, worker=None, compute_duration=None): | ||||||||||||||||||||||||||||||
"""A task has seceded from the thread pool | ||||||||||||||||||||||||||||||
|
@@ -4907,7 +4902,7 @@ async def register_scheduler_plugin(self, plugin, name=None, idempotent=None): | |||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||
self.add_plugin(plugin, name=name, idempotent=idempotent) | ||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||
def worker_send(self, worker, msg): | ||||||||||||||||||||||||||||||
def worker_send(self, worker: str, msg: dict[str, Any]) -> None: | ||||||||||||||||||||||||||||||
"""Send message to worker | ||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||
This also handles connection failures by adding a callback to remove | ||||||||||||||||||||||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -120,6 +120,7 @@ | |
Instructions, | ||
InvalidTransition, | ||
LongRunningMsg, | ||
MissingDataMsg, | ||
Recs, | ||
RecsInstrs, | ||
ReleaseWorkerDataMsg, | ||
|
@@ -2128,7 +2129,7 @@ def transition_long_running_rescheduled( | |
self, ts: TaskState, *, stimulus_id: str | ||
) -> RecsInstrs: | ||
recs: Recs = {ts: "released"} | ||
smsg = RescheduleMsg(key=ts.key, worker=self.address, stimulus_id=stimulus_id) | ||
smsg = RescheduleMsg(key=ts.key, stimulus_id=stimulus_id) | ||
return recs, [smsg] | ||
|
||
def transition_executing_rescheduled( | ||
|
@@ -2141,11 +2142,7 @@ def transition_executing_rescheduled( | |
return merge_recs_instructions( | ||
( | ||
{ts: "released"}, | ||
[ | ||
RescheduleMsg( | ||
key=ts.key, worker=self.address, stimulus_id=stimulus_id | ||
) | ||
], | ||
[RescheduleMsg(key=ts.key, stimulus_id=stimulus_id)], | ||
), | ||
self._ensure_computing(), | ||
) | ||
|
@@ -3274,6 +3271,7 @@ async def gather_dep( | |
return None | ||
|
||
recommendations: Recs = {} | ||
instructions: Instructions = [] | ||
response = {} | ||
to_gather_keys: set[str] = set() | ||
cancelled_keys: set[str] = set() | ||
|
@@ -3395,17 +3393,17 @@ def done_event(): | |
ts.who_has.discard(worker) | ||
self.has_what[worker].discard(ts.key) | ||
self.log.append((d, "missing-dep", stimulus_id, time())) | ||
self.batched_stream.send( | ||
{ | ||
"op": "missing-data", | ||
"errant_worker": worker, | ||
"key": d, | ||
"stimulus_id": stimulus_id, | ||
} | ||
instructions.append( | ||
MissingDataMsg( | ||
key=d, | ||
errant_worker=worker, | ||
stimulus_id=stimulus_id, | ||
) | ||
) | ||
recommendations[ts] = "fetch" | ||
del data, response | ||
self.transitions(recommendations, stimulus_id=stimulus_id) | ||
self._handle_instructions(instructions) | ||
[Review comment 1] This […] (the remainder of this comment was lost in extraction; judging by the replies, it concerns the order in which the missing-data message and the transitions are now processed)
[Review comment 2] I would be seriously worried if order mattered.
[Review comment 3] Ordering does sometimes matter, and I suggest not rejecting such a possibility just because it would be bad design. The scheduler should be able to handle all possible kinds of orderings, but this can still introduce subtle and unintended changes in behavior, and I would advise caution either way. |
||
|
||
if refresh_who_has: | ||
# All workers that hold known replicas of our tasks are busy. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
All changes to scheduler.py are purely cosmetic.