Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Core][Autoscaler] Configure idleTimeoutSeconds per node type #48813

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ def _node_type_from_group_spec(

resources = _get_ray_resources_from_group_spec(group_spec, is_head)

return {
node_type = {
"min_workers": min_workers,
"max_workers": max_workers,
# `node_config` is a legacy field required for compatibility.
Expand All @@ -228,6 +228,12 @@ def _node_type_from_group_spec(
"resources": resources,
}

idle_timeout_s = group_spec.get("idleTimeoutSeconds")
ryanaoleary marked this conversation as resolved.
Show resolved Hide resolved
if idle_timeout_s is not None:
node_type["idle_timeout_s"] = idle_timeout_s

return node_type


def _get_ray_resources_from_group_spec(
group_spec: Dict[str, Any], is_head: bool
Expand Down
1 change: 1 addition & 0 deletions python/ray/autoscaler/ray-schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,7 @@
},
"min_workers": {"type": "integer"},
"max_workers": {"type": "integer"},
"idle_timeout_s": {"type": "integer", "nullable": true},
"resources": {
"type": "object",
"patternProperties": {
Expand Down
3 changes: 3 additions & 0 deletions python/ray/autoscaler/v2/instance_manager/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,8 @@ class NodeTypeConfig:
min_worker_nodes: int
# The maximum number of worker nodes that can be launched for this node type.
max_worker_nodes: int
# Idle timeout, in seconds, for worker nodes of this node type.
# When None, the scheduler-wide idle timeout is used instead.
idle_timeout_s: Optional[int] = None
ryanaoleary marked this conversation as resolved.
Show resolved Hide resolved
# The total resources on the node.
resources: Dict[str, float] = field(default_factory=dict)
# The labels on the node.
Expand Down Expand Up @@ -346,6 +348,7 @@ def get_node_type_configs(self) -> Dict[NodeType, NodeTypeConfig]:
name=node_type,
min_worker_nodes=node_config.get("min_workers", 0),
max_worker_nodes=max_workers_nodes,
idle_timeout_s=node_config.get("idle_timeout_s", None),
resources=node_config.get("resources", {}),
labels=node_config.get("labels", {}),
launch_config_hash=launch_config_hash,
Expand Down
6 changes: 6 additions & 0 deletions python/ray/autoscaler/v2/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1584,6 +1584,12 @@ def _enforce_idle_termination(
continue

idle_timeout_s = ctx.get_idle_timeout_s()
# Override the scheduler idle_timeout_s if set for this node_type.
ryanaoleary marked this conversation as resolved.
Show resolved Hide resolved
for node_type in node_type_configs:
if node_type != node.node_type:
continue
if node_type_configs[node_type].idle_timeout_s is not None:
idle_timeout_s = node_type_configs[node_type].idle_timeout_s
if idle_timeout_s is None:
# No idle timeout is set, skip the idle termination.
continue
Expand Down