Skip to content

Commit

Permalink
Patch pod affinity to work with DWS (Dynamic Workload Scheduler) (#7)
Browse files (browse the repository at this point in the history)
* Patch pod affinity to work with DWS (Dynamic Workload Scheduler)

* lint
  • Loading branch information
asaiacai committed Oct 25, 2024
1 parent 4bab82f commit c9316e5
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 28 deletions.
9 changes: 6 additions & 3 deletions examples/torch_ddp_benchmark/torch_ddp_benchmark.yaml
Original file line number Diff line number Diff line change
Expand Up
@@ -28,9 +28,12 @@ name: torch-ddp-bench
num_nodes: 2

resources:
accelerators: A100:8 # Make sure you use 8 GPU instances
use_spot: True
cloud: gcp
accelerators: H100-MEGA-80GB:8 # Make sure you use 8 GPU instances
cloud: kubernetes
labels:
kueue.x-k8s.io/queue-name: user-queue # this is assigned by your admin
kueue.x-k8s.io/priority-class: low-priority
max-run-duration-seconds: "3000"

file_mounts:
./torch_ddp_benchmark.py: ./examples/torch_ddp_benchmark/torch_ddp_benchmark.py
Expand Down
50 changes: 25 additions & 25 deletions sky/provision/kubernetes/instance.py
Original file line number Diff line number Diff line change
Expand Up
@@ -585,31 +585,30 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
continue
pod_spec['metadata']['name'] = pod_name
pod_spec['metadata']['labels']['component'] = pod_name
# For multi-node support, we put a soft-constraint to schedule
# worker pods on different nodes than the head pod.
# This is not set as a hard constraint because if different nodes
# are not available, we still want to be able to schedule worker
# pods on larger nodes which may be able to fit multiple SkyPilot
# "nodes".
pod_spec['spec']['affinity'] = {
'podAntiAffinity': {
# Set as a soft constraint
'preferredDuringSchedulingIgnoredDuringExecution': [{
# Max weight to avoid scheduling on the
# same physical node unless necessary.
'weight': 100,
'podAffinityTerm': {
'labelSelector': {
'matchExpressions': [{
'key': TAG_SKYPILOT_CLUSTER_NAME,
'operator': 'In',
'values': [cluster_name_on_cloud]
}]
},
'topologyKey': 'kubernetes.io/hostname'
}
}]
}
# For multi-node support, we put a soft-constraint to schedule
# worker pods on different nodes than the head pod.
# This is not set as a hard constraint because if different nodes
# are not available, we still want to be able to schedule worker
# pods on larger nodes which may be able to fit multiple SkyPilot
# "nodes".
pod_spec['spec']['affinity'] = {
'podAntiAffinity': {
# Set as a soft constraint
'preferredDuringSchedulingIgnoredDuringExecution': [{
# Max weight to avoid scheduling on the
# same physical node unless necessary.
'weight': 100,
'podAffinityTerm': {
'labelSelector': {
'matchExpressions': [{
'key': TAG_SKYPILOT_CLUSTER_NAME,
'operator': 'In',
'values': [cluster_name_on_cloud]
}]
},
'topologyKey': 'kubernetes.io/hostname'
}
}]
}

pod = _create_namespaced_pod_with_retries(namespace, pod_spec, context)
Expand Down
Expand Up
@@ -728,6 +727,7 @@ def _terminate_node(namespace: str, context: Optional[str],
logger.warning('terminate_instances: Error occurred when analyzing '
f'SSH Jump pod: {e}')
try:

kubernetes.core_api(context).delete_namespaced_service(
pod_name, namespace, _request_timeout=config_lib.DELETION_TIMEOUT)
kubernetes.core_api(context).delete_namespaced_service(
Expand Down

0 comments on commit c9316e5

Please sign in to comment.