Skip to content

Commit

Permalink
dws
Browse files Browse the repository at this point in the history
  • Loading branch information
asaiacai committed Nov 22, 2024
1 parent 9aefba6 commit 783792a
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 29 deletions.
9 changes: 6 additions & 3 deletions examples/torch_ddp_benchmark/torch_ddp_benchmark.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,12 @@ name: torch-ddp-bench
num_nodes: 2

resources:
accelerators: A100:8 # Make sure you use 8 GPU instances
use_spot: True
cloud: gcp
accelerators: H100-MEGA-80GB:8 # Make sure you use 8 GPU instances
cloud: kubernetes
labels:
kueue.x-k8s.io/queue-name: user-queue # this is assigned by your admin
kueue.x-k8s.io/priority-class: low-priority
max-run-duration-seconds: "3000"

file_mounts:
./torch_ddp_benchmark.py: ./examples/torch_ddp_benchmark/torch_ddp_benchmark.py
Expand Down
51 changes: 25 additions & 26 deletions sky/provision/kubernetes/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import json
import time
from typing import Any, Callable, Dict, List, Optional, Union
import uuid

from sky import exceptions
from sky import sky_logging
Expand Down Expand Up @@ -754,32 +753,32 @@ def _create_pod_thread(i: int):
continue
pod_spec['metadata']['name'] = pod_name
pod_spec['metadata']['labels']['component'] = pod_name
# For multi-node support, we put a soft-constraint to schedule
# worker pods on different nodes than the head pod.
# This is not set as a hard constraint because if different nodes
# are not available, we still want to be able to schedule worker
# pods on larger nodes which may be able to fit multiple SkyPilot
# "nodes".
pod_spec_copy['spec']['affinity'] = {
'podAntiAffinity': {
# Set as a soft constraint
'preferredDuringSchedulingIgnoredDuringExecution': [{
# Max weight to avoid scheduling on the
# same physical node unless necessary.
'weight': 100,
'podAffinityTerm': {
'labelSelector': {
'matchExpressions': [{
'key': TAG_SKYPILOT_CLUSTER_NAME,
'operator': 'In',
'values': [cluster_name_on_cloud]
}]
},
'topologyKey': 'kubernetes.io/hostname'
}
}]
}
# For multi-node support, we put a soft-constraint to schedule
# worker pods on different nodes than the head pod.
# This is not set as a hard constraint because if different nodes
# are not available, we still want to be able to schedule worker
# pods on larger nodes which may be able to fit multiple SkyPilot
# "nodes".
pod_spec['spec']['affinity'] = {
'podAntiAffinity': {
# Set as a soft constraint
'preferredDuringSchedulingIgnoredDuringExecution': [{
# Max weight to avoid scheduling on the
# same physical node unless necessary.
'weight': 100,
'podAffinityTerm': {
'labelSelector': {
'matchExpressions': [{
'key': TAG_SKYPILOT_CLUSTER_NAME,
'operator': 'In',
'values': [cluster_name_on_cloud]
}]
},
'topologyKey': 'kubernetes.io/hostname'
}
}]
}
}

# TPU slice nodes are given a taint, google.com/tpu=present:NoSchedule.
# This is to prevent non-TPU workloads from being scheduled on TPU
Expand Down

0 comments on commit 783792a

Please sign in to comment.