Skip to content

Commit

Permalink
patch affinity to work with dws (#7)
Browse files Browse the repository at this point in the history
* patch affinity to work with dws

* lint

patch affinity to work with dws (#7)

* patch affinity to work with dws

* lint

lint

lint

Update pypi-nightly-build.yml

point build to our pypi project

patch affinity to work with dws (#7)

* patch affinity to work with dws

* lint

lint
  • Loading branch information
asaiacai committed Nov 13, 2024
1 parent 06c6d27 commit 9e9888a
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 29 deletions.
9 changes: 6 additions & 3 deletions examples/torch_ddp_benchmark/torch_ddp_benchmark.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,12 @@ name: torch-ddp-bench
num_nodes: 2

resources:
accelerators: A100:8 # Make sure you use 8 GPU instances
use_spot: True
cloud: gcp
accelerators: H100-MEGA-80GB:8 # Make sure you use 8 GPU instances
cloud: kubernetes
labels:
kueue.x-k8s.io/queue-name: user-queue # this is assigned by your admin
kueue.x-k8s.io/priority-class: low-priority
max-run-duration-seconds: "3000"

file_mounts:
./torch_ddp_benchmark.py: ./examples/torch_ddp_benchmark/torch_ddp_benchmark.py
Expand Down
52 changes: 26 additions & 26 deletions sky/provision/kubernetes/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import json
import time
from typing import Any, Callable, Dict, List, Optional, Union
import uuid

from sky import exceptions
from sky import sky_logging
Expand Down Expand Up @@ -708,32 +707,32 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
continue
pod_spec['metadata']['name'] = pod_name
pod_spec['metadata']['labels']['component'] = pod_name
# For multi-node support, we put a soft-constraint to schedule
# worker pods on different nodes than the head pod.
# This is not set as a hard constraint because if different nodes
# are not available, we still want to be able to schedule worker
# pods on larger nodes which may be able to fit multiple SkyPilot
# "nodes".
pod_spec['spec']['affinity'] = {
'podAntiAffinity': {
# Set as a soft constraint
'preferredDuringSchedulingIgnoredDuringExecution': [{
# Max weight to avoid scheduling on the
# same physical node unless necessary.
'weight': 100,
'podAffinityTerm': {
'labelSelector': {
'matchExpressions': [{
'key': TAG_SKYPILOT_CLUSTER_NAME,
'operator': 'In',
'values': [cluster_name_on_cloud]
}]
},
'topologyKey': 'kubernetes.io/hostname'
}
}]
}
# For multi-node support, we put a soft-constraint to schedule
# worker pods on different nodes than the head pod.
# This is not set as a hard constraint because if different nodes
# are not available, we still want to be able to schedule worker
# pods on larger nodes which may be able to fit multiple SkyPilot
# "nodes".
pod_spec['spec']['affinity'] = {
'podAntiAffinity': {
# Set as a soft constraint
'preferredDuringSchedulingIgnoredDuringExecution': [{
# Max weight to avoid scheduling on the
# same physical node unless necessary.
'weight': 100,
'podAffinityTerm': {
'labelSelector': {
'matchExpressions': [{
'key': TAG_SKYPILOT_CLUSTER_NAME,
'operator': 'In',
'values': [cluster_name_on_cloud]
}]
},
'topologyKey': 'kubernetes.io/hostname'
}
}]
}
}

# TPU slice nodes are given a taint, google.com/tpu=present:NoSchedule.
# This is to prevent non-TPU workloads from being scheduled on TPU
Expand Down Expand Up @@ -862,6 +861,7 @@ def _terminate_node(namespace: str, context: Optional[str],
logger.warning('terminate_instances: Error occurred when analyzing '
f'SSH Jump pod: {e}')
try:

kubernetes.core_api(context).delete_namespaced_service(
pod_name, namespace, _request_timeout=config_lib.DELETION_TIMEOUT)
kubernetes.core_api(context).delete_namespaced_service(
Expand Down

0 comments on commit 9e9888a

Please sign in to comment.