From c9316e504e5ec88ac8cb3fdf75023394b93fd96a Mon Sep 17 00:00:00 2001
From: Andrew Aikawa
Date: Fri, 11 Oct 2024 02:19:10 -0700
Subject: [PATCH] patch affinity to work with dws (#7)

* patch affinity to work with dws

* lint
---
 .../torch_ddp_benchmark.yaml         |  9 ++--
 sky/provision/kubernetes/instance.py | 50 +++++++++----------
 2 files changed, 31 insertions(+), 28 deletions(-)

diff --git a/examples/torch_ddp_benchmark/torch_ddp_benchmark.yaml b/examples/torch_ddp_benchmark/torch_ddp_benchmark.yaml
index 362789610061..1ff09e2fea0e 100644
--- a/examples/torch_ddp_benchmark/torch_ddp_benchmark.yaml
+++ b/examples/torch_ddp_benchmark/torch_ddp_benchmark.yaml
@@ -28,9 +28,12 @@ name: torch-ddp-bench
 num_nodes: 2
 
 resources:
-  accelerators: A100:8 # Make sure you use 8 GPU instances
-  use_spot: True
-  cloud: gcp
+  accelerators: H100-MEGA-80GB:8 # Make sure you use 8 GPU instances
+  cloud: kubernetes
+  labels:
+    kueue.x-k8s.io/queue-name: user-queue # this is assigned by your admin
+    kueue.x-k8s.io/priority-class: low-priority
+    max-run-duration-seconds: "3000"
 
 file_mounts:
   ./torch_ddp_benchmark.py: ./examples/torch_ddp_benchmark/torch_ddp_benchmark.py
diff --git a/sky/provision/kubernetes/instance.py b/sky/provision/kubernetes/instance.py
index e5932e5e19a9..8242439461af 100644
--- a/sky/provision/kubernetes/instance.py
+++ b/sky/provision/kubernetes/instance.py
@@ -585,31 +585,30 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
                 continue
             pod_spec['metadata']['name'] = pod_name
             pod_spec['metadata']['labels']['component'] = pod_name
-            # For multi-node support, we put a soft-constraint to schedule
-            # worker pods on different nodes than the head pod.
-            # This is not set as a hard constraint because if different nodes
-            # are not available, we still want to be able to schedule worker
-            # pods on larger nodes which may be able to fit multiple SkyPilot
-            # "nodes".
-            pod_spec['spec']['affinity'] = {
-                'podAntiAffinity': {
-                    # Set as a soft constraint
-                    'preferredDuringSchedulingIgnoredDuringExecution': [{
-                        # Max weight to avoid scheduling on the
-                        # same physical node unless necessary.
-                        'weight': 100,
-                        'podAffinityTerm': {
-                            'labelSelector': {
-                                'matchExpressions': [{
-                                    'key': TAG_SKYPILOT_CLUSTER_NAME,
-                                    'operator': 'In',
-                                    'values': [cluster_name_on_cloud]
-                                }]
-                            },
-                            'topologyKey': 'kubernetes.io/hostname'
-                        }
-                    }]
-                }
+            # For multi-node support, we put a soft-constraint to schedule
+            # worker pods on different nodes than the head pod.
+            # This is not set as a hard constraint because if different nodes
+            # are not available, we still want to be able to schedule worker
+            # pods on larger nodes which may be able to fit multiple SkyPilot
+            # "nodes".
+            pod_spec['spec']['affinity'] = {
+                'podAntiAffinity': {
+                    # Set as a soft constraint
+                    'preferredDuringSchedulingIgnoredDuringExecution': [{
+                        # Max weight to avoid scheduling on the
+                        # same physical node unless necessary.
+                        'weight': 100,
+                        'podAffinityTerm': {
+                            'labelSelector': {
+                                'matchExpressions': [{
+                                    'key': TAG_SKYPILOT_CLUSTER_NAME,
+                                    'operator': 'In',
+                                    'values': [cluster_name_on_cloud]
+                                }]
+                            },
+                            'topologyKey': 'kubernetes.io/hostname'
+                        }
+                    }]
+                }
             }
 
             pod = _create_namespaced_pod_with_retries(namespace, pod_spec, context)
@@ -728,6 +727,7 @@ def _terminate_node(namespace: str, context: Optional[str],
         logger.warning('terminate_instances: Error occurred when analyzing '
                        f'SSH Jump pod: {e}')
     try:
+
         kubernetes.core_api(context).delete_namespaced_service(
             pod_name, namespace, _request_timeout=config_lib.DELETION_TIMEOUT)
         kubernetes.core_api(context).delete_namespaced_service(
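Context for the affinity hunk above: pods admitted through Kueue/DWS (which the new kueue.x-k8s.io labels in the YAML route the task to) can carry scheduling constraints of their own, and assigning pod_spec['spec']['affinity'] wholesale would overwrite anything already present on the spec. Below is a minimal sketch of a merge-style alternative; the helper name merge_anti_affinity and the stand-in constant are illustrative assumptions, not code from this patch.

    # Hypothetical sketch (not from the patch): merge SkyPilot's soft
    # anti-affinity preference into whatever 'affinity' the pod spec
    # already carries, instead of overwriting the whole stanza.
    from typing import Any, Dict

    TAG_SKYPILOT_CLUSTER_NAME = 'skypilot-cluster'  # stand-in for the real tag


    def merge_anti_affinity(pod_spec: Dict[str, Any],
                            cluster_name_on_cloud: str) -> None:
        term = {
            # Max weight to avoid scheduling on the same physical node
            # unless necessary.
            'weight': 100,
            'podAffinityTerm': {
                'labelSelector': {
                    'matchExpressions': [{
                        'key': TAG_SKYPILOT_CLUSTER_NAME,
                        'operator': 'In',
                        'values': [cluster_name_on_cloud],
                    }]
                },
                'topologyKey': 'kubernetes.io/hostname',
            },
        }
        # setdefault only creates missing keys, so existing entries survive.
        affinity = pod_spec['spec'].setdefault('affinity', {})
        anti = affinity.setdefault('podAntiAffinity', {})
        anti.setdefault('preferredDuringSchedulingIgnoredDuringExecution',
                        []).append(term)

Because setdefault only fills in missing keys, affinity entries already on the spec (for example, ones injected for a DWS provisioning request) are preserved, while SkyPilot's node-spread preference is appended alongside them.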