
Commit

[ci][tune][train] Update release test compute configs to not schedule work on head node (ray-project#48103)

This PR updates the compute configs for the benchmark release tests so that
workers are no longer scheduled onto the head node. This follows the best
practice of keeping heavy work off the head node for cluster stability.
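
As a rough illustration (not part of this change), the scheduling behavior the new configs rely on: a head node whose compute config sets resources: cpu: 0 advertises no CPUs, so remote tasks that request CPUs (the default for a Ray task is one CPU) can only be placed on worker nodes, while zero-CPU utility tasks may still run on the head. A minimal sketch, assuming an already-running cluster launched from one of these configs (the function names are made up for the example):

import ray

ray.init(address="auto")  # connect to the existing cluster

@ray.remote  # defaults to num_cpus=1, so it can never land on a zero-CPU head node
def heavy_work() -> str:
    return ray.util.get_node_ip_address()

@ray.remote(num_cpus=0)  # zero-CPU tasks may still be placed on the head node
def light_work() -> str:
    return ray.util.get_node_ip_address()

print("heavy tasks ran on:", set(ray.get([heavy_work.remote() for _ in range(8)])))
print("light task ran on:", ray.get(light_work.remote()))

This mirrors the two sides of the diff below: resources: cpu: 0 on the head node types, and num_cpus=0 on the small utility tasks that still need to run everywhere.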

---------

Signed-off-by: Justin Yu <[email protected]>
justinvyu authored and JP-sDEV committed Nov 14, 2024
1 parent 5150434 commit 3ac3da9
Showing 16 changed files with 90 additions and 45 deletions.
9 changes: 7 additions & 2 deletions release/air_tests/air_benchmarks/compute_cpu_1_aws.yaml
@@ -1,10 +1,15 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 0
+max_workers: 1
 
 head_node_type:
   name: head_node
   instance_type: m5.2xlarge
 
-worker_node_types: []
+worker_node_types:
+- name: worker_node
+  instance_type: m5.2xlarge
+  max_workers: 1
+  min_workers: 1
+  use_spot: false
9 changes: 7 additions & 2 deletions release/air_tests/air_benchmarks/compute_cpu_1_gce.yaml
@@ -3,10 +3,15 @@ region: us-west1
 allowed_azs:
 - us-west1-b
 
-max_workers: 0
+max_workers: 1
 
 head_node_type:
   name: head_node
   instance_type: n1-standard-8
 
-worker_node_types: []
+worker_node_types:
+- name: worker_node
+  instance_type: n1-standard-8
+  max_workers: 1
+  min_workers: 1
+  use_spot: false
8 changes: 5 additions & 3 deletions release/air_tests/air_benchmarks/compute_cpu_4_aws.yaml
@@ -1,15 +1,17 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 3
+max_workers: 4
 
 head_node_type:
   name: head_node
   instance_type: m5.2xlarge
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: m5.2xlarge
-  max_workers: 3
-  min_workers: 3
+  max_workers: 4
+  min_workers: 4
   use_spot: false
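
The resources: cpu: 0 entry added to the head node here (and in the configs that follow) makes the head node advertise zero CPUs to Ray, so CPU-requesting benchmark tasks cannot be scheduled on it. A quick sanity check, as a sketch that assumes it is run from the head node of a live cluster:

import ray

ray.init(address="auto")
head_ip = ray.util.get_node_ip_address()  # IP of the node this runs on (the head)
head = next(n for n in ray.nodes() if n["NodeManagerAddress"] == head_ip)
print(head["Resources"].get("CPU", 0))  # expected: 0 with the updated configs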
8 changes: 5 additions & 3 deletions release/air_tests/air_benchmarks/compute_cpu_4_gce.yaml
@@ -3,15 +3,17 @@ region: us-west1
 allowed_azs:
 - us-west1-b
 
-max_workers: 3
+max_workers: 4
 
 head_node_type:
   name: head_node
   instance_type: n1-standard-8
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: n1-standard-8
-  max_workers: 3
-  min_workers: 3
+  max_workers: 4
+  min_workers: 4
   use_spot: false
6 changes: 4 additions & 2 deletions release/air_tests/air_benchmarks/compute_cpu_8_aws.yaml
@@ -6,10 +6,12 @@ max_workers: 7
 head_node_type:
   name: head_node
   instance_type: m5.2xlarge
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: m5.2xlarge
-  max_workers: 7
-  min_workers: 7
+  max_workers: 8
+  min_workers: 8
   use_spot: false
8 changes: 5 additions & 3 deletions release/air_tests/air_benchmarks/compute_cpu_8_gce.yaml
@@ -3,15 +3,17 @@ region: us-west1
 allowed_azs:
 - us-west1-b
 
-max_workers: 7
+max_workers: 8
 
 head_node_type:
   name: head_node
   instance_type: n1-standard-8
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: n1-standard-8
-  max_workers: 7
-  min_workers: 7
+  max_workers: 8
+  min_workers: 8
   use_spot: false
11 changes: 8 additions & 3 deletions release/air_tests/air_benchmarks/compute_gpu_1_aws.yaml
@@ -1,13 +1,18 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 0
+max_workers: 1
 
 head_node_type:
   name: head_node
-  instance_type: g3.8xlarge
+  instance_type: m5.2xlarge
 
-worker_node_types: []
+worker_node_types:
+- name: worker_node
+  instance_type: g3.8xlarge
+  max_workers: 1
+  min_workers: 1
+  use_spot: false
 
 aws:
   BlockDeviceMappings:
11 changes: 8 additions & 3 deletions release/air_tests/air_benchmarks/compute_gpu_1_gce.yaml
@@ -3,13 +3,18 @@ region: us-west1
 allowed_azs:
 - us-west1-b
 
-max_workers: 0
+max_workers: 1
 
 head_node_type:
   name: head_node
-  instance_type: n1-standard-32-nvidia-tesla-t4-2
+  instance_type: n1-standard-8
 
-worker_node_types: []
+worker_node_types:
+- name: worker_node
+  instance_type: n1-standard-32-nvidia-tesla-t4-2
+  max_workers: 1
+  min_workers: 1
+  use_spot: false
 
 gcp_advanced_configurations_json:
   instance_properties:
10 changes: 6 additions & 4 deletions release/air_tests/air_benchmarks/compute_gpu_2x2_aws.yaml
@@ -1,15 +1,17 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 1
+max_workers: 2
 
 head_node_type:
   name: head_node
-  instance_type: g3.8xlarge
+  instance_type: m5.2xlarge
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: g3.8xlarge
-  max_workers: 1
-  min_workers: 1
+  max_workers: 2
+  min_workers: 2
   use_spot: false
10 changes: 6 additions & 4 deletions release/air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml
@@ -3,15 +3,17 @@ region: us-west1
 allowed_azs:
 - us-west1-b
 
-max_workers: 1
+max_workers: 2
 
 head_node_type:
   name: head_node
-  instance_type: n1-standard-32-nvidia-tesla-t4-2
+  instance_type: n1-standard-8
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: n1-standard-32-nvidia-tesla-t4-2
-  max_workers: 1
-  min_workers: 1
+  max_workers: 2
+  min_workers: 2
   use_spot: false
10 changes: 6 additions & 4 deletions release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml
@@ -1,17 +1,19 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 3
+max_workers: 4
 
 head_node_type:
   name: head_node
-  instance_type: g4dn.12xlarge
+  instance_type: m5.2xlarge
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: g4dn.12xlarge
-  max_workers: 3
-  min_workers: 3
+  max_workers: 4
+  min_workers: 4
   use_spot: false
 
 aws:
10 changes: 6 additions & 4 deletions release/air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml
@@ -3,17 +3,19 @@ region: us-west1
 allowed_azs:
 - us-west1-b
 
-max_workers: 3
+max_workers: 4
 
 head_node_type:
   name: head_node
-  instance_type: n1-standard-64-nvidia-tesla-t4-4
+  instance_type: n1-standard-8
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: n1-standard-64-nvidia-tesla-t4-4
-  max_workers: 3
-  min_workers: 3
+  max_workers: 4
+  min_workers: 4
   use_spot: false
 
 gcp_advanced_configurations_json:
13 changes: 9 additions & 4 deletions release/air_tests/air_benchmarks/workloads/benchmark_util.py
@@ -11,8 +11,11 @@
 
 
 def schedule_remote_fn_on_all_nodes(
-    remote_fn, exclude_head: bool = False, *args, **kwargs
+    remote_fn, exclude_head: bool = True, *args, **kwargs
 ):
+    """Runs remote fn on all worker nodes.
+    Also schedules on the head node if `exclude_head` is False.
+    """
     head_ip = ray.util.get_node_ip_address()
 
     futures = []
@@ -55,13 +58,15 @@ def upload_file_to_all_nodes(path: str):
     return ray.get(futures)
 
 
-@ray.remote
+@ray.remote(num_cpus=0)
 def _run_command(cmd: str):
     return subprocess.check_call(cmd)
 
 
-def run_command_on_all_nodes(cmd: List[str]):
-    futures = schedule_remote_fn_on_all_nodes(_run_command, cmd=cmd)
+def run_command_on_all_nodes(cmd: List[str], exclude_head: bool = True):
+    futures = schedule_remote_fn_on_all_nodes(
+        _run_command, cmd=cmd, exclude_head=exclude_head
+    )
     return ray.get(futures)
 
 
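
A hypothetical call site for the updated helpers (the import assumes the workloads directory is on the Python path):

from benchmark_util import run_command_on_all_nodes

# New default: the command runs only on worker nodes, keeping the head node free.
run_command_on_all_nodes(["python", "-c", "print('hello from a worker')"])

# Single-node smoke tests opt the head node back in explicitly.
run_command_on_all_nodes(["python", "-c", "print('hello')"], exclude_head=False)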
@@ -269,7 +269,9 @@ def run(
     path = str((Path(__file__).parent / "_tensorflow_prepare.py").absolute())
 
     upload_file_to_all_nodes(path)
-    run_command_on_all_nodes(["python", path])
+    # NOTE: This includes the head node for the release smoke test that only
+    # runs on a single node.
+    run_command_on_all_nodes(["python", path], exclude_head=False)
 
     times_ray = []
     times_local_ray = []
@@ -418,7 +418,9 @@ def run(
 
     path = str((Path(__file__).parent / "_torch_prepare.py").absolute())
     upload_file_to_all_nodes(path)
-    run_command_on_all_nodes(["python", path])
+    # NOTE: This includes the head node for the release smoke test that only
+    # runs on a single node.
+    run_command_on_all_nodes(["python", path], exclude_head=False)
 
     times_ray = []
     times_local_ray = []
@@ -21,14 +21,14 @@ def prepare_mnist():
 
     print("Preparing Torch benchmark: Downloading MNIST")
 
-    @ray.remote
+    @ray.remote(num_cpus=0)
     def _download_data():
         import torchvision
 
         torchvision.datasets.FashionMNIST("/tmp/data_fashion_mnist", download=True)
         return True
 
-    ray.get(schedule_remote_fn_on_all_nodes(_download_data))
+    ray.get(schedule_remote_fn_on_all_nodes(_download_data, exclude_head=False))
 
 
 def get_trainer(
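
For context, a standalone sketch (not taken from this change) of the same fan-out pattern used above: a zero-CPU task pinned to every alive node via Ray's built-in node:<ip> resource, so it can run even on a head node that advertises cpu: 0. The helper name is made up for the example.

import ray

ray.init(address="auto")  # assumes an existing cluster

@ray.remote(num_cpus=0)
def prepare_node() -> str:
    # e.g. download a dataset onto this node's local disk
    return ray.util.get_node_ip_address()

futures = [
    prepare_node.options(
        resources={f"node:{node['NodeManagerAddress']}": 0.001}
    ).remote()
    for node in ray.nodes()
    if node["Alive"]
]
print("prepared nodes:", ray.get(futures))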
