
[ci][tune][train] Update release test compute configs to not schedule work on head node #48103

Merged
9 changes: 7 additions & 2 deletions release/air_tests/air_benchmarks/compute_cpu_1_aws.yaml
@@ -1,10 +1,15 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 0
+max_workers: 1
 
 head_node_type:
   name: head_node
   instance_type: m5.2xlarge
 
-worker_node_types: []
+worker_node_types:
+- name: worker_node
+  instance_type: m5.2xlarge
+  max_workers: 1
+  min_workers: 1
+  use_spot: false
9 changes: 7 additions & 2 deletions release/air_tests/air_benchmarks/compute_cpu_1_gce.yaml
@@ -3,10 +3,15 @@ region: us-west1
 allowed_azs:
 - us-west1-b
 
-max_workers: 0
+max_workers: 1
 
 head_node_type:
   name: head_node
   instance_type: n1-standard-8
 
-worker_node_types: []
+worker_node_types:
+- name: worker_node
+  instance_type: n1-standard-8
+  max_workers: 1
+  min_workers: 1
+  use_spot: false
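The two single-CPU configs above (AWS and GCE) now provision one dedicated worker of the same instance type instead of a head-only cluster, so the benchmark can keep its work off the head node. Below is a minimal, illustrative way to check from the Ray driver which node a task actually landed on; the ray.init(address="auto") connection and the printed comparison are assumptions for this sketch, not part of the PR.

import ray

# Assumes the benchmark cluster defined by the configs above is already running.
ray.init(address="auto")


@ray.remote(num_cpus=1)
def task_node_ip() -> str:
    # Reports the IP of whichever node Ray scheduled this task on.
    return ray.util.get_node_ip_address()


head_ip = ray.util.get_node_ip_address()  # the driver runs on the head node
task_ip = ray.get(task_node_ip.remote())
print(f"head={head_ip} task={task_ip} ran_off_head={task_ip != head_ip}")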
8 changes: 5 additions & 3 deletions release/air_tests/air_benchmarks/compute_cpu_4_aws.yaml
@@ -1,15 +1,17 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 3
+max_workers: 4
 
 head_node_type:
   name: head_node
   instance_type: m5.2xlarge
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: m5.2xlarge
-  max_workers: 3
-  min_workers: 3
+  max_workers: 4
+  min_workers: 4
   use_spot: false
8 changes: 5 additions & 3 deletions release/air_tests/air_benchmarks/compute_cpu_4_gce.yaml
@@ -3,15 +3,17 @@ region: us-west1
 allowed_azs:
 - us-west1-b
 
-max_workers: 3
+max_workers: 4
 
 head_node_type:
   name: head_node
   instance_type: n1-standard-8
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: n1-standard-8
-  max_workers: 3
-  min_workers: 3
+  max_workers: 4
+  min_workers: 4
   use_spot: false
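The multi-node CPU configs pair the worker-count bump with resources: cpu: 0 on the head node, which overrides the head's advertised CPU count so the scheduler has no CPU slots there; any task or actor that requests CPUs must then land on a worker. A hedged driver-side inspection sketch follows (assumes a driver connected to such a cluster; not code from this PR).

import ray

ray.init(address="auto")  # assumes the cluster from the configs above is up

# The head node's entry should report no CPU resource after the override.
for node in ray.nodes():
    if node["Alive"]:
        print(node["NodeManagerAddress"], "CPU =", node["Resources"].get("CPU", 0))


@ray.remote(num_cpus=1)
def where() -> str:
    return ray.util.get_node_ip_address()


# Every CPU-requesting task should report a worker-node IP, never the head's.
print(set(ray.get([where.remote() for _ in range(8)])))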
6 changes: 4 additions & 2 deletions release/air_tests/air_benchmarks/compute_cpu_8_aws.yaml
@@ -6,10 +6,12 @@ max_workers: 7
 head_node_type:
   name: head_node
   instance_type: m5.2xlarge
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: m5.2xlarge
-  max_workers: 7
-  min_workers: 7
+  max_workers: 8
+  min_workers: 8
   use_spot: false
8 changes: 5 additions & 3 deletions release/air_tests/air_benchmarks/compute_cpu_8_gce.yaml
@@ -3,15 +3,17 @@ region: us-west1
 allowed_azs:
 - us-west1-b
 
-max_workers: 7
+max_workers: 8
 
 head_node_type:
   name: head_node
   instance_type: n1-standard-8
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: n1-standard-8
-  max_workers: 7
-  min_workers: 7
+  max_workers: 8
+  min_workers: 8
   use_spot: false
11 changes: 8 additions & 3 deletions release/air_tests/air_benchmarks/compute_gpu_1_aws.yaml
@@ -1,13 +1,18 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 0
+max_workers: 1
 
 head_node_type:
   name: head_node
-  instance_type: g3.8xlarge
+  instance_type: m5.2xlarge
 
-worker_node_types: []
+worker_node_types:
+- name: worker_node
+  instance_type: g3.8xlarge
+  max_workers: 1
+  min_workers: 1
+  use_spot: false
 
 aws:
   BlockDeviceMappings:
11 changes: 8 additions & 3 deletions release/air_tests/air_benchmarks/compute_gpu_1_gce.yaml
@@ -3,13 +3,18 @@ region: us-west1
 allowed_azs:
 - us-west1-b
 
-max_workers: 0
+max_workers: 1
 
 head_node_type:
   name: head_node
-  instance_type: n1-standard-32-nvidia-tesla-t4-2
+  instance_type: n1-standard-8
 
-worker_node_types: []
+worker_node_types:
+- name: worker_node
+  instance_type: n1-standard-32-nvidia-tesla-t4-2
+  max_workers: 1
+  min_workers: 1
+  use_spot: false
 
 gcp_advanced_configurations_json:
   instance_properties:
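In the single-GPU configs, the head node also changes to a CPU-only instance (m5.2xlarge / n1-standard-8) and the GPU machine moves to a worker, so anything that requests a GPU can only be scheduled on the worker. A short illustrative check follows (assumed driver-side code, not from this PR).

import ray

ray.init(address="auto")  # assumes the single-GPU cluster above is running


@ray.remote(num_gpus=1)
def gpu_node_ip() -> str:
    # Can only run on a node that advertises a GPU, i.e. the worker,
    # never the CPU-only head node.
    return ray.util.get_node_ip_address()


print("GPU task ran on:", ray.get(gpu_node_ip.remote()))
print("Head node is:", ray.util.get_node_ip_address())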
10 changes: 6 additions & 4 deletions release/air_tests/air_benchmarks/compute_gpu_2x2_aws.yaml
@@ -1,15 +1,17 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 1
+max_workers: 2
 
 head_node_type:
   name: head_node
-  instance_type: g3.8xlarge
+  instance_type: m5.2xlarge
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: g3.8xlarge
-  max_workers: 1
-  min_workers: 1
+  max_workers: 2
+  min_workers: 2
   use_spot: false
10 changes: 6 additions & 4 deletions release/air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml
@@ -3,15 +3,17 @@ region: us-west1
 allowed_azs:
 - us-west1-b
 
-max_workers: 1
+max_workers: 2
 
 head_node_type:
   name: head_node
-  instance_type: n1-standard-32-nvidia-tesla-t4-2
+  instance_type: n1-standard-8
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: n1-standard-32-nvidia-tesla-t4-2
-  max_workers: 1
-  min_workers: 1
+  max_workers: 2
+  min_workers: 2
   use_spot: false
10 changes: 6 additions & 4 deletions release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml
@@ -1,17 +1,19 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 3
+max_workers: 4
 
 head_node_type:
   name: head_node
-  instance_type: g4dn.12xlarge
+  instance_type: m5.2xlarge
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: g4dn.12xlarge
-  max_workers: 3
-  min_workers: 3
+  max_workers: 4
+  min_workers: 4
   use_spot: false
 
 aws:
10 changes: 6 additions & 4 deletions release/air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml
@@ -3,17 +3,19 @@ region: us-west1
 allowed_azs:
 - us-west1-b
 
-max_workers: 3
+max_workers: 4
 
 head_node_type:
   name: head_node
-  instance_type: n1-standard-64-nvidia-tesla-t4-4
+  instance_type: n1-standard-8
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: n1-standard-64-nvidia-tesla-t4-4
-  max_workers: 3
-  min_workers: 3
+  max_workers: 4
+  min_workers: 4
   use_spot: false
 
 gcp_advanced_configurations_json:
13 changes: 9 additions & 4 deletions release/air_tests/air_benchmarks/workloads/benchmark_util.py
@@ -11,8 +11,11 @@
 
 
 def schedule_remote_fn_on_all_nodes(
-    remote_fn, exclude_head: bool = False, *args, **kwargs
+    remote_fn, exclude_head: bool = True, *args, **kwargs
 ):
+    """Runs remote fn on all worker nodes.
+    Also schedules on the head node if `exclude_head` is False.
+    """
     head_ip = ray.util.get_node_ip_address()
 
     futures = []
@@ -55,13 +58,15 @@ def upload_file_to_all_nodes(path: str):
     return ray.get(futures)
 
 
-@ray.remote
+@ray.remote(num_cpus=0)
 def _run_command(cmd: str):
     return subprocess.check_call(cmd)
 
 
-def run_command_on_all_nodes(cmd: List[str]):
-    futures = schedule_remote_fn_on_all_nodes(_run_command, cmd=cmd)
+def run_command_on_all_nodes(cmd: List[str], exclude_head: bool = True):
+    futures = schedule_remote_fn_on_all_nodes(
+        _run_command, cmd=cmd, exclude_head=exclude_head
+    )
     return ray.get(futures)
 
 
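Taken together, the helper changes mean schedule_remote_fn_on_all_nodes and run_command_on_all_nodes now skip the head node by default, while _run_command being declared with num_cpus=0 lets it still be placed on a zero-CPU head when a caller opts back in. A hedged usage sketch follows; the example commands and the benchmark_util import path are assumptions, not part of this PR.

import ray

from benchmark_util import run_command_on_all_nodes

ray.init(address="auto")  # assumes the benchmark cluster is already running

# Default (exclude_head=True): the command runs only on worker nodes,
# keeping the head node free of benchmark work.
run_command_on_all_nodes(["python", "-c", "print('hello from a worker')"])

# Single-node smoke tests have no workers, so they opt the head back in;
# because _run_command requests num_cpus=0, it can run on the zero-CPU head.
run_command_on_all_nodes(
    ["python", "-c", "print('hello from the head')"],
    exclude_head=False,
)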
@@ -269,7 +269,9 @@ def run(
     path = str((Path(__file__).parent / "_tensorflow_prepare.py").absolute())
 
     upload_file_to_all_nodes(path)
-    run_command_on_all_nodes(["python", path])
+    # NOTE: This includes the head node for the release smoke test that only
+    # runs on a single node.
+    run_command_on_all_nodes(["python", path], exclude_head=False)
 
     times_ray = []
     times_local_ray = []
@@ -418,7 +418,9 @@ def run(
 
     path = str((Path(__file__).parent / "_torch_prepare.py").absolute())
     upload_file_to_all_nodes(path)
-    run_command_on_all_nodes(["python", path])
+    # NOTE: This includes the head node for the release smoke test that only
+    # runs on a single node.
+    run_command_on_all_nodes(["python", path], exclude_head=False)
 
     times_ray = []
     times_local_ray = []
@@ -21,14 +21,14 @@ def prepare_mnist():
 
     print("Preparing Torch benchmark: Downloading MNIST")
 
-    @ray.remote
+    @ray.remote(num_cpus=0)
    def _download_data():
        import torchvision
 
        torchvision.datasets.FashionMNIST("/tmp/data_fashion_mnist", download=True)
        return True
 
-    ray.get(schedule_remote_fn_on_all_nodes(_download_data))
+    ray.get(schedule_remote_fn_on_all_nodes(_download_data, exclude_head=False))
 
 
 def get_trainer(