
Commit

[ci][tune][train] Update release test compute configs to not schedule work on head node (ray-project#48103)

This PR updates the compute configs for the benchmark release tests so that
workers are no longer scheduled onto the head node. This follows the best
practice of keeping heavy work off the head node for cluster stability.
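
As a rough illustration (not part of this change), the scheduling behavior the new configs rely on: a head node whose compute config sets resources: cpu: 0 advertises no CPUs, so remote tasks that request CPUs (the default for a Ray task is one CPU) can only be placed on worker nodes, while zero-CPU utility tasks may still run on the head. A minimal sketch, assuming an already-running cluster launched from one of these configs (the function names are made up for the example):

import ray

ray.init(address="auto")  # connect to the existing cluster

@ray.remote  # defaults to num_cpus=1, so it can never land on a zero-CPU head node
def heavy_work() -> str:
    return ray.util.get_node_ip_address()

@ray.remote(num_cpus=0)  # zero-CPU tasks may still be placed on the head node
def light_work() -> str:
    return ray.util.get_node_ip_address()

print("heavy tasks ran on:", set(ray.get([heavy_work.remote() for _ in range(8)])))
print("light task ran on:", ray.get(light_work.remote()))

This mirrors the two sides of the diff below: resources: cpu: 0 on the head node types, and num_cpus=0 on the small utility tasks that still need to run everywhere.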

---------

Signed-off-by: Justin Yu <[email protected]>
justinvyu authored and JP-sDEV committed Nov 14, 2024
1 parent 5150434 commit 3ac3da9
Showing 16 changed files with 90 additions and 45 deletions.
9 changes: 7 additions & 2 deletions release/air_tests/air_benchmarks/compute_cpu_1_aws.yaml
@@ -1,10 +1,15 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 0
+max_workers: 1
 
 head_node_type:
   name: head_node
   instance_type: m5.2xlarge
 
-worker_node_types: []
+worker_node_types:
+- name: worker_node
+  instance_type: m5.2xlarge
+  max_workers: 1
+  min_workers: 1
+  use_spot: false
9 changes: 7 additions & 2 deletions release/air_tests/air_benchmarks/compute_cpu_1_gce.yaml
@@ -3,10 +3,15 @@ region: us-west1
 allowed_azs:
 - us-west1-b
 
-max_workers: 0
+max_workers: 1
 
 head_node_type:
   name: head_node
   instance_type: n1-standard-8
 
-worker_node_types: []
+worker_node_types:
+- name: worker_node
+  instance_type: n1-standard-8
+  max_workers: 1
+  min_workers: 1
+  use_spot: false
8 changes: 5 additions & 3 deletions release/air_tests/air_benchmarks/compute_cpu_4_aws.yaml
@@ -1,15 +1,17 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 3
+max_workers: 4
 
 head_node_type:
   name: head_node
   instance_type: m5.2xlarge
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: m5.2xlarge
-  max_workers: 3
-  min_workers: 3
+  max_workers: 4
+  min_workers: 4
   use_spot: false
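
The resources: cpu: 0 entry added to the head node here (and in the configs that follow) makes the head node advertise zero CPUs to Ray, so CPU-requesting benchmark tasks cannot be scheduled on it. A quick sanity check, as a sketch that assumes it is run from the head node of a live cluster:

import ray

ray.init(address="auto")
head_ip = ray.util.get_node_ip_address()  # IP of the node this runs on (the head)
head = next(n for n in ray.nodes() if n["NodeManagerAddress"] == head_ip)
print(head["Resources"].get("CPU", 0))  # expected: 0 with the updated configs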
8 changes: 5 additions & 3 deletions release/air_tests/air_benchmarks/compute_cpu_4_gce.yaml
@@ -3,15 +3,17 @@ region: us-west1
 allowed_azs:
 - us-west1-b
 
-max_workers: 3
+max_workers: 4
 
 head_node_type:
   name: head_node
   instance_type: n1-standard-8
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: n1-standard-8
-  max_workers: 3
-  min_workers: 3
+  max_workers: 4
+  min_workers: 4
   use_spot: false
6 changes: 4 additions & 2 deletions release/air_tests/air_benchmarks/compute_cpu_8_aws.yaml
@@ -6,10 +6,12 @@ max_workers: 7
 head_node_type:
   name: head_node
   instance_type: m5.2xlarge
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: m5.2xlarge
-  max_workers: 7
-  min_workers: 7
+  max_workers: 8
+  min_workers: 8
   use_spot: false
8 changes: 5 additions & 3 deletions release/air_tests/air_benchmarks/compute_cpu_8_gce.yaml
@@ -3,15 +3,17 @@ region: us-west1
 allowed_azs:
 - us-west1-b
 
-max_workers: 7
+max_workers: 8
 
 head_node_type:
   name: head_node
   instance_type: n1-standard-8
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: n1-standard-8
-  max_workers: 7
-  min_workers: 7
+  max_workers: 8
+  min_workers: 8
   use_spot: false
11 changes: 8 additions & 3 deletions release/air_tests/air_benchmarks/compute_gpu_1_aws.yaml
@@ -1,13 +1,18 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 0
+max_workers: 1
 
 head_node_type:
   name: head_node
-  instance_type: g3.8xlarge
+  instance_type: m5.2xlarge
 
-worker_node_types: []
+worker_node_types:
+- name: worker_node
+  instance_type: g3.8xlarge
+  max_workers: 1
+  min_workers: 1
+  use_spot: false
 
 aws:
   BlockDeviceMappings:
11 changes: 8 additions & 3 deletions release/air_tests/air_benchmarks/compute_gpu_1_gce.yaml
@@ -3,13 +3,18 @@ region: us-west1
 allowed_azs:
 - us-west1-b
 
-max_workers: 0
+max_workers: 1
 
 head_node_type:
   name: head_node
-  instance_type: n1-standard-32-nvidia-tesla-t4-2
+  instance_type: n1-standard-8
 
-worker_node_types: []
+worker_node_types:
+- name: worker_node
+  instance_type: n1-standard-32-nvidia-tesla-t4-2
+  max_workers: 1
+  min_workers: 1
+  use_spot: false
 
 gcp_advanced_configurations_json:
   instance_properties:
10 changes: 6 additions & 4 deletions release/air_tests/air_benchmarks/compute_gpu_2x2_aws.yaml
@@ -1,15 +1,17 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 1
+max_workers: 2
 
 head_node_type:
   name: head_node
-  instance_type: g3.8xlarge
+  instance_type: m5.2xlarge
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: g3.8xlarge
-  max_workers: 1
-  min_workers: 1
+  max_workers: 2
+  min_workers: 2
   use_spot: false
10 changes: 6 additions & 4 deletions release/air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml
@@ -3,15 +3,17 @@ region: us-west1
 allowed_azs:
 - us-west1-b
 
-max_workers: 1
+max_workers: 2
 
 head_node_type:
   name: head_node
-  instance_type: n1-standard-32-nvidia-tesla-t4-2
+  instance_type: n1-standard-8
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: n1-standard-32-nvidia-tesla-t4-2
-  max_workers: 1
-  min_workers: 1
+  max_workers: 2
+  min_workers: 2
   use_spot: false
10 changes: 6 additions & 4 deletions release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml
@@ -1,17 +1,19 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 3
+max_workers: 4
 
 head_node_type:
   name: head_node
-  instance_type: g4dn.12xlarge
+  instance_type: m5.2xlarge
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: g4dn.12xlarge
-  max_workers: 3
-  min_workers: 3
+  max_workers: 4
+  min_workers: 4
   use_spot: false
 
 aws:
10 changes: 6 additions & 4 deletions release/air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml
@@ -3,17 +3,19 @@ region: us-west1
 allowed_azs:
 - us-west1-b
 
-max_workers: 3
+max_workers: 4
 
 head_node_type:
   name: head_node
-  instance_type: n1-standard-64-nvidia-tesla-t4-4
+  instance_type: n1-standard-8
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: n1-standard-64-nvidia-tesla-t4-4
-  max_workers: 3
-  min_workers: 3
+  max_workers: 4
+  min_workers: 4
   use_spot: false
 
 gcp_advanced_configurations_json:
13 changes: 9 additions & 4 deletions release/air_tests/air_benchmarks/workloads/benchmark_util.py
@@ -11,8 +11,11 @@
 
 
 def schedule_remote_fn_on_all_nodes(
-    remote_fn, exclude_head: bool = False, *args, **kwargs
+    remote_fn, exclude_head: bool = True, *args, **kwargs
 ):
+    """Runs remote fn on all worker nodes.
+    Also schedules on the head node if `exclude_head` is False.
+    """
     head_ip = ray.util.get_node_ip_address()
 
     futures = []
@@ -55,13 +58,15 @@ def upload_file_to_all_nodes(path: str):
     return ray.get(futures)
 
 
-@ray.remote
+@ray.remote(num_cpus=0)
 def _run_command(cmd: str):
     return subprocess.check_call(cmd)
 
 
-def run_command_on_all_nodes(cmd: List[str]):
-    futures = schedule_remote_fn_on_all_nodes(_run_command, cmd=cmd)
+def run_command_on_all_nodes(cmd: List[str], exclude_head: bool = True):
+    futures = schedule_remote_fn_on_all_nodes(
+        _run_command, cmd=cmd, exclude_head=exclude_head
+    )
     return ray.get(futures)
 
 
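
A hypothetical call site for the updated helpers (the import assumes the workloads directory is on the Python path):

from benchmark_util import run_command_on_all_nodes

# New default: the command runs only on worker nodes, keeping the head node free.
run_command_on_all_nodes(["python", "-c", "print('hello from a worker')"])

# Single-node smoke tests opt the head node back in explicitly.
run_command_on_all_nodes(["python", "-c", "print('hello')"], exclude_head=False)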
@@ -269,7 +269,9 @@ def run(
     path = str((Path(__file__).parent / "_tensorflow_prepare.py").absolute())
 
     upload_file_to_all_nodes(path)
-    run_command_on_all_nodes(["python", path])
+    # NOTE: This includes the head node for the release smoke test that only
+    # runs on a single node.
+    run_command_on_all_nodes(["python", path], exclude_head=False)
 
     times_ray = []
     times_local_ray = []
@@ -418,7 +418,9 @@ def run(
 
     path = str((Path(__file__).parent / "_torch_prepare.py").absolute())
     upload_file_to_all_nodes(path)
-    run_command_on_all_nodes(["python", path])
+    # NOTE: This includes the head node for the release smoke test that only
+    # runs on a single node.
+    run_command_on_all_nodes(["python", path], exclude_head=False)
 
     times_ray = []
     times_local_ray = []
@@ -21,14 +21,14 @@ def prepare_mnist():
 
     print("Preparing Torch benchmark: Downloading MNIST")
 
-    @ray.remote
+    @ray.remote(num_cpus=0)
     def _download_data():
         import torchvision
 
         torchvision.datasets.FashionMNIST("/tmp/data_fashion_mnist", download=True)
         return True
 
-    ray.get(schedule_remote_fn_on_all_nodes(_download_data))
+    ray.get(schedule_remote_fn_on_all_nodes(_download_data, exclude_head=False))
 
 
 def get_trainer(
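
For context, a standalone sketch (not taken from this change) of the same fan-out pattern used above: a zero-CPU task pinned to every alive node via Ray's built-in node:<ip> resource, so it can run even on a head node that advertises cpu: 0. The helper name is made up for the example.

import ray

ray.init(address="auto")  # assumes an existing cluster

@ray.remote(num_cpus=0)
def prepare_node() -> str:
    # e.g. download a dataset onto this node's local disk
    return ray.util.get_node_ip_address()

futures = [
    prepare_node.options(
        resources={f"node:{node['NodeManagerAddress']}": 0.001}
    ).remote()
    for node in ray.nodes()
    if node["Alive"]
]
print("prepared nodes:", ray.get(futures))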
