
[ci][tune][train] Update release test compute configs to not schedule work on head node #48103

Merged
9 changes: 7 additions & 2 deletions release/air_tests/air_benchmarks/compute_cpu_1_aws.yaml
@@ -1,10 +1,15 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 0
+max_workers: 1
 
 head_node_type:
   name: head_node
   instance_type: m5.2xlarge
 
-worker_node_types: []
+worker_node_types:
+- name: worker_node
+  instance_type: m5.2xlarge
+  max_workers: 1
+  min_workers: 1
+  use_spot: false
9 changes: 7 additions & 2 deletions release/air_tests/air_benchmarks/compute_cpu_1_gce.yaml
@@ -3,10 +3,15 @@ region: us-west1
 allowed_azs:
 - us-west1-b
 
-max_workers: 0
+max_workers: 1
 
 head_node_type:
   name: head_node
   instance_type: n1-standard-8
 
-worker_node_types: []
+worker_node_types:
+- name: worker_node
+  instance_type: n1-standard-8
+  max_workers: 1
+  min_workers: 1
+  use_spot: false
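The two single-CPU configs above (AWS and GCE) now provision one dedicated worker of the same instance type instead of a head-only cluster, so the benchmark can keep its work off the head node. Below is a minimal, illustrative way to check from the Ray driver which node a task actually landed on; the ray.init(address="auto") connection and the printed comparison are assumptions for this sketch, not part of the PR.

import ray

# Assumes the benchmark cluster defined by the configs above is already running.
ray.init(address="auto")


@ray.remote(num_cpus=1)
def task_node_ip() -> str:
    # Reports the IP of whichever node Ray scheduled this task on.
    return ray.util.get_node_ip_address()


head_ip = ray.util.get_node_ip_address()  # the driver runs on the head node
task_ip = ray.get(task_node_ip.remote())
print(f"head={head_ip} task={task_ip} ran_off_head={task_ip != head_ip}")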
8 changes: 5 additions & 3 deletions release/air_tests/air_benchmarks/compute_cpu_4_aws.yaml
@@ -1,15 +1,17 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 3
+max_workers: 4
 
 head_node_type:
   name: head_node
   instance_type: m5.2xlarge
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: m5.2xlarge
-  max_workers: 3
-  min_workers: 3
+  max_workers: 4
+  min_workers: 4
   use_spot: false
8 changes: 5 additions & 3 deletions release/air_tests/air_benchmarks/compute_cpu_4_gce.yaml
@@ -3,15 +3,17 @@ region: us-west1
 allowed_azs:
 - us-west1-b
 
-max_workers: 3
+max_workers: 4
 
 head_node_type:
   name: head_node
   instance_type: n1-standard-8
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: n1-standard-8
-  max_workers: 3
-  min_workers: 3
+  max_workers: 4
+  min_workers: 4
   use_spot: false
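The multi-node CPU configs pair the worker-count bump with resources: cpu: 0 on the head node, which overrides the head's advertised CPU count so the scheduler has no CPU slots there; any task or actor that requests CPUs must then land on a worker. A hedged driver-side inspection sketch follows (assumes a driver connected to such a cluster; not code from this PR).

import ray

ray.init(address="auto")  # assumes the cluster from the configs above is up

# The head node's entry should report no CPU resource after the override.
for node in ray.nodes():
    if node["Alive"]:
        print(node["NodeManagerAddress"], "CPU =", node["Resources"].get("CPU", 0))


@ray.remote(num_cpus=1)
def where() -> str:
    return ray.util.get_node_ip_address()


# Every CPU-requesting task should report a worker-node IP, never the head's.
print(set(ray.get([where.remote() for _ in range(8)])))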
6 changes: 4 additions & 2 deletions release/air_tests/air_benchmarks/compute_cpu_8_aws.yaml
@@ -6,10 +6,12 @@ max_workers: 7
 head_node_type:
   name: head_node
   instance_type: m5.2xlarge
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: m5.2xlarge
-  max_workers: 7
-  min_workers: 7
+  max_workers: 8
+  min_workers: 8
   use_spot: false
8 changes: 5 additions & 3 deletions release/air_tests/air_benchmarks/compute_cpu_8_gce.yaml
@@ -3,15 +3,17 @@ region: us-west1
 allowed_azs:
 - us-west1-b
 
-max_workers: 7
+max_workers: 8
 
 head_node_type:
   name: head_node
   instance_type: n1-standard-8
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: n1-standard-8
-  max_workers: 7
-  min_workers: 7
+  max_workers: 8
+  min_workers: 8
   use_spot: false
11 changes: 8 additions & 3 deletions release/air_tests/air_benchmarks/compute_gpu_1_aws.yaml
@@ -1,13 +1,18 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 0
+max_workers: 1
 
 head_node_type:
   name: head_node
-  instance_type: g3.8xlarge
+  instance_type: m5.2xlarge
 
-worker_node_types: []
+worker_node_types:
+- name: worker_node
+  instance_type: g3.8xlarge
+  max_workers: 1
+  min_workers: 1
+  use_spot: false
 
 aws:
   BlockDeviceMappings:
11 changes: 8 additions & 3 deletions release/air_tests/air_benchmarks/compute_gpu_1_gce.yaml
@@ -3,13 +3,18 @@ region: us-west1
 allowed_azs:
 - us-west1-b
 
-max_workers: 0
+max_workers: 1
 
 head_node_type:
   name: head_node
-  instance_type: n1-standard-32-nvidia-tesla-t4-2
+  instance_type: n1-standard-8
 
-worker_node_types: []
+worker_node_types:
+- name: worker_node
+  instance_type: n1-standard-32-nvidia-tesla-t4-2
+  max_workers: 1
+  min_workers: 1
+  use_spot: false
 
 gcp_advanced_configurations_json:
   instance_properties:
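In the single-GPU configs, the head node also changes to a CPU-only instance (m5.2xlarge / n1-standard-8) and the GPU machine moves to a worker, so anything that requests a GPU can only be scheduled on the worker. A short illustrative check follows (assumed driver-side code, not from this PR).

import ray

ray.init(address="auto")  # assumes the single-GPU cluster above is running


@ray.remote(num_gpus=1)
def gpu_node_ip() -> str:
    # Can only run on a node that advertises a GPU, i.e. the worker,
    # never the CPU-only head node.
    return ray.util.get_node_ip_address()


print("GPU task ran on:", ray.get(gpu_node_ip.remote()))
print("Head node is:", ray.util.get_node_ip_address())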
10 changes: 6 additions & 4 deletions release/air_tests/air_benchmarks/compute_gpu_2x2_aws.yaml
@@ -1,15 +1,17 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 1
+max_workers: 2
 
 head_node_type:
   name: head_node
-  instance_type: g3.8xlarge
+  instance_type: m5.2xlarge
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: g3.8xlarge
-  max_workers: 1
-  min_workers: 1
+  max_workers: 2
+  min_workers: 2
   use_spot: false
10 changes: 6 additions & 4 deletions release/air_tests/air_benchmarks/compute_gpu_2x2_gce.yaml
@@ -3,15 +3,17 @@ region: us-west1
 allowed_azs:
 - us-west1-b
 
-max_workers: 1
+max_workers: 2
 
 head_node_type:
   name: head_node
-  instance_type: n1-standard-32-nvidia-tesla-t4-2
+  instance_type: n1-standard-8
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: n1-standard-32-nvidia-tesla-t4-2
-  max_workers: 1
-  min_workers: 1
+  max_workers: 2
+  min_workers: 2
   use_spot: false
10 changes: 6 additions & 4 deletions release/air_tests/air_benchmarks/compute_gpu_4x4_aws.yaml
@@ -1,17 +1,19 @@
 cloud_id: {{env["ANYSCALE_CLOUD_ID"]}}
 region: us-west-2
 
-max_workers: 3
+max_workers: 4
 
 head_node_type:
   name: head_node
-  instance_type: g4dn.12xlarge
+  instance_type: m5.2xlarge
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: g4dn.12xlarge
-  max_workers: 3
-  min_workers: 3
+  max_workers: 4
+  min_workers: 4
   use_spot: false
 
 aws:
10 changes: 6 additions & 4 deletions release/air_tests/air_benchmarks/compute_gpu_4x4_gce.yaml
@@ -3,17 +3,19 @@ region: us-west1
 allowed_azs:
 - us-west1-b
 
-max_workers: 3
+max_workers: 4
 
 head_node_type:
   name: head_node
-  instance_type: n1-standard-64-nvidia-tesla-t4-4
+  instance_type: n1-standard-8
+  resources:
+    cpu: 0
 
 worker_node_types:
 - name: worker_node
   instance_type: n1-standard-64-nvidia-tesla-t4-4
-  max_workers: 3
-  min_workers: 3
+  max_workers: 4
+  min_workers: 4
   use_spot: false
 
 gcp_advanced_configurations_json:
13 changes: 9 additions & 4 deletions release/air_tests/air_benchmarks/workloads/benchmark_util.py
@@ -11,8 +11,11 @@
 
 
 def schedule_remote_fn_on_all_nodes(
-    remote_fn, exclude_head: bool = False, *args, **kwargs
+    remote_fn, exclude_head: bool = True, *args, **kwargs
 ):
+    """Runs remote fn on all worker nodes.
+    Also schedules on the head node if `exclude_head` is False.
+    """
     head_ip = ray.util.get_node_ip_address()
 
     futures = []
@@ -55,13 +58,15 @@ def upload_file_to_all_nodes(path: str):
     return ray.get(futures)
 
 
-@ray.remote
+@ray.remote(num_cpus=0)
 def _run_command(cmd: str):
     return subprocess.check_call(cmd)
 
 
-def run_command_on_all_nodes(cmd: List[str]):
-    futures = schedule_remote_fn_on_all_nodes(_run_command, cmd=cmd)
+def run_command_on_all_nodes(cmd: List[str], exclude_head: bool = True):
+    futures = schedule_remote_fn_on_all_nodes(
+        _run_command, cmd=cmd, exclude_head=exclude_head
+    )
     return ray.get(futures)
 
 
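Taken together, the helper changes mean schedule_remote_fn_on_all_nodes and run_command_on_all_nodes now skip the head node by default, while _run_command being declared with num_cpus=0 lets it still be placed on a zero-CPU head when a caller opts back in. A hedged usage sketch follows; the example commands and the benchmark_util import path are assumptions, not part of this PR.

import ray

from benchmark_util import run_command_on_all_nodes

ray.init(address="auto")  # assumes the benchmark cluster is already running

# Default (exclude_head=True): the command runs only on worker nodes,
# keeping the head node free of benchmark work.
run_command_on_all_nodes(["python", "-c", "print('hello from a worker')"])

# Single-node smoke tests have no workers, so they opt the head back in;
# because _run_command requests num_cpus=0, it can run on the zero-CPU head.
run_command_on_all_nodes(
    ["python", "-c", "print('hello from the head')"],
    exclude_head=False,
)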
@@ -269,7 +269,9 @@ def run(
     path = str((Path(__file__).parent / "_tensorflow_prepare.py").absolute())
 
     upload_file_to_all_nodes(path)
-    run_command_on_all_nodes(["python", path])
+    # NOTE: This includes the head node for the release smoke test that only
+    # runs on a single node.
+    run_command_on_all_nodes(["python", path], exclude_head=False)
 
     times_ray = []
     times_local_ray = []
@@ -418,7 +418,9 @@ def run(
 
     path = str((Path(__file__).parent / "_torch_prepare.py").absolute())
     upload_file_to_all_nodes(path)
-    run_command_on_all_nodes(["python", path])
+    # NOTE: This includes the head node for the release smoke test that only
+    # runs on a single node.
+    run_command_on_all_nodes(["python", path], exclude_head=False)
 
     times_ray = []
     times_local_ray = []
@@ -21,14 +21,14 @@ def prepare_mnist():
 
     print("Preparing Torch benchmark: Downloading MNIST")
 
-    @ray.remote
+    @ray.remote(num_cpus=0)
    def _download_data():
        import torchvision
 
        torchvision.datasets.FashionMNIST("/tmp/data_fashion_mnist", download=True)
        return True
 
-    ray.get(schedule_remote_fn_on_all_nodes(_download_data))
+    ray.get(schedule_remote_fn_on_all_nodes(_download_data, exclude_head=False))
 
 
 def get_trainer(