Skip to content

Commit

Permalink
[core] Migrate many_nodes_actor_tests to new cloud. (#31863)
Browse files Browse the repository at this point in the history
This PR makes the test run on the new cloud to prevent regressions.
  • Loading branch information
fishbone authored Jan 25, 2023
1 parent b7d6f2f commit d9dd326
Show file tree
Hide file tree
Showing 5 changed files with 17 additions and 7 deletions.
2 changes: 1 addition & 1 deletion release/nightly_tests/many_nodes_tests/actor_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,11 +34,11 @@ def main():
args, unknown = parse_script_args()

ray.init(address="auto")

actor_launch_start = perf_counter()
actors = test_max_actors_launch(args.cpus_per_actor, args.total_actors)
actor_launch_end = perf_counter()
actor_launch_time = actor_launch_end - actor_launch_start

if args.fail:
sleep(10)
return
Expand Down
2 changes: 1 addition & 1 deletion release/nightly_tests/many_nodes_tests/compute_config.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
- cloud_id: cld_4F7k8814aZzGG8TNUGPKnc
+ cloud_id: cld_kvedZWag2qA8i5BjxUevf5i7

region: us-west-2

Expand Down
9 changes: 8 additions & 1 deletion release/ray_release/command_runner/command_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@

from ray_release.cluster_manager.cluster_manager import ClusterManager
from ray_release.file_manager.file_manager import FileManager
+ from ray_release.util import exponential_backoff_retry
+ from click.exceptions import ClickException


class CommandRunner(abc.ABC):
Expand Down Expand Up @@ -85,7 +87,12 @@ def run_prepare_command(
Command runners may choose to run this differently than the
test command.
"""
- return self.run_command(command, env, timeout)
+ return exponential_backoff_retry(
+ lambda: self.run_command(command, env, timeout),
+ ClickException,
+ initial_retry_delay_s=5,
+ max_retries=3,
+ )

def get_last_logs(self):
raise NotImplementedError
Expand Down
5 changes: 3 additions & 2 deletions release/ray_release/file_manager/job_file_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,9 @@ def __init__(self, cluster_manager: ClusterManager):
self.s3_client = boto3.client("s3")
self.bucket = str(RELEASE_AWS_BUCKET)
self.job_manager = JobManager(cluster_manager)

- sys.path.insert(0, f"{anyscale.ANYSCALE_RAY_DIR}/bin")
+ # Backward compatible
+ if "ANYSCALE_RAY_DIR" in anyscale.__dict__:
+ sys.path.insert(0, f"{anyscale.ANYSCALE_RAY_DIR}/bin")

def _run_with_retry(self, f, initial_retry_delay_s: int = 10):
assert callable(f)
Expand Down
6 changes: 4 additions & 2 deletions release/release_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3881,16 +3881,18 @@

frequency: nightly-3x
team: core
env: staging
cluster:
cluster_env: many_nodes_tests/app_config.yaml
cluster_compute: many_nodes_tests/compute_config.yaml

run:
timeout: 7200
- script: python many_nodes_tests/actor_test.py
+ # 4cpus per node x 250 nodes / 0.2 cpus per actor = 5k
+ script: python many_nodes_tests/actor_test.py --cpus-per-actor=0.2 --total-actors=5000
wait_for_nodes:
num_nodes: 251

type: job

#- name: many_nodes_multi_master_test
# group: core-daily-test
Expand Down

0 comments on commit d9dd326

Please sign in to comment.