From 51c98d3213bc9889fd2432270e6bedf0a0386f95 Mon Sep 17 00:00:00 2001
From: Christopher Cooper
Date: Wed, 23 Oct 2024 17:18:10 -0700
Subject: [PATCH] [UX] skip provisioning stages if cluster is already available

---
 sky/backends/cloud_vm_ray_backend.py |  2 +-
 sky/execution.py                     | 35 +++++++++++++++++++++++++++-
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py
index f0fb4d97ba1..e9f9092469d 100644
--- a/sky/backends/cloud_vm_ray_backend.py
+++ b/sky/backends/cloud_vm_ray_backend.py
@@ -2737,7 +2737,7 @@ def _provision(
                 (e.g., cluster name invalid) or a region/zone throwing
                 resource unavailability.
             exceptions.CommandError: any ssh command error.
-            RuntimeErorr: raised when 'rsync' is not installed.
+            RuntimeError: raised when 'rsync' is not installed.
             # TODO(zhwu): complete the list of exceptions.
         """
         # FIXME: ray up for Azure with different cluster_names will overwrite
diff --git a/sky/execution.py b/sky/execution.py
index d9a346a99cf..0826b512e2d 100644
--- a/sky/execution.py
+++ b/sky/execution.py
@@ -15,6 +15,7 @@
 from sky import optimizer
 from sky import sky_logging
 from sky.backends import backend_utils
+from sky.exceptions import ClusterNotUpError
 from sky.usage import usage_lib
 from sky.utils import admin_policy_utils
 from sky.utils import controller_utils
@@ -451,15 +452,47 @@ def launch(
         controller_utils.check_cluster_name_not_controller(
             cluster_name, operation_str='sky.launch')
 
+    handle = None
+    stages = None
+    # Check if cluster exists
+    if cluster_name is not None:
+        maybe_handle = global_user_state.get_handle_from_cluster_name(
+            cluster_name)
+        if maybe_handle is not None:
+            try:
+                # This will throw if the cluster is not available
+                backend_utils.check_cluster_available(
+                    cluster_name,
+                    operation='executing tasks',
+                    check_cloud_vm_ray_backend=False,
+                    dryrun=dryrun)
+                # If the cluster is available, restrict stages
+                handle = maybe_handle
+                stages = [
+                    # Stage.CLONE_DISK,
+                    # Stage.PROVISION,
+                    # Stage.OPTIMIZE,
+                    Stage.SYNC_WORKDIR,
+                    Stage.SYNC_FILE_MOUNTS,
+                    # Stage.SETUP,
+                    Stage.PRE_EXEC,
+                    Stage.EXEC,
+                    Stage.DOWN
+                ]
+            except ClusterNotUpError:
+                # Proceed with normal provisioning
+                pass
+
     return _execute(
         entrypoint=entrypoint,
         dryrun=dryrun,
         down=down,
         stream_logs=stream_logs,
-        handle=None,
+        handle=handle,
         backend=backend,
         retry_until_up=retry_until_up,
         optimize_target=optimize_target,
+        stages=stages,
         cluster_name=cluster_name,
         detach_setup=detach_setup,
         detach_run=detach_run,