diff --git a/docs/source-app/workflows/byoc/index.rst b/docs/source-app/workflows/byoc/index.rst index 2cabf046939ba..1db2965dc704e 100644 --- a/docs/source-app/workflows/byoc/index.rst +++ b/docs/source-app/workflows/byoc/index.rst @@ -63,7 +63,7 @@ Parameters ^^^^^^^^^^ +------------------------+----------------------------------------------------------------------------------------------------+ -|Parameter | Descritption | +|Parameter | Description | +========================+====================================================================================================+ | provider | The cloud provider where your cluster is located. | | | | @@ -78,10 +78,6 @@ Parameters +------------------------+----------------------------------------------------------------------------------------------------+ | region | AWS region containing compute resources | +------------------------+----------------------------------------------------------------------------------------------------+ -| instance-types | Instance types that you want to support, for computer jobs within the cluster. | -| | | -| | For now, this is the AWS instance types supported by the cluster. | -+------------------------+----------------------------------------------------------------------------------------------------+ | enable-performance | Specifies if the cluster uses cost savings mode. | | | | | | In cost saving mode the number of compute nodes is reduced to one, reducing the cost for clusters | @@ -89,7 +85,7 @@ Parameters +------------------------+----------------------------------------------------------------------------------------------------+ | edit-before-creation | Enables interactive editing of requests before submitting it to Lightning AI. | +------------------------+----------------------------------------------------------------------------------------------------+ -| wait | Waits for the cluster to be in a RUNNING state. Only use this for debugging. | +| no-wait | Cluster creation will happen in the background. | +------------------------+----------------------------------------------------------------------------------------------------+ ---- diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 5c750bb8f48a4..ddd5469a2c0fe 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -19,6 +19,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Changed +- Cluster creation and deletion now waits by default [#15458](https://github.com/Lightning-AI/lightning/pull/15458) - Changed the `flow.flows` to be recursive wont to align the behavior with the `flow.works` ([#15466](https://github.com/Lightning-AI/lightning/pull/15466)) - diff --git a/src/lightning_app/cli/cmd_clusters.py b/src/lightning_app/cli/cmd_clusters.py index 4bb8b1fdb793f..9211e120d952e 100644 --- a/src/lightning_app/cli/cmd_clusters.py +++ b/src/lightning_app/cli/cmd_clusters.py @@ -2,6 +2,7 @@ import re import time from datetime import datetime +from enum import Enum from textwrap import dedent from typing import Any, List @@ -29,6 +30,23 @@ MAX_CLUSTER_WAIT_TIME = 5400 +class ClusterState(Enum): + UNSPECIFIED = "unspecified" + QUEUED = "queued" + PENDING = "pending" + RUNNING = "running" + FAILED = "failed" + DELETED = "deleted" + + def __str__(self) -> str: + return str(self.value) + + @classmethod + def from_api(cls, status: V1ClusterState) -> "ClusterState": + parsed = str(status).lower().split("_", maxsplit=2)[-1] + return cls(parsed) + + class ClusterList(Formatable): def __init__(self, clusters: List[Externalv1Cluster]): self.clusters = clusters @@ -130,9 +148,6 @@ def create( click.echo("cluster unchanged") resp = self.api_client.cluster_service_create_cluster(body=new_body) - if wait: - _wait_for_cluster_state(self.api_client, resp.id, V1ClusterState.RUNNING) - click.echo( dedent( f"""\ @@ -146,6 +161,13 @@ def create( """ ) ) + if wait: + click.echo("Waiting for cluster to enter state running...") + click.echo( + "Canceling this operation will NOT stop the cluster from creating" + f"(use `lightning delete cluster {resp.id}`)" + ) + _wait_for_cluster_state(self.api_client, resp.id, V1ClusterState.RUNNING) def get_clusters(self) -> ClusterList: resp = self.api_client.cluster_service_list_clusters(phase_not_in=[V1ClusterState.DELETED]) @@ -171,6 +193,8 @@ def delete(self, cluster_id: str, force: bool = False, wait: bool = False) -> No click.echo("Cluster deletion triggered successfully") if wait: + click.echo("Waiting for cluster to delete...") + click.echo("Canceling the operation will NOT stop the cluster from deleting") _wait_for_cluster_state(self.api_client, cluster_id, V1ClusterState.DELETED) @@ -183,6 +207,8 @@ def _wait_for_cluster_state( ) -> None: """_wait_for_cluster_state waits until the provided cluster has reached a desired state, or failed. + Messages will be displayed to the user as the cluster changes state. + Args: api_client: LightningClient used for polling cluster_id: Specifies the cluster to wait for @@ -192,6 +218,7 @@ def _wait_for_cluster_state( """ start = time.time() elapsed = 0 + while elapsed < max_wait_time: cluster_resp = api_client.cluster_service_list_clusters() new_cluster = None @@ -200,10 +227,14 @@ def _wait_for_cluster_state( new_cluster = clust break if new_cluster is not None: + echo_cluster_status_long( + cluster_id=cluster_id, + current_state=new_cluster.status.phase, + current_reason=new_cluster.status.reason, + desired_state=target_state, + ) if new_cluster.status.phase == target_state: break - elif new_cluster.status.phase == V1ClusterState.FAILED: - raise click.ClickException(f"Cluster {cluster_id} is in failed state.") time.sleep(check_timeout) elapsed = int(time.time() - start) else: @@ -219,3 +250,49 @@ def _check_cluster_name_is_valid(_ctx: Any, _param: Any, value: str) -> str: Provide a cluster name using valid characters and try again.""" ) return value + + +def echo_cluster_status_long( + cluster_id: str, + current_state: V1ClusterState, + current_reason: str, + desired_state: V1ClusterState, +) -> None: + """Echos a long-form status message to the user about the cluster state. + + Args: + cluster_id: The name of the cluster + current_state: The cluster's current state + reason: The reason for the cluster's state + """ + + state_str = ClusterState.from_api(current_state) + + message = f"Cluster {cluster_id} is now {state_str}" + ( + f" with the following reason: {current_reason}" if current_reason else "" + ) + + if current_state == V1ClusterState.RUNNING and desired_state == V1ClusterState.RUNNING: + message = "\n".join( + [ + f"Cluster {cluster_id} is now running and ready to use.", + f"To launch an app on this cluster use `lightning run app app.py --cloud --cluster-id {cluster_id}`", + ] + ) + if current_state == V1ClusterState.RUNNING and desired_state == V1ClusterState.DELETED: + message = f"Cluster {cluster_id} is terminating" + if current_state == V1ClusterState.FAILED: + message = "\n".join( + [ + message, + "We are automatically retrying cluster creation.", + "In case you want to delete this cluster:", + "1. Stop this command", + f"2. Run `lightning delete cluster {cluster_id}", + "WARNING: Any non-deleted cluster can consume cloud resources and incur cost to you.", + ] + ) + if current_state == V1ClusterState.DELETED: + message = f"Cluster {cluster_id} has been deleted." + + click.echo(message) diff --git a/src/lightning_app/cli/lightning_cli_create.py b/src/lightning_app/cli/lightning_cli_create.py index 34c5d356f4989..a34b2964ea3b7 100644 --- a/src/lightning_app/cli/lightning_cli_create.py +++ b/src/lightning_app/cli/lightning_cli_create.py @@ -33,6 +33,7 @@ def create() -> None: type=bool, required=False, default=False, + hidden=True, is_flag=True, help=""""Use this flag to ensure that the cluster is created with a profile that is optimized for performance. This makes runs more expensive but start-up times decrease.""", @@ -41,16 +42,17 @@ def create() -> None: "--edit-before-creation", default=False, is_flag=True, + hidden=True, help="Edit the cluster specs before submitting them to the API server.", ) @click.option( - "--wait", - "wait", + "--async", + "no_wait", type=bool, required=False, default=False, is_flag=True, - help="Enabling this flag makes the CLI wait until the cluster is running.", + help="This flag makes the CLI return immediately and lets the cluster creation happen in the background.", ) def create_cluster( cluster_name: str, @@ -60,7 +62,7 @@ def create_cluster( provider: str, edit_before_creation: bool, enable_performance: bool, - wait: bool, + no_wait: bool, **kwargs: Any, ) -> None: """Create a Lightning AI BYOC compute cluster with your cloud provider credentials.""" @@ -75,5 +77,5 @@ def create_cluster( external_id=external_id, edit_before_creation=edit_before_creation, cost_savings=not enable_performance, - wait=wait, + wait=not no_wait, ) diff --git a/src/lightning_app/cli/lightning_cli_delete.py b/src/lightning_app/cli/lightning_cli_delete.py index 91b0222b28c04..eed2e3481a302 100644 --- a/src/lightning_app/cli/lightning_cli_delete.py +++ b/src/lightning_app/cli/lightning_cli_delete.py @@ -24,15 +24,15 @@ def delete() -> None: WARNING: You should NOT use this under normal circumstances.""", ) @click.option( - "--wait", - "wait", + "--no-wait", + "no_wait", type=bool, required=False, default=False, is_flag=True, - help="Enabling this flag makes the CLI wait until the cluster is deleted.", + help="This flag makes the CLI return immediately and lets the cluster deletion happen in the background", ) -def delete_cluster(cluster: str, force: bool = False, wait: bool = False) -> None: +def delete_cluster(cluster: str, force: bool = False, no_wait: bool = False) -> None: """Delete a Lightning AI BYOC compute cluster and all associated cloud provider resources. Deleting a run also deletes all Runs and Experiments that were started on the cluster. @@ -46,4 +46,4 @@ def delete_cluster(cluster: str, force: bool = False, wait: bool = False) -> Non All object stores, container registries, logs, compute nodes, volumes, etc. are deleted and cannot be recovered. """ cluster_manager = AWSClusterManager() - cluster_manager.delete(cluster_id=cluster, force=force, wait=wait) + cluster_manager.delete(cluster_id=cluster, force=force, wait=not no_wait) diff --git a/tests/tests_app/cli/test_cli.py b/tests/tests_app/cli/test_cli.py index 9ccf543f9dac7..f5d3a31b9240c 100644 --- a/tests/tests_app/cli/test_cli.py +++ b/tests/tests_app/cli/test_cli.py @@ -130,7 +130,7 @@ def test_create_cluster(create_command: mock.MagicMock, extra_arguments, expecte external_id="dummy", edit_before_creation=False, cost_savings=expected_cost_savings_mode, - wait=False, + wait=True, ) @@ -158,7 +158,7 @@ def test_delete_cluster(delete: mock.MagicMock): runner = CliRunner() runner.invoke(delete_cluster, ["test-7"]) - delete.assert_called_once_with(cluster_id="test-7", force=False, wait=False) + delete.assert_called_once_with(cluster_id="test-7", force=False, wait=True) @mock.patch("lightning_app.utilities.login.Auth._run_server") diff --git a/tests/tests_app/cli/test_cmd_clusters.py b/tests/tests_app/cli/test_cmd_clusters.py index 92df6c172c9f0..a0ba91cb490e5 100644 --- a/tests/tests_app/cli/test_cmd_clusters.py +++ b/tests/tests_app/cli/test_cmd_clusters.py @@ -1,5 +1,5 @@ from unittest import mock -from unittest.mock import MagicMock +from unittest.mock import call, MagicMock import click import pytest @@ -30,7 +30,7 @@ def __init__(self, list_responses=[], consume=True): def cluster_service_list_clusters(self, phase_not_in=None): self.list_call_count = self.list_call_count + 1 if self.consume: - return self.list_responses.pop() + return self.list_responses.pop(0) return self.list_responses[0] @@ -113,8 +113,8 @@ def test_happy_path(self, target_state, previous_state): for state in [previous_state, target_state] ] ) - cmd_clusters._wait_for_cluster_state(client, "test-cluster", target_state, check_timeout=0.1) - assert client.list_call_count == 1 + cmd_clusters._wait_for_cluster_state(client, "test-cluster", target_state, poll_duration=0.1) + assert client.list_call_count == 2 @pytest.mark.parametrize("target_state", [V1ClusterState.RUNNING, V1ClusterState.DELETED]) def test_times_out(self, target_state): @@ -129,7 +129,115 @@ def test_times_out(self, target_state): consume=False, ) with pytest.raises(click.ClickException) as e: - cmd_clusters._wait_for_cluster_state( - client, "test-cluster", target_state, max_wait_time=0.4, check_timeout=0.2 - ) + cmd_clusters._wait_for_cluster_state(client, "test-cluster", target_state, timeout=0.4, poll_duration=0.2) assert "Max wait time elapsed" in str(e.value) + + @mock.patch("click.echo") + def test_echo_state_change_on_desired_running(self, echo: MagicMock): + client = FakeLightningClient( + list_responses=[ + V1ListClustersResponse( + clusters=[ + Externalv1Cluster( + id="test-cluster", + status=V1ClusterStatus( + phase=state, + reason=reason, + ), + ), + ] + ) + for state, reason in [ + (V1ClusterState.QUEUED, ""), + (V1ClusterState.PENDING, ""), + (V1ClusterState.PENDING, ""), + (V1ClusterState.PENDING, ""), + (V1ClusterState.FAILED, "some error"), + (V1ClusterState.PENDING, "retrying failure"), + (V1ClusterState.RUNNING, ""), + ] + ], + ) + + cmd_clusters._wait_for_cluster_state( + client, + "test-cluster", + target_state=V1ClusterState.RUNNING, + timeout=0.6, + poll_duration=0.1, + ) + + assert client.list_call_count == 7 + assert echo.call_count == 7 + echo.assert_has_calls( + [ + call("Cluster test-cluster is now queued"), + call("Cluster test-cluster is now pending"), + call("Cluster test-cluster is now pending"), + call("Cluster test-cluster is now pending"), + call( + "\n".join( + [ + "Cluster test-cluster is now failed with the following reason: some error", + "We are automatically retrying cluster creation.", + "In case you want to delete this cluster:", + "1. Stop this command", + "2. Run `lightning delete cluster test-cluster", + "WARNING: Any non-deleted cluster can consume cloud resources and incur cost to you.", + ] + ) + ), + call("Cluster test-cluster is now pending with the following reason: retrying failure"), + call( + "\n".join( + [ + "Cluster test-cluster is now running and ready to use.", + "To launch an app on this cluster use " + "`lightning run app app.py --cloud --cluster-id test-cluster`", + ] + ) + ), + ] + ) + + @mock.patch("click.echo") + def test_echo_state_change_on_desired_deleted(self, echo: MagicMock): + client = FakeLightningClient( + list_responses=[ + V1ListClustersResponse( + clusters=[ + Externalv1Cluster( + id="test-cluster", + status=V1ClusterStatus( + phase=state, + ), + ), + ] + ) + for state in [ + V1ClusterState.RUNNING, + V1ClusterState.RUNNING, + V1ClusterState.RUNNING, + V1ClusterState.DELETED, + ] + ], + ) + + cmd_clusters._wait_for_cluster_state( + client, + "test-cluster", + target_state=V1ClusterState.DELETED, + timeout=0.4, + poll_duration=0.1, + ) + + assert client.list_call_count == 4 + assert echo.call_count == 4 + echo.assert_has_calls( + [ + call("Cluster test-cluster is terminating"), + call("Cluster test-cluster is terminating"), + call("Cluster test-cluster is terminating"), + call("Cluster test-cluster has been deleted."), + ] + )