From d3215ad0a43c93c3a18d3f2d74413e6e82c19973 Mon Sep 17 00:00:00 2001 From: Luca Furst Date: Tue, 1 Nov 2022 16:46:49 -0400 Subject: [PATCH 01/23] Wait by default and notify on state changes --- docs/source-app/workflows/byoc/index.rst | 8 +- src/lightning_app/CHANGELOG.md | 1 + src/lightning_app/cli/cmd_clusters.py | 87 ++++++++++++- src/lightning_app/cli/lightning_cli_create.py | 12 +- src/lightning_app/cli/lightning_cli_delete.py | 10 +- tests/tests_app/cli/test_cli.py | 4 +- tests/tests_app/cli/test_cmd_clusters.py | 122 +++++++++++++++++- 7 files changed, 214 insertions(+), 30 deletions(-) diff --git a/docs/source-app/workflows/byoc/index.rst b/docs/source-app/workflows/byoc/index.rst index 2cabf046939ba..1db2965dc704e 100644 --- a/docs/source-app/workflows/byoc/index.rst +++ b/docs/source-app/workflows/byoc/index.rst @@ -63,7 +63,7 @@ Parameters ^^^^^^^^^^ +------------------------+----------------------------------------------------------------------------------------------------+ -|Parameter | Descritption | +|Parameter | Description | +========================+====================================================================================================+ | provider | The cloud provider where your cluster is located. | | | | @@ -78,10 +78,6 @@ Parameters +------------------------+----------------------------------------------------------------------------------------------------+ | region | AWS region containing compute resources | +------------------------+----------------------------------------------------------------------------------------------------+ -| instance-types | Instance types that you want to support, for computer jobs within the cluster. | -| | | -| | For now, this is the AWS instance types supported by the cluster. | -+------------------------+----------------------------------------------------------------------------------------------------+ | enable-performance | Specifies if the cluster uses cost savings mode. | | | | | | In cost saving mode the number of compute nodes is reduced to one, reducing the cost for clusters | @@ -89,7 +85,7 @@ Parameters +------------------------+----------------------------------------------------------------------------------------------------+ | edit-before-creation | Enables interactive editing of requests before submitting it to Lightning AI. | +------------------------+----------------------------------------------------------------------------------------------------+ -| wait | Waits for the cluster to be in a RUNNING state. Only use this for debugging. | +| no-wait | Cluster creation will happen in the background. | +------------------------+----------------------------------------------------------------------------------------------------+ ---- diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 5c750bb8f48a4..ddd5469a2c0fe 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -19,6 +19,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Changed +- Cluster creation and deletion now waits by default [#15458](https://github.com/Lightning-AI/lightning/pull/15458) - Changed the `flow.flows` to be recursive wont to align the behavior with the `flow.works` ([#15466](https://github.com/Lightning-AI/lightning/pull/15466)) - diff --git a/src/lightning_app/cli/cmd_clusters.py b/src/lightning_app/cli/cmd_clusters.py index 4bb8b1fdb793f..9211e120d952e 100644 --- a/src/lightning_app/cli/cmd_clusters.py +++ b/src/lightning_app/cli/cmd_clusters.py @@ -2,6 +2,7 @@ import re import time from datetime import datetime +from enum import Enum from textwrap import dedent from typing import Any, List @@ -29,6 +30,23 @@ MAX_CLUSTER_WAIT_TIME = 5400 +class ClusterState(Enum): + UNSPECIFIED = "unspecified" + QUEUED = "queued" + PENDING = "pending" + RUNNING = "running" + FAILED = "failed" + DELETED = "deleted" + + def __str__(self) -> str: + return str(self.value) + + @classmethod + def from_api(cls, status: V1ClusterState) -> "ClusterState": + parsed = str(status).lower().split("_", maxsplit=2)[-1] + return cls(parsed) + + class ClusterList(Formatable): def __init__(self, clusters: List[Externalv1Cluster]): self.clusters = clusters @@ -130,9 +148,6 @@ def create( click.echo("cluster unchanged") resp = self.api_client.cluster_service_create_cluster(body=new_body) - if wait: - _wait_for_cluster_state(self.api_client, resp.id, V1ClusterState.RUNNING) - click.echo( dedent( f"""\ @@ -146,6 +161,13 @@ def create( """ ) ) + if wait: + click.echo("Waiting for cluster to enter state running...") + click.echo( + "Canceling this operation will NOT stop the cluster from creating" + f"(use `lightning delete cluster {resp.id}`)" + ) + _wait_for_cluster_state(self.api_client, resp.id, V1ClusterState.RUNNING) def get_clusters(self) -> ClusterList: resp = self.api_client.cluster_service_list_clusters(phase_not_in=[V1ClusterState.DELETED]) @@ -171,6 +193,8 @@ def delete(self, cluster_id: str, force: bool = False, wait: bool = False) -> No click.echo("Cluster deletion triggered successfully") if wait: + click.echo("Waiting for cluster to delete...") + click.echo("Canceling the operation will NOT stop the cluster from deleting") _wait_for_cluster_state(self.api_client, cluster_id, V1ClusterState.DELETED) @@ -183,6 +207,8 @@ def _wait_for_cluster_state( ) -> None: """_wait_for_cluster_state waits until the provided cluster has reached a desired state, or failed. + Messages will be displayed to the user as the cluster changes state. + Args: api_client: LightningClient used for polling cluster_id: Specifies the cluster to wait for @@ -192,6 +218,7 @@ def _wait_for_cluster_state( """ start = time.time() elapsed = 0 + while elapsed < max_wait_time: cluster_resp = api_client.cluster_service_list_clusters() new_cluster = None @@ -200,10 +227,14 @@ def _wait_for_cluster_state( new_cluster = clust break if new_cluster is not None: + echo_cluster_status_long( + cluster_id=cluster_id, + current_state=new_cluster.status.phase, + current_reason=new_cluster.status.reason, + desired_state=target_state, + ) if new_cluster.status.phase == target_state: break - elif new_cluster.status.phase == V1ClusterState.FAILED: - raise click.ClickException(f"Cluster {cluster_id} is in failed state.") time.sleep(check_timeout) elapsed = int(time.time() - start) else: @@ -219,3 +250,49 @@ def _check_cluster_name_is_valid(_ctx: Any, _param: Any, value: str) -> str: Provide a cluster name using valid characters and try again.""" ) return value + + +def echo_cluster_status_long( + cluster_id: str, + current_state: V1ClusterState, + current_reason: str, + desired_state: V1ClusterState, +) -> None: + """Echos a long-form status message to the user about the cluster state. + + Args: + cluster_id: The name of the cluster + current_state: The cluster's current state + reason: The reason for the cluster's state + """ + + state_str = ClusterState.from_api(current_state) + + message = f"Cluster {cluster_id} is now {state_str}" + ( + f" with the following reason: {current_reason}" if current_reason else "" + ) + + if current_state == V1ClusterState.RUNNING and desired_state == V1ClusterState.RUNNING: + message = "\n".join( + [ + f"Cluster {cluster_id} is now running and ready to use.", + f"To launch an app on this cluster use `lightning run app app.py --cloud --cluster-id {cluster_id}`", + ] + ) + if current_state == V1ClusterState.RUNNING and desired_state == V1ClusterState.DELETED: + message = f"Cluster {cluster_id} is terminating" + if current_state == V1ClusterState.FAILED: + message = "\n".join( + [ + message, + "We are automatically retrying cluster creation.", + "In case you want to delete this cluster:", + "1. Stop this command", + f"2. Run `lightning delete cluster {cluster_id}", + "WARNING: Any non-deleted cluster can consume cloud resources and incur cost to you.", + ] + ) + if current_state == V1ClusterState.DELETED: + message = f"Cluster {cluster_id} has been deleted." + + click.echo(message) diff --git a/src/lightning_app/cli/lightning_cli_create.py b/src/lightning_app/cli/lightning_cli_create.py index 34c5d356f4989..a34b2964ea3b7 100644 --- a/src/lightning_app/cli/lightning_cli_create.py +++ b/src/lightning_app/cli/lightning_cli_create.py @@ -33,6 +33,7 @@ def create() -> None: type=bool, required=False, default=False, + hidden=True, is_flag=True, help=""""Use this flag to ensure that the cluster is created with a profile that is optimized for performance. This makes runs more expensive but start-up times decrease.""", @@ -41,16 +42,17 @@ def create() -> None: "--edit-before-creation", default=False, is_flag=True, + hidden=True, help="Edit the cluster specs before submitting them to the API server.", ) @click.option( - "--wait", - "wait", + "--async", + "no_wait", type=bool, required=False, default=False, is_flag=True, - help="Enabling this flag makes the CLI wait until the cluster is running.", + help="This flag makes the CLI return immediately and lets the cluster creation happen in the background.", ) def create_cluster( cluster_name: str, @@ -60,7 +62,7 @@ def create_cluster( provider: str, edit_before_creation: bool, enable_performance: bool, - wait: bool, + no_wait: bool, **kwargs: Any, ) -> None: """Create a Lightning AI BYOC compute cluster with your cloud provider credentials.""" @@ -75,5 +77,5 @@ def create_cluster( external_id=external_id, edit_before_creation=edit_before_creation, cost_savings=not enable_performance, - wait=wait, + wait=not no_wait, ) diff --git a/src/lightning_app/cli/lightning_cli_delete.py b/src/lightning_app/cli/lightning_cli_delete.py index 91b0222b28c04..eed2e3481a302 100644 --- a/src/lightning_app/cli/lightning_cli_delete.py +++ b/src/lightning_app/cli/lightning_cli_delete.py @@ -24,15 +24,15 @@ def delete() -> None: WARNING: You should NOT use this under normal circumstances.""", ) @click.option( - "--wait", - "wait", + "--no-wait", + "no_wait", type=bool, required=False, default=False, is_flag=True, - help="Enabling this flag makes the CLI wait until the cluster is deleted.", + help="This flag makes the CLI return immediately and lets the cluster deletion happen in the background", ) -def delete_cluster(cluster: str, force: bool = False, wait: bool = False) -> None: +def delete_cluster(cluster: str, force: bool = False, no_wait: bool = False) -> None: """Delete a Lightning AI BYOC compute cluster and all associated cloud provider resources. Deleting a run also deletes all Runs and Experiments that were started on the cluster. @@ -46,4 +46,4 @@ def delete_cluster(cluster: str, force: bool = False, wait: bool = False) -> Non All object stores, container registries, logs, compute nodes, volumes, etc. are deleted and cannot be recovered. """ cluster_manager = AWSClusterManager() - cluster_manager.delete(cluster_id=cluster, force=force, wait=wait) + cluster_manager.delete(cluster_id=cluster, force=force, wait=not no_wait) diff --git a/tests/tests_app/cli/test_cli.py b/tests/tests_app/cli/test_cli.py index 9ccf543f9dac7..f5d3a31b9240c 100644 --- a/tests/tests_app/cli/test_cli.py +++ b/tests/tests_app/cli/test_cli.py @@ -130,7 +130,7 @@ def test_create_cluster(create_command: mock.MagicMock, extra_arguments, expecte external_id="dummy", edit_before_creation=False, cost_savings=expected_cost_savings_mode, - wait=False, + wait=True, ) @@ -158,7 +158,7 @@ def test_delete_cluster(delete: mock.MagicMock): runner = CliRunner() runner.invoke(delete_cluster, ["test-7"]) - delete.assert_called_once_with(cluster_id="test-7", force=False, wait=False) + delete.assert_called_once_with(cluster_id="test-7", force=False, wait=True) @mock.patch("lightning_app.utilities.login.Auth._run_server") diff --git a/tests/tests_app/cli/test_cmd_clusters.py b/tests/tests_app/cli/test_cmd_clusters.py index 92df6c172c9f0..a0ba91cb490e5 100644 --- a/tests/tests_app/cli/test_cmd_clusters.py +++ b/tests/tests_app/cli/test_cmd_clusters.py @@ -1,5 +1,5 @@ from unittest import mock -from unittest.mock import MagicMock +from unittest.mock import call, MagicMock import click import pytest @@ -30,7 +30,7 @@ def __init__(self, list_responses=[], consume=True): def cluster_service_list_clusters(self, phase_not_in=None): self.list_call_count = self.list_call_count + 1 if self.consume: - return self.list_responses.pop() + return self.list_responses.pop(0) return self.list_responses[0] @@ -113,8 +113,8 @@ def test_happy_path(self, target_state, previous_state): for state in [previous_state, target_state] ] ) - cmd_clusters._wait_for_cluster_state(client, "test-cluster", target_state, check_timeout=0.1) - assert client.list_call_count == 1 + cmd_clusters._wait_for_cluster_state(client, "test-cluster", target_state, poll_duration=0.1) + assert client.list_call_count == 2 @pytest.mark.parametrize("target_state", [V1ClusterState.RUNNING, V1ClusterState.DELETED]) def test_times_out(self, target_state): @@ -129,7 +129,115 @@ def test_times_out(self, target_state): consume=False, ) with pytest.raises(click.ClickException) as e: - cmd_clusters._wait_for_cluster_state( - client, "test-cluster", target_state, max_wait_time=0.4, check_timeout=0.2 - ) + cmd_clusters._wait_for_cluster_state(client, "test-cluster", target_state, timeout=0.4, poll_duration=0.2) assert "Max wait time elapsed" in str(e.value) + + @mock.patch("click.echo") + def test_echo_state_change_on_desired_running(self, echo: MagicMock): + client = FakeLightningClient( + list_responses=[ + V1ListClustersResponse( + clusters=[ + Externalv1Cluster( + id="test-cluster", + status=V1ClusterStatus( + phase=state, + reason=reason, + ), + ), + ] + ) + for state, reason in [ + (V1ClusterState.QUEUED, ""), + (V1ClusterState.PENDING, ""), + (V1ClusterState.PENDING, ""), + (V1ClusterState.PENDING, ""), + (V1ClusterState.FAILED, "some error"), + (V1ClusterState.PENDING, "retrying failure"), + (V1ClusterState.RUNNING, ""), + ] + ], + ) + + cmd_clusters._wait_for_cluster_state( + client, + "test-cluster", + target_state=V1ClusterState.RUNNING, + timeout=0.6, + poll_duration=0.1, + ) + + assert client.list_call_count == 7 + assert echo.call_count == 7 + echo.assert_has_calls( + [ + call("Cluster test-cluster is now queued"), + call("Cluster test-cluster is now pending"), + call("Cluster test-cluster is now pending"), + call("Cluster test-cluster is now pending"), + call( + "\n".join( + [ + "Cluster test-cluster is now failed with the following reason: some error", + "We are automatically retrying cluster creation.", + "In case you want to delete this cluster:", + "1. Stop this command", + "2. Run `lightning delete cluster test-cluster", + "WARNING: Any non-deleted cluster can consume cloud resources and incur cost to you.", + ] + ) + ), + call("Cluster test-cluster is now pending with the following reason: retrying failure"), + call( + "\n".join( + [ + "Cluster test-cluster is now running and ready to use.", + "To launch an app on this cluster use " + "`lightning run app app.py --cloud --cluster-id test-cluster`", + ] + ) + ), + ] + ) + + @mock.patch("click.echo") + def test_echo_state_change_on_desired_deleted(self, echo: MagicMock): + client = FakeLightningClient( + list_responses=[ + V1ListClustersResponse( + clusters=[ + Externalv1Cluster( + id="test-cluster", + status=V1ClusterStatus( + phase=state, + ), + ), + ] + ) + for state in [ + V1ClusterState.RUNNING, + V1ClusterState.RUNNING, + V1ClusterState.RUNNING, + V1ClusterState.DELETED, + ] + ], + ) + + cmd_clusters._wait_for_cluster_state( + client, + "test-cluster", + target_state=V1ClusterState.DELETED, + timeout=0.4, + poll_duration=0.1, + ) + + assert client.list_call_count == 4 + assert echo.call_count == 4 + echo.assert_has_calls( + [ + call("Cluster test-cluster is terminating"), + call("Cluster test-cluster is terminating"), + call("Cluster test-cluster is terminating"), + call("Cluster test-cluster has been deleted."), + ] + ) From 14670e27038357fc5c32e5a08b24d58b6a4efbe7 Mon Sep 17 00:00:00 2001 From: Neven Miculinic Date: Fri, 4 Nov 2022 11:59:52 +0000 Subject: [PATCH 02/23] tt --- src/lightning_app/cli/cmd_clusters.py | 148 ++++++++++-------- src/lightning_app/cli/lightning_cli_delete.py | 18 +-- src/lightning_app/utilities/network.py | 2 +- 3 files changed, 91 insertions(+), 77 deletions(-) diff --git a/src/lightning_app/cli/cmd_clusters.py b/src/lightning_app/cli/cmd_clusters.py index 9211e120d952e..3dc2b796efe31 100644 --- a/src/lightning_app/cli/cmd_clusters.py +++ b/src/lightning_app/cli/cmd_clusters.py @@ -16,17 +16,17 @@ V1ClusterState, V1ClusterType, V1CreateClusterRequest, + V1GetClusterResponse, V1KubernetesClusterDriver, ) from rich.console import Console from rich.table import Table from rich.text import Text -from lightning_app.cli.core import Formatable -from lightning_app.utilities.network import LightningClient -from lightning_app.utilities.openapi import create_openapi_object, string2dict +from lightning.app.cli.core import Formatable +from lightning.app.utilities.network import LightningClient +from lightning.app.utilities.openapi import create_openapi_object, string2dict -CLUSTER_STATE_CHECKING_TIMEOUT = 60 MAX_CLUSTER_WAIT_TIME = 5400 @@ -151,24 +151,33 @@ def create( click.echo( dedent( f"""\ - {resp.id} is now being created... This can take up to an hour. + BYOC cluster {cluster_name} is now being created... This can take up to an hour. To view the status of your clusters use: - `lightning list clusters` + lightning list clusters To view cluster logs use: - `lightning show cluster logs {resp.id}` - """ + lightning show cluster logs {cluster_name} + """ ) ) if wait: click.echo("Waiting for cluster to enter state running...") click.echo( "Canceling this operation will NOT stop the cluster from creating" - f"(use `lightning delete cluster {resp.id}`)" + f"(use `lightning delete cluster {cluster_name}`)" ) _wait_for_cluster_state(self.api_client, resp.id, V1ClusterState.RUNNING) + click.echo( + dedent( + f"""\ + Cluster {cluster_name} is now running and ready to use.", + To launch an app on this cluster use: lightning run app app.py --cloud --cluster-id {cluster_name}, + """ + ) + ) + def get_clusters(self) -> ClusterList: resp = self.api_client.cluster_service_list_clusters(phase_not_in=[V1ClusterState.DELETED]) return ClusterList(resp.clusters) @@ -189,6 +198,9 @@ def delete(self, cluster_id: str, force: bool = False, wait: bool = False) -> No ) click.confirm("Do you want to continue?", abort=True) + resp: V1GetClusterResponse = self.api_client.cluster_service_get_cluster(id=cluster_id) + bucket_name = resp.spec.driver.kubernetes.aws.bucket_name + self.api_client.cluster_service_delete_cluster(id=cluster_id, force=force) click.echo("Cluster deletion triggered successfully") @@ -197,48 +209,77 @@ def delete(self, cluster_id: str, force: bool = False, wait: bool = False) -> No click.echo("Canceling the operation will NOT stop the cluster from deleting") _wait_for_cluster_state(self.api_client, cluster_id, V1ClusterState.DELETED) + click.echo( + dedent( + f"""\ + Cluster {cluster_id} has been successfully deleted, and almost all AWS resources have been removed + + For safety purposes we kept the S3 bucket associated with the cluster: {bucket_name} + + You may want to delete it manually using the AWS CLI: + + aws s3 rb --force s3://{bucket_name} + """ + ) + ) + def _wait_for_cluster_state( api_client: LightningClient, cluster_id: str, target_state: V1ClusterState, - max_wait_time: int = MAX_CLUSTER_WAIT_TIME, - check_timeout: int = CLUSTER_STATE_CHECKING_TIMEOUT, + timeout: int = MAX_CLUSTER_WAIT_TIME, + poll_duration: int = 60, ) -> None: """_wait_for_cluster_state waits until the provided cluster has reached a desired state, or failed. Messages will be displayed to the user as the cluster changes state. + We poll the API server for any changes check_ Args: api_client: LightningClient used for polling cluster_id: Specifies the cluster to wait for target_state: Specifies the desired state the target cluster needs to meet - max_wait_time: Maximum duration to wait (in seconds) - check_timeout: duration between polling for the cluster state (in seconds) + timeout: Maximum duration to wait (in seconds) + poll_duration: duration between polling for the cluster state (in seconds) """ start = time.time() elapsed = 0 - while elapsed < max_wait_time: - cluster_resp = api_client.cluster_service_list_clusters() - new_cluster = None - for clust in cluster_resp.clusters: - if clust.id == cluster_id: - new_cluster = clust - break - if new_cluster is not None: - echo_cluster_status_long( + while elapsed < timeout: + try: + resp: V1GetClusterResponse = api_client.cluster_service_get_cluster(id=cluster_id) + _echo_cluster_status_long( cluster_id=cluster_id, - current_state=new_cluster.status.phase, - current_reason=new_cluster.status.reason, + current_state=resp.status.phase, + current_reason=resp.status.reason, desired_state=target_state, + elapsed=elapsed, ) - if new_cluster.status.phase == target_state: + if resp.status.phase == target_state: break - time.sleep(check_timeout) - elapsed = int(time.time() - start) + time.sleep(poll_duration) + elapsed = int(time.time() - start) + except Exception as e: # TODO Handle not found exception + if target_state == V1ClusterState.DELETED: + return else: - raise click.ClickException("Max wait time elapsed") + raise click.ClickException( + dedent( + f"""\ + The cluster has not been created within {timeout} seconds. + + The cluster may still be created afterwards, please check its status using: + + lighting list clusters + + To view cluster logs use: + lightning show cluster logs {cluster_id} + + Feel free to reaching out to support@lightning.ai for any additional help + """ + ) + ) def _check_cluster_name_is_valid(_ctx: Any, _param: Any, value: str) -> str: @@ -252,47 +293,32 @@ def _check_cluster_name_is_valid(_ctx: Any, _param: Any, value: str) -> str: return value -def echo_cluster_status_long( - cluster_id: str, - current_state: V1ClusterState, - current_reason: str, - desired_state: V1ClusterState, -) -> None: +def _echo_cluster_status_long( + cluster_id: str, current_state: V1ClusterState, current_reason: str, desired_state: V1ClusterState, elapsed: float +) -> str: """Echos a long-form status message to the user about the cluster state. Args: cluster_id: The name of the cluster current_state: The cluster's current state reason: The reason for the cluster's state + elapsed: Seconds since we've started polling """ - state_str = ClusterState.from_api(current_state) + if current_state == V1ClusterState.FAILED: + return f"""\ +The requested cluster operation for cluster {cluster_id} has errors: +{current_reason} - message = f"Cluster {cluster_id} is now {state_str}" + ( - f" with the following reason: {current_reason}" if current_reason else "" - ) +--- +We are automatically retrying, and automated alert has been created - if current_state == V1ClusterState.RUNNING and desired_state == V1ClusterState.RUNNING: - message = "\n".join( - [ - f"Cluster {cluster_id} is now running and ready to use.", - f"To launch an app on this cluster use `lightning run app app.py --cloud --cluster-id {cluster_id}`", - ] - ) - if current_state == V1ClusterState.RUNNING and desired_state == V1ClusterState.DELETED: - message = f"Cluster {cluster_id} is terminating" - if current_state == V1ClusterState.FAILED: - message = "\n".join( - [ - message, - "We are automatically retrying cluster creation.", - "In case you want to delete this cluster:", - "1. Stop this command", - f"2. Run `lightning delete cluster {cluster_id}", - "WARNING: Any non-deleted cluster can consume cloud resources and incur cost to you.", - ] - ) - if current_state == V1ClusterState.DELETED: - message = f"Cluster {cluster_id} has been deleted." +WARNING: Any non-deleted cluster may consume cloud resources and incur cost to you.""" + + if desired_state == V1ClusterState.RUNNING: + return f"Cluster {cluster_id} is being created [elapsed={elapsed}s]" + + if desired_state == V1ClusterState.DELETED: + return f"Cluster {cluster_id} is being deleted [elapsed={elapsed}s]" - click.echo(message) + raise click.ClickException(f"Unknown cluster desired state {desired_state}") diff --git a/src/lightning_app/cli/lightning_cli_delete.py b/src/lightning_app/cli/lightning_cli_delete.py index eed2e3481a302..c0ad6ba33c2b2 100644 --- a/src/lightning_app/cli/lightning_cli_delete.py +++ b/src/lightning_app/cli/lightning_cli_delete.py @@ -12,19 +12,7 @@ def delete() -> None: @delete.command("cluster") @click.argument("cluster", type=str) @click.option( - "--force", - "force", - type=bool, - required=False, - default=False, - is_flag=True, - help="""Delete a BYOC cluster from Lightning AI. This does NOT delete any resources created by the cluster, - it just removes the entry from Lightning AI. - - WARNING: You should NOT use this under normal circumstances.""", -) -@click.option( - "--no-wait", + "--async", "no_wait", type=bool, required=False, @@ -32,7 +20,7 @@ def delete() -> None: is_flag=True, help="This flag makes the CLI return immediately and lets the cluster deletion happen in the background", ) -def delete_cluster(cluster: str, force: bool = False, no_wait: bool = False) -> None: +def delete_cluster(cluster: str, no_wait: bool = False) -> None: """Delete a Lightning AI BYOC compute cluster and all associated cloud provider resources. Deleting a run also deletes all Runs and Experiments that were started on the cluster. @@ -46,4 +34,4 @@ def delete_cluster(cluster: str, force: bool = False, no_wait: bool = False) -> All object stores, container registries, logs, compute nodes, volumes, etc. are deleted and cannot be recovered. """ cluster_manager = AWSClusterManager() - cluster_manager.delete(cluster_id=cluster, force=force, wait=not no_wait) + cluster_manager.delete(cluster_id=cluster, force=False, wait=not no_wait) diff --git a/src/lightning_app/utilities/network.py b/src/lightning_app/utilities/network.py index f8d80af2dcc0c..2ea9de3bae239 100644 --- a/src/lightning_app/utilities/network.py +++ b/src/lightning_app/utilities/network.py @@ -118,7 +118,7 @@ def __new__(mcs, name, bases, dct): return new_class -class LightningClient(GridRestClient, metaclass=_MethodsRetryWrapperMeta): +class LightningClient(GridRestClient): # , metaclass=_MethodsRetryWrapperMeta): """The LightningClient is a wrapper around the GridRestClient. It wraps all methods to monitor connection exceptions and employs a retry strategy. From c93525c4c0e72f7530b741b1455985e0ae2a3a90 Mon Sep 17 00:00:00 2001 From: Neven Miculinic Date: Fri, 4 Nov 2022 12:24:29 +0000 Subject: [PATCH 03/23] Further update --- src/lightning_app/cli/cmd_clusters.py | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/src/lightning_app/cli/cmd_clusters.py b/src/lightning_app/cli/cmd_clusters.py index 3dc2b796efe31..bcf0924d560d6 100644 --- a/src/lightning_app/cli/cmd_clusters.py +++ b/src/lightning_app/cli/cmd_clusters.py @@ -7,6 +7,7 @@ from typing import Any, List import click +import lightning_cloud from lightning_cloud.openapi import ( Externalv1Cluster, V1AWSClusterDriverSpec, @@ -229,7 +230,7 @@ def _wait_for_cluster_state( cluster_id: str, target_state: V1ClusterState, timeout: int = MAX_CLUSTER_WAIT_TIME, - poll_duration: int = 60, + poll_duration: int = 10, ) -> None: """_wait_for_cluster_state waits until the provided cluster has reached a desired state, or failed. @@ -249,20 +250,24 @@ def _wait_for_cluster_state( while elapsed < timeout: try: resp: V1GetClusterResponse = api_client.cluster_service_get_cluster(id=cluster_id) - _echo_cluster_status_long( - cluster_id=cluster_id, - current_state=resp.status.phase, - current_reason=resp.status.reason, - desired_state=target_state, - elapsed=elapsed, + click.echo( + _echo_cluster_status_long( + cluster_id=cluster_id, + current_state=resp.status.phase, + current_reason=resp.status.reason, + desired_state=target_state, + elapsed=elapsed, + ) ) if resp.status.phase == target_state: break time.sleep(poll_duration) elapsed = int(time.time() - start) - except Exception as e: # TODO Handle not found exception - if target_state == V1ClusterState.DELETED: + + except lightning_cloud.openapi.rest.ApiException as e: + if e.status == 404 and target_state == V1ClusterState.DELETED: return + raise else: raise click.ClickException( dedent( From 9b8db794a0def777d33c36f8defaf2e9d5ff0689 Mon Sep 17 00:00:00 2001 From: Neven Miculinic Date: Fri, 4 Nov 2022 12:25:37 +0000 Subject: [PATCH 04/23] updated docs --- docs/source-app/workflows/byoc/index.rst | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/docs/source-app/workflows/byoc/index.rst b/docs/source-app/workflows/byoc/index.rst index 1db2965dc704e..1134301e6dd43 100644 --- a/docs/source-app/workflows/byoc/index.rst +++ b/docs/source-app/workflows/byoc/index.rst @@ -78,14 +78,7 @@ Parameters +------------------------+----------------------------------------------------------------------------------------------------+ | region | AWS region containing compute resources | +------------------------+----------------------------------------------------------------------------------------------------+ -| enable-performance | Specifies if the cluster uses cost savings mode. | -| | | -| | In cost saving mode the number of compute nodes is reduced to one, reducing the cost for clusters | -| | with low utilization. | -+------------------------+----------------------------------------------------------------------------------------------------+ -| edit-before-creation | Enables interactive editing of requests before submitting it to Lightning AI. | -+------------------------+----------------------------------------------------------------------------------------------------+ -| no-wait | Cluster creation will happen in the background. | +| async | Cluster creation will happen in the background. | +------------------------+----------------------------------------------------------------------------------------------------+ ---- From fe90dd6819adfdcee4cade9c3710400abbb544ef Mon Sep 17 00:00:00 2001 From: Luca Furst Date: Sat, 5 Nov 2022 18:37:03 -0400 Subject: [PATCH 05/23] Improve user messages and test output --- src/lightning_app/cli/cmd_clusters.py | 174 ++++----- src/lightning_app/cli/lightning_cli_create.py | 6 +- src/lightning_app/cli/lightning_cli_delete.py | 4 +- tests/tests_app/cli/test_cmd_clusters.py | 344 ++++++++++++------ 4 files changed, 324 insertions(+), 204 deletions(-) diff --git a/src/lightning_app/cli/cmd_clusters.py b/src/lightning_app/cli/cmd_clusters.py index bcf0924d560d6..853671550bbd9 100644 --- a/src/lightning_app/cli/cmd_clusters.py +++ b/src/lightning_app/cli/cmd_clusters.py @@ -24,9 +24,9 @@ from rich.table import Table from rich.text import Text -from lightning.app.cli.core import Formatable -from lightning.app.utilities.network import LightningClient -from lightning.app.utilities.openapi import create_openapi_object, string2dict +from lightning_app.cli.core import Formatable +from lightning_app.utilities.network import LightningClient +from lightning_app.utilities.openapi import create_openapi_object, string2dict MAX_CLUSTER_WAIT_TIME = 5400 @@ -105,7 +105,7 @@ def create( region: str = "us-east-1", external_id: str = None, edit_before_creation: bool = False, - wait: bool = False, + do_async: bool = False, ) -> None: """request Lightning AI BYOC compute cluster creation. @@ -149,35 +149,26 @@ def create( click.echo("cluster unchanged") resp = self.api_client.cluster_service_create_cluster(body=new_body) - click.echo( - dedent( - f"""\ - BYOC cluster {cluster_name} is now being created... This can take up to an hour. + click.echo(dedent(f"""\ + BYOC cluster creation triggered successfully! + This can take up to an hour to complete. To view the status of your clusters use: lightning list clusters To view cluster logs use: lightning show cluster logs {cluster_name} - """ - ) - ) - if wait: - click.echo("Waiting for cluster to enter state running...") - click.echo( - "Canceling this operation will NOT stop the cluster from creating" - f"(use `lightning delete cluster {cluster_name}`)" - ) - _wait_for_cluster_state(self.api_client, resp.id, V1ClusterState.RUNNING) - click.echo( - dedent( - f"""\ - Cluster {cluster_name} is now running and ready to use.", - To launch an app on this cluster use: lightning run app app.py --cloud --cluster-id {cluster_name}, - """ - ) - ) + To delete the cluster run: + lightning delete cluster {cluster_name} + """)) + if not do_async: + try: + _wait_for_cluster_state(self.api_client, resp.id, V1ClusterState.RUNNING) + return + except KeyboardInterrupt: + pass + click.echo("Cluster will be created in the background!") def get_clusters(self) -> ClusterList: resp = self.api_client.cluster_service_list_clusters(phase_not_in=[V1ClusterState.DELETED]) @@ -188,7 +179,7 @@ def list(self) -> None: console = Console() console.print(clusters.as_table()) - def delete(self, cluster_id: str, force: bool = False, wait: bool = False) -> None: + def delete(self, cluster_id: str, force: bool = False, do_async: bool = False) -> None: if force: click.echo( """ @@ -203,26 +194,23 @@ def delete(self, cluster_id: str, force: bool = False, wait: bool = False) -> No bucket_name = resp.spec.driver.kubernetes.aws.bucket_name self.api_client.cluster_service_delete_cluster(id=cluster_id, force=force) - click.echo("Cluster deletion triggered successfully") + click.echo(dedent(f"""\ + Cluster deletion triggered successfully - if wait: - click.echo("Waiting for cluster to delete...") - click.echo("Canceling the operation will NOT stop the cluster from deleting") - _wait_for_cluster_state(self.api_client, cluster_id, V1ClusterState.DELETED) - - click.echo( - dedent( - f"""\ - Cluster {cluster_id} has been successfully deleted, and almost all AWS resources have been removed - - For safety purposes we kept the S3 bucket associated with the cluster: {bucket_name} + For safety purposes we will not delete anything in the S3 bucket associated with the cluster: + {bucket_name} You may want to delete it manually using the AWS CLI: + aws s3 rb --force s3://{bucket_name} + """)) - aws s3 rb --force s3://{bucket_name} - """ - ) - ) + if not do_async: + try: + _wait_for_cluster_state(self.api_client, cluster_id, V1ClusterState.DELETED) + return + except KeyboardInterrupt: + pass + click.echo("Cluster will be deleted in the background!") def _wait_for_cluster_state( @@ -235,7 +223,7 @@ def _wait_for_cluster_state( """_wait_for_cluster_state waits until the provided cluster has reached a desired state, or failed. Messages will be displayed to the user as the cluster changes state. - We poll the API server for any changes check_ + We poll the API server for any changes Args: api_client: LightningClient used for polling @@ -247,44 +235,32 @@ def _wait_for_cluster_state( start = time.time() elapsed = 0 + click.echo(f"Waiting for cluster to be {ClusterState.from_api(target_state)}...") while elapsed < timeout: try: resp: V1GetClusterResponse = api_client.cluster_service_get_cluster(id=cluster_id) - click.echo( - _echo_cluster_status_long( - cluster_id=cluster_id, - current_state=resp.status.phase, - current_reason=resp.status.reason, - desired_state=target_state, - elapsed=elapsed, - ) - ) + click.echo(_cluster_status_long(cluster=resp, desired_state=target_state, elapsed=elapsed)) if resp.status.phase == target_state: break time.sleep(poll_duration) elapsed = int(time.time() - start) - except lightning_cloud.openapi.rest.ApiException as e: if e.status == 404 and target_state == V1ClusterState.DELETED: return raise else: - raise click.ClickException( - dedent( - f"""\ - The cluster has not been created within {timeout} seconds. - - The cluster may still be created afterwards, please check its status using: + state_str = ClusterState.from_api(target_state) + raise click.ClickException(dedent(f"""\ + The cluster has not entered the {state_str} state within {_format_elapsed_seconds(timeout)}. - lighting list clusters + The cluster may eventually be {state_str} afterwards, please check its status using: + lighting list clusters - To view cluster logs use: - lightning show cluster logs {cluster_id} + To view cluster logs use: + lightning show cluster logs {cluster_id} - Feel free to reaching out to support@lightning.ai for any additional help - """ - ) - ) + Contact support@lightning.ai for additional help + """)) def _check_cluster_name_is_valid(_ctx: Any, _param: Any, value: str) -> str: @@ -298,32 +274,68 @@ def _check_cluster_name_is_valid(_ctx: Any, _param: Any, value: str) -> str: return value -def _echo_cluster_status_long( - cluster_id: str, current_state: V1ClusterState, current_reason: str, desired_state: V1ClusterState, elapsed: float -) -> str: +def _cluster_status_long(cluster: V1GetClusterResponse, desired_state: V1ClusterState, elapsed: float) -> str: """Echos a long-form status message to the user about the cluster state. Args: - cluster_id: The name of the cluster - current_state: The cluster's current state - reason: The reason for the cluster's state + cluster: The cluster object elapsed: Seconds since we've started polling """ + cluster_name = cluster.name + current_state = cluster.status.phase + current_reason = cluster.status.reason + bucket_name = cluster.spec.driver.kubernetes.aws.bucket_name + + duration = _format_elapsed_seconds(elapsed) + if current_state == V1ClusterState.FAILED: - return f"""\ -The requested cluster operation for cluster {cluster_id} has errors: -{current_reason} + return dedent(f"""\ + The requested cluster operation for cluster {cluster_name} has errors: + {current_reason} + + --- + We are automatically retrying, and an automated alert has been created + + WARNING: Any non-deleted cluster may be using resources. + To avoid incuring cost on your cloud provider, delete the cluster using the following command: + lightning delete cluster {cluster_name} ---- -We are automatically retrying, and automated alert has been created + Contact support@lightning.ai for additional help + """) -WARNING: Any non-deleted cluster may consume cloud resources and incur cost to you.""" + if desired_state == current_state == V1ClusterState.RUNNING: + return dedent(f"""\ + Cluster {cluster_name} is now running and ready to use. + To launch an app on this cluster use: lightning run app app.py --cloud --cluster-id {cluster_name} + """) if desired_state == V1ClusterState.RUNNING: - return f"Cluster {cluster_id} is being created [elapsed={elapsed}s]" + return f"Cluster {cluster_name} is being created [elapsed={duration}]" + + if desired_state == current_state == V1ClusterState.DELETED: + return dedent(f"""\ + Cluster {cluster_name} has been successfully deleted, and almost all AWS resources have been removed + + For safety purposes we kept the S3 bucket associated with the cluster: {bucket_name} + + You may want to delete it manually using the AWS CLI: + aws s3 rb --force s3://{bucket_name} + """) if desired_state == V1ClusterState.DELETED: - return f"Cluster {cluster_id} is being deleted [elapsed={elapsed}s]" + return f"Cluster {cluster_name} is being deleted [elapsed={duration}]" raise click.ClickException(f"Unknown cluster desired state {desired_state}") + + +def _format_elapsed_seconds(seconds: int) -> str: + """Turns seconds into a duration string + + >>> _format_elapsed_seconds(5) + 05s + >>> _format_elapsed_seconds(60) + 01m00s + """ + minutes, seconds = divmod(seconds, 60) + return (f"{minutes:02}m" if minutes else "") + f"{seconds:02}s" diff --git a/src/lightning_app/cli/lightning_cli_create.py b/src/lightning_app/cli/lightning_cli_create.py index a34b2964ea3b7..e1e80949685f0 100644 --- a/src/lightning_app/cli/lightning_cli_create.py +++ b/src/lightning_app/cli/lightning_cli_create.py @@ -47,7 +47,7 @@ def create() -> None: ) @click.option( "--async", - "no_wait", + "do_async", type=bool, required=False, default=False, @@ -62,7 +62,7 @@ def create_cluster( provider: str, edit_before_creation: bool, enable_performance: bool, - no_wait: bool, + do_async: bool, **kwargs: Any, ) -> None: """Create a Lightning AI BYOC compute cluster with your cloud provider credentials.""" @@ -77,5 +77,5 @@ def create_cluster( external_id=external_id, edit_before_creation=edit_before_creation, cost_savings=not enable_performance, - wait=not no_wait, + do_async=do_async, ) diff --git a/src/lightning_app/cli/lightning_cli_delete.py b/src/lightning_app/cli/lightning_cli_delete.py index c0ad6ba33c2b2..4b5327266bf15 100644 --- a/src/lightning_app/cli/lightning_cli_delete.py +++ b/src/lightning_app/cli/lightning_cli_delete.py @@ -20,7 +20,7 @@ def delete() -> None: is_flag=True, help="This flag makes the CLI return immediately and lets the cluster deletion happen in the background", ) -def delete_cluster(cluster: str, no_wait: bool = False) -> None: +def delete_cluster(cluster: str, do_async: bool = False) -> None: """Delete a Lightning AI BYOC compute cluster and all associated cloud provider resources. Deleting a run also deletes all Runs and Experiments that were started on the cluster. @@ -34,4 +34,4 @@ def delete_cluster(cluster: str, no_wait: bool = False) -> None: All object stores, container registries, logs, compute nodes, volumes, etc. are deleted and cannot be recovered. """ cluster_manager = AWSClusterManager() - cluster_manager.delete(cluster_id=cluster, force=False, wait=not no_wait) + cluster_manager.delete(cluster_id=cluster, force=False, do_async=do_async) diff --git a/tests/tests_app/cli/test_cmd_clusters.py b/tests/tests_app/cli/test_cmd_clusters.py index a0ba91cb490e5..29ac0e321a1ad 100644 --- a/tests/tests_app/cli/test_cmd_clusters.py +++ b/tests/tests_app/cli/test_cmd_clusters.py @@ -1,10 +1,10 @@ from unittest import mock from unittest.mock import call, MagicMock +from textwrap import dedent import click import pytest from lightning_cloud.openapi import ( - Externalv1Cluster, V1AWSClusterDriverSpec, V1ClusterDriver, V1ClusterPerformanceProfile, @@ -13,56 +13,114 @@ V1ClusterStatus, V1ClusterType, V1CreateClusterRequest, + V1CreateClusterResponse, V1KubernetesClusterDriver, - V1ListClustersResponse, + V1GetClusterResponse, ) from lightning_app.cli import cmd_clusters from lightning_app.cli.cmd_clusters import AWSClusterManager +@pytest.fixture(params=[True, False]) +def async_or_interrupt(request, monkeypatch): + # Simulate hitting ctrl-c immediately while waiting for cluster to create + if not request.param: + monkeypatch.setattr(cmd_clusters, "_wait_for_cluster_state", mock.MagicMock(side_effect=KeyboardInterrupt)) + return request.param + + +@pytest.fixture +def spec(): + return V1ClusterSpec( + driver=V1ClusterDriver( + kubernetes=V1KubernetesClusterDriver( + aws=V1AWSClusterDriverSpec( + bucket_name="test-bucket", + ), + ), + ), + ) + + class FakeLightningClient: - def __init__(self, list_responses=[], consume=True): - self.list_responses = list_responses - self.list_call_count = 0 + def __init__(self, get_responses=[], consume=True): + self.get_responses = get_responses + self.get_call_count = 0 self.consume = consume - def cluster_service_list_clusters(self, phase_not_in=None): - self.list_call_count = self.list_call_count + 1 + def cluster_service_get_cluster(self, id: str): + self.get_call_count = self.get_call_count + 1 if self.consume: - return self.list_responses.pop(0) - return self.list_responses[0] + return self.get_responses.pop(0) + return self.get_responses[0] -@mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) -@mock.patch("lightning_app.utilities.network.LightningClient.cluster_service_create_cluster") -def test_create_cluster(api: mock.MagicMock): - cluster_manager = AWSClusterManager() - cluster_manager.create( - cluster_name="test-7", - external_id="dummy", - role_arn="arn:aws:iam::1234567890:role/lai-byoc", - region="us-west-2", - ) +class Test_create: + @mock.patch("click.echo") + @mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) + @mock.patch("lightning_app.utilities.network.LightningClient.cluster_service_create_cluster", + MagicMock(return_value=V1CreateClusterResponse(id="test-cluster"))) + def test_delete_cluster_output(self, echo: mock.MagicMock, async_or_interrupt): + cluster_manager = AWSClusterManager() + cluster_manager.create( + cluster_name="test-cluster", + external_id="dummy", + role_arn="arn:aws:iam::1234567890:role/lai-byoc", + region="us-west-2", + do_async=async_or_interrupt, + ) + + expected_output = [ + call(dedent(f"""\ + BYOC cluster creation triggered successfully! + This can take up to an hour to complete. + + To view the status of your clusters use: + lightning list clusters + + To view cluster logs use: + lightning show cluster logs test-cluster + + To delete the cluster run: + lightning delete cluster test-cluster + """)), + call("Cluster will be created in the background!"), + ] + for i, (expected, actual) in enumerate(zip(expected_output, echo.call_args_list)): + assert expected == actual, f"Call {i} does not match" + - api.assert_called_once_with( - body=V1CreateClusterRequest( - name="test-7", - spec=V1ClusterSpec( - cluster_type=V1ClusterType.BYOC, - performance_profile=V1ClusterPerformanceProfile.DEFAULT, - driver=V1ClusterDriver( - kubernetes=V1KubernetesClusterDriver( - aws=V1AWSClusterDriverSpec( - region="us-west-2", - role_arn="arn:aws:iam::1234567890:role/lai-byoc", - external_id="dummy", + @mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) + @mock.patch("lightning_app.utilities.network.LightningClient.cluster_service_create_cluster") + def test_create_cluster_api(self, api: mock.MagicMock, async_or_interrupt): + cluster_manager = AWSClusterManager() + cluster_manager.create( + cluster_name="test-7", + external_id="dummy", + role_arn="arn:aws:iam::1234567890:role/lai-byoc", + region="us-west-2", + do_async=async_or_interrupt, + ) + + api.assert_called_once_with( + body=V1CreateClusterRequest( + name="test-7", + spec=V1ClusterSpec( + cluster_type=V1ClusterType.BYOC, + performance_profile=V1ClusterPerformanceProfile.DEFAULT, + driver=V1ClusterDriver( + kubernetes=V1KubernetesClusterDriver( + aws=V1AWSClusterDriverSpec( + region="us-west-2", + role_arn="arn:aws:iam::1234567890:role/lai-byoc", + external_id="dummy", + ) ) - ) + ), ), - ), + ) ) - ) @mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) @@ -74,14 +132,47 @@ def test_list_clusters(api: mock.MagicMock): api.assert_called_once_with(phase_not_in=[V1ClusterState.DELETED]) -@mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) -@mock.patch("lightning_app.utilities.network.LightningClient.cluster_service_delete_cluster") -def test_delete_cluster(api: mock.MagicMock): - cluster_manager = AWSClusterManager() - cluster_manager.delete(cluster_id="test-7") +class Test_delete: + @mock.patch("click.echo") + @mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) + @mock.patch("lightning_app.utilities.network.LightningClient.cluster_service_delete_cluster", MagicMock()) + @mock.patch("lightning_app.utilities.network.LightningClient.cluster_service_get_cluster") + def test_delete_cluster_output( + self, + api_get: mock.MagicMock, + echo: mock.MagicMock, + async_or_interrupt, + spec, + ): + api_get.return_value = V1GetClusterResponse(spec=spec) + + cluster_manager = AWSClusterManager() + cluster_manager.delete(cluster_id="test-7", do_async=async_or_interrupt) + + expected_output = [ + call(dedent(f"""\ + Cluster deletion triggered successfully + + For safety purposes we will not delete anything in the S3 bucket associated with the cluster: + test-bucket + + You may want to delete it manually using the AWS CLI: + aws s3 rb --force s3://test-bucket + """)), + call("Cluster will be deleted in the background!"), + ] + for i, (expected, actual) in enumerate(zip(expected_output, echo.call_args_list)): + assert expected == actual, f"Call {i} does not match" - api.assert_called_once_with(id="test-7", force=False) + @mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) + @mock.patch("lightning_app.utilities.network.LightningClient.cluster_service_delete_cluster") + @mock.patch("lightning_app.utilities.network.LightningClient.cluster_service_get_cluster") + def test_delete_cluster_api(self, api_get:mock.MagicMock, api_delete: mock.MagicMock, async_or_interrupt, spec): + api_get.return_value = V1GetClusterResponse(spec=spec) + cluster_manager = AWSClusterManager() + cluster_manager.delete(cluster_id="test-7", do_async=async_or_interrupt) + api_delete.assert_called_once_with(id="test-7", force=False) class Test_check_cluster_name_is_valid: @pytest.mark.parametrize("name", ["test-7", "0wildgoat"]) @@ -104,48 +195,63 @@ class Test_wait_for_cluster_state: @pytest.mark.parametrize( "previous_state", [V1ClusterState.QUEUED, V1ClusterState.PENDING, V1ClusterState.UNSPECIFIED] ) - def test_happy_path(self, target_state, previous_state): + def test_happy_path(self, target_state, previous_state, spec): client = FakeLightningClient( - list_responses=[ - V1ListClustersResponse( - clusters=[Externalv1Cluster(id="test-cluster", status=V1ClusterStatus(phase=state))] + get_responses=[ + V1GetClusterResponse( + id="test-cluster", + status=V1ClusterStatus(phase=state), + spec=spec, ) for state in [previous_state, target_state] ] ) cmd_clusters._wait_for_cluster_state(client, "test-cluster", target_state, poll_duration=0.1) - assert client.list_call_count == 2 + assert client.get_call_count == 2 @pytest.mark.parametrize("target_state", [V1ClusterState.RUNNING, V1ClusterState.DELETED]) - def test_times_out(self, target_state): + def test_times_out(self, target_state, spec): client = FakeLightningClient( - list_responses=[ - V1ListClustersResponse( - clusters=[ - Externalv1Cluster(id="test-cluster", status=V1ClusterStatus(phase=V1ClusterState.UNSPECIFIED)) - ] + get_responses=[ + V1GetClusterResponse( + id="test-cluster", + status=V1ClusterStatus(phase=V1ClusterState.UNSPECIFIED), + spec=spec, ) ], consume=False, ) with pytest.raises(click.ClickException) as e: - cmd_clusters._wait_for_cluster_state(client, "test-cluster", target_state, timeout=0.4, poll_duration=0.2) - assert "Max wait time elapsed" in str(e.value) + cmd_clusters._wait_for_cluster_state(client, "test-cluster", target_state, timeout=0.1, poll_duration=0.1) + + if target_state == V1ClusterState.DELETED: + expected_state = "deleted" + if target_state == V1ClusterState.RUNNING: + expected_state = "running" + + assert str(e.value) == dedent(f"""\ + The cluster has not entered the {expected_state} state within 0.1s. + + The cluster may eventually be {expected_state} afterwards, please check its status using: + lighting list clusters + + To view cluster logs use: + lightning show cluster logs test-cluster + + Contact support@lightning.ai for additional help + """) @mock.patch("click.echo") - def test_echo_state_change_on_desired_running(self, echo: MagicMock): + def test_echo_state_change_on_desired_running(self, echo: MagicMock, spec): client = FakeLightningClient( - list_responses=[ - V1ListClustersResponse( - clusters=[ - Externalv1Cluster( - id="test-cluster", - status=V1ClusterStatus( - phase=state, - reason=reason, - ), - ), - ] + get_responses=[ + V1GetClusterResponse( + name="test-cluster", + status=V1ClusterStatus( + phase=state, + reason=reason, + ), + spec=spec, ) for state, reason in [ (V1ClusterState.QUEUED, ""), @@ -167,52 +273,45 @@ def test_echo_state_change_on_desired_running(self, echo: MagicMock): poll_duration=0.1, ) - assert client.list_call_count == 7 - assert echo.call_count == 7 - echo.assert_has_calls( - [ - call("Cluster test-cluster is now queued"), - call("Cluster test-cluster is now pending"), - call("Cluster test-cluster is now pending"), - call("Cluster test-cluster is now pending"), - call( - "\n".join( - [ - "Cluster test-cluster is now failed with the following reason: some error", - "We are automatically retrying cluster creation.", - "In case you want to delete this cluster:", - "1. Stop this command", - "2. Run `lightning delete cluster test-cluster", - "WARNING: Any non-deleted cluster can consume cloud resources and incur cost to you.", - ] - ) - ), - call("Cluster test-cluster is now pending with the following reason: retrying failure"), - call( - "\n".join( - [ - "Cluster test-cluster is now running and ready to use.", - "To launch an app on this cluster use " - "`lightning run app app.py --cloud --cluster-id test-cluster`", - ] - ) - ), - ] - ) + assert client.get_call_count == 7 + assert echo.call_count == 8 + expected_calls = [ + call("Waiting for cluster to be running..."), + call("Cluster test-cluster is being created [elapsed=00s]"), + call("Cluster test-cluster is being created [elapsed=00s]"), + call("Cluster test-cluster is being created [elapsed=00s]"), + call("Cluster test-cluster is being created [elapsed=00s]"), + call(dedent("""\ + The requested cluster operation for cluster test-cluster has errors: + some error + + --- + We are automatically retrying, and an automated alert has been created + + WARNING: Any non-deleted cluster may be using resources. + To avoid incuring cost on your cloud provider, delete the cluster using the following command: + lightning delete cluster test-cluster + + Contact support@lightning.ai for additional help + """)), + call("Cluster test-cluster is being created [elapsed=00s]"), + call(dedent("""\ + Cluster test-cluster is now running and ready to use. + To launch an app on this cluster use: lightning run app app.py --cloud --cluster-id test-cluster + """) + ), + ] + for i, (expected, actual) in enumerate(zip(expected_calls, echo.call_args_list)): + assert expected == actual, f"call {i} did not match" @mock.patch("click.echo") - def test_echo_state_change_on_desired_deleted(self, echo: MagicMock): + def test_echo_state_change_on_desired_deleted(self, echo: MagicMock, spec): client = FakeLightningClient( - list_responses=[ - V1ListClustersResponse( - clusters=[ - Externalv1Cluster( - id="test-cluster", - status=V1ClusterStatus( - phase=state, - ), - ), - ] + get_responses=[ + V1GetClusterResponse( + name="test-cluster", + status=V1ClusterStatus(phase=state), + spec=spec, ) for state in [ V1ClusterState.RUNNING, @@ -231,13 +330,22 @@ def test_echo_state_change_on_desired_deleted(self, echo: MagicMock): poll_duration=0.1, ) - assert client.list_call_count == 4 - assert echo.call_count == 4 - echo.assert_has_calls( - [ - call("Cluster test-cluster is terminating"), - call("Cluster test-cluster is terminating"), - call("Cluster test-cluster is terminating"), - call("Cluster test-cluster has been deleted."), - ] - ) + assert client.get_call_count == 4 + assert echo.call_count == 5 + expected_calls = [ + call("Waiting for cluster to be deleted..."), + call("Cluster test-cluster is being deleted [elapsed=00s]"), + call("Cluster test-cluster is being deleted [elapsed=00s]"), + call("Cluster test-cluster is being deleted [elapsed=00s]"), + call(dedent(f"""\ + Cluster test-cluster has been successfully deleted, and almost all AWS resources have been removed + + For safety purposes we kept the S3 bucket associated with the cluster: test-bucket + + You may want to delete it manually using the AWS CLI: + aws s3 rb --force s3://test-bucket + """) + ), + ] + for i, (expected, actual) in enumerate(zip(expected_calls, echo.call_args_list)): + assert expected == actual, f"Call {i} did not match" From 2e1e2724af2c11eafed84042ac596e2a0eede59a Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 5 Nov 2022 22:51:14 +0000 Subject: [PATCH 06/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/lightning_app/cli/cmd_clusters.py | 44 ++++++++++++----- tests/tests_app/cli/test_cmd_clusters.py | 62 ++++++++++++++++-------- 2 files changed, 73 insertions(+), 33 deletions(-) diff --git a/src/lightning_app/cli/cmd_clusters.py b/src/lightning_app/cli/cmd_clusters.py index 853671550bbd9..41cff2cce22ab 100644 --- a/src/lightning_app/cli/cmd_clusters.py +++ b/src/lightning_app/cli/cmd_clusters.py @@ -149,7 +149,9 @@ def create( click.echo("cluster unchanged") resp = self.api_client.cluster_service_create_cluster(body=new_body) - click.echo(dedent(f"""\ + click.echo( + dedent( + f"""\ BYOC cluster creation triggered successfully! This can take up to an hour to complete. @@ -161,7 +163,9 @@ def create( To delete the cluster run: lightning delete cluster {cluster_name} - """)) + """ + ) + ) if not do_async: try: _wait_for_cluster_state(self.api_client, resp.id, V1ClusterState.RUNNING) @@ -194,7 +198,9 @@ def delete(self, cluster_id: str, force: bool = False, do_async: bool = False) - bucket_name = resp.spec.driver.kubernetes.aws.bucket_name self.api_client.cluster_service_delete_cluster(id=cluster_id, force=force) - click.echo(dedent(f"""\ + click.echo( + dedent( + f"""\ Cluster deletion triggered successfully For safety purposes we will not delete anything in the S3 bucket associated with the cluster: @@ -202,7 +208,9 @@ def delete(self, cluster_id: str, force: bool = False, do_async: bool = False) - You may want to delete it manually using the AWS CLI: aws s3 rb --force s3://{bucket_name} - """)) + """ + ) + ) if not do_async: try: @@ -250,7 +258,9 @@ def _wait_for_cluster_state( raise else: state_str = ClusterState.from_api(target_state) - raise click.ClickException(dedent(f"""\ + raise click.ClickException( + dedent( + f"""\ The cluster has not entered the {state_str} state within {_format_elapsed_seconds(timeout)}. The cluster may eventually be {state_str} afterwards, please check its status using: @@ -260,7 +270,9 @@ def _wait_for_cluster_state( lightning show cluster logs {cluster_id} Contact support@lightning.ai for additional help - """)) + """ + ) + ) def _check_cluster_name_is_valid(_ctx: Any, _param: Any, value: str) -> str: @@ -290,7 +302,8 @@ def _cluster_status_long(cluster: V1GetClusterResponse, desired_state: V1Cluster duration = _format_elapsed_seconds(elapsed) if current_state == V1ClusterState.FAILED: - return dedent(f"""\ + return dedent( + f"""\ The requested cluster operation for cluster {cluster_name} has errors: {current_reason} @@ -302,26 +315,31 @@ def _cluster_status_long(cluster: V1GetClusterResponse, desired_state: V1Cluster lightning delete cluster {cluster_name} Contact support@lightning.ai for additional help - """) + """ + ) if desired_state == current_state == V1ClusterState.RUNNING: - return dedent(f"""\ + return dedent( + f"""\ Cluster {cluster_name} is now running and ready to use. To launch an app on this cluster use: lightning run app app.py --cloud --cluster-id {cluster_name} - """) + """ + ) if desired_state == V1ClusterState.RUNNING: return f"Cluster {cluster_name} is being created [elapsed={duration}]" if desired_state == current_state == V1ClusterState.DELETED: - return dedent(f"""\ + return dedent( + f"""\ Cluster {cluster_name} has been successfully deleted, and almost all AWS resources have been removed For safety purposes we kept the S3 bucket associated with the cluster: {bucket_name} You may want to delete it manually using the AWS CLI: aws s3 rb --force s3://{bucket_name} - """) + """ + ) if desired_state == V1ClusterState.DELETED: return f"Cluster {cluster_name} is being deleted [elapsed={duration}]" @@ -330,7 +348,7 @@ def _cluster_status_long(cluster: V1GetClusterResponse, desired_state: V1Cluster def _format_elapsed_seconds(seconds: int) -> str: - """Turns seconds into a duration string + """Turns seconds into a duration string. >>> _format_elapsed_seconds(5) 05s diff --git a/tests/tests_app/cli/test_cmd_clusters.py b/tests/tests_app/cli/test_cmd_clusters.py index 29ac0e321a1ad..c2b3e7e661060 100644 --- a/tests/tests_app/cli/test_cmd_clusters.py +++ b/tests/tests_app/cli/test_cmd_clusters.py @@ -1,6 +1,6 @@ +from textwrap import dedent from unittest import mock from unittest.mock import call, MagicMock -from textwrap import dedent import click import pytest @@ -14,8 +14,8 @@ V1ClusterType, V1CreateClusterRequest, V1CreateClusterResponse, - V1KubernetesClusterDriver, V1GetClusterResponse, + V1KubernetesClusterDriver, ) from lightning_app.cli import cmd_clusters @@ -59,8 +59,10 @@ def cluster_service_get_cluster(self, id: str): class Test_create: @mock.patch("click.echo") @mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) - @mock.patch("lightning_app.utilities.network.LightningClient.cluster_service_create_cluster", - MagicMock(return_value=V1CreateClusterResponse(id="test-cluster"))) + @mock.patch( + "lightning_app.utilities.network.LightningClient.cluster_service_create_cluster", + MagicMock(return_value=V1CreateClusterResponse(id="test-cluster")), + ) def test_delete_cluster_output(self, echo: mock.MagicMock, async_or_interrupt): cluster_manager = AWSClusterManager() cluster_manager.create( @@ -72,7 +74,9 @@ def test_delete_cluster_output(self, echo: mock.MagicMock, async_or_interrupt): ) expected_output = [ - call(dedent(f"""\ + call( + dedent( + f"""\ BYOC cluster creation triggered successfully! This can take up to an hour to complete. @@ -84,13 +88,14 @@ def test_delete_cluster_output(self, echo: mock.MagicMock, async_or_interrupt): To delete the cluster run: lightning delete cluster test-cluster - """)), + """ + ) + ), call("Cluster will be created in the background!"), ] for i, (expected, actual) in enumerate(zip(expected_output, echo.call_args_list)): assert expected == actual, f"Call {i} does not match" - @mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) @mock.patch("lightning_app.utilities.network.LightningClient.cluster_service_create_cluster") def test_create_cluster_api(self, api: mock.MagicMock, async_or_interrupt): @@ -150,7 +155,9 @@ def test_delete_cluster_output( cluster_manager.delete(cluster_id="test-7", do_async=async_or_interrupt) expected_output = [ - call(dedent(f"""\ + call( + dedent( + f"""\ Cluster deletion triggered successfully For safety purposes we will not delete anything in the S3 bucket associated with the cluster: @@ -158,7 +165,9 @@ def test_delete_cluster_output( You may want to delete it manually using the AWS CLI: aws s3 rb --force s3://test-bucket - """)), + """ + ) + ), call("Cluster will be deleted in the background!"), ] for i, (expected, actual) in enumerate(zip(expected_output, echo.call_args_list)): @@ -167,13 +176,14 @@ def test_delete_cluster_output( @mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) @mock.patch("lightning_app.utilities.network.LightningClient.cluster_service_delete_cluster") @mock.patch("lightning_app.utilities.network.LightningClient.cluster_service_get_cluster") - def test_delete_cluster_api(self, api_get:mock.MagicMock, api_delete: mock.MagicMock, async_or_interrupt, spec): + def test_delete_cluster_api(self, api_get: mock.MagicMock, api_delete: mock.MagicMock, async_or_interrupt, spec): api_get.return_value = V1GetClusterResponse(spec=spec) cluster_manager = AWSClusterManager() cluster_manager.delete(cluster_id="test-7", do_async=async_or_interrupt) api_delete.assert_called_once_with(id="test-7", force=False) + class Test_check_cluster_name_is_valid: @pytest.mark.parametrize("name", ["test-7", "0wildgoat"]) def test_valid(self, name): @@ -229,7 +239,8 @@ def test_times_out(self, target_state, spec): if target_state == V1ClusterState.RUNNING: expected_state = "running" - assert str(e.value) == dedent(f"""\ + assert str(e.value) == dedent( + f"""\ The cluster has not entered the {expected_state} state within 0.1s. The cluster may eventually be {expected_state} afterwards, please check its status using: @@ -239,7 +250,8 @@ def test_times_out(self, target_state, spec): lightning show cluster logs test-cluster Contact support@lightning.ai for additional help - """) + """ + ) @mock.patch("click.echo") def test_echo_state_change_on_desired_running(self, echo: MagicMock, spec): @@ -281,7 +293,9 @@ def test_echo_state_change_on_desired_running(self, echo: MagicMock, spec): call("Cluster test-cluster is being created [elapsed=00s]"), call("Cluster test-cluster is being created [elapsed=00s]"), call("Cluster test-cluster is being created [elapsed=00s]"), - call(dedent("""\ + call( + dedent( + """\ The requested cluster operation for cluster test-cluster has errors: some error @@ -293,13 +307,18 @@ def test_echo_state_change_on_desired_running(self, echo: MagicMock, spec): lightning delete cluster test-cluster Contact support@lightning.ai for additional help - """)), + """ + ) + ), call("Cluster test-cluster is being created [elapsed=00s]"), - call(dedent("""\ + call( + dedent( + """\ Cluster test-cluster is now running and ready to use. To launch an app on this cluster use: lightning run app app.py --cloud --cluster-id test-cluster - """) - ), + """ + ) + ), ] for i, (expected, actual) in enumerate(zip(expected_calls, echo.call_args_list)): assert expected == actual, f"call {i} did not match" @@ -337,15 +356,18 @@ def test_echo_state_change_on_desired_deleted(self, echo: MagicMock, spec): call("Cluster test-cluster is being deleted [elapsed=00s]"), call("Cluster test-cluster is being deleted [elapsed=00s]"), call("Cluster test-cluster is being deleted [elapsed=00s]"), - call(dedent(f"""\ + call( + dedent( + f"""\ Cluster test-cluster has been successfully deleted, and almost all AWS resources have been removed For safety purposes we kept the S3 bucket associated with the cluster: test-bucket You may want to delete it manually using the AWS CLI: aws s3 rb --force s3://test-bucket - """) - ), + """ + ) + ), ] for i, (expected, actual) in enumerate(zip(expected_calls, echo.call_args_list)): assert expected == actual, f"Call {i} did not match" From 296966851b71edc0f357972770fc3ca8329af0b3 Mon Sep 17 00:00:00 2001 From: Luca Furst Date: Sat, 5 Nov 2022 18:54:32 -0400 Subject: [PATCH 07/23] fixup! Improve user messages and test output --- src/lightning_app/cli/cmd_clusters.py | 4 ++-- tests/tests_app/cli/test_cmd_clusters.py | 12 ++++-------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/lightning_app/cli/cmd_clusters.py b/src/lightning_app/cli/cmd_clusters.py index 41cff2cce22ab..2a8dfe1952c8f 100644 --- a/src/lightning_app/cli/cmd_clusters.py +++ b/src/lightning_app/cli/cmd_clusters.py @@ -172,7 +172,7 @@ def create( return except KeyboardInterrupt: pass - click.echo("Cluster will be created in the background!") + click.echo("\nCluster will be created in the background!") def get_clusters(self) -> ClusterList: resp = self.api_client.cluster_service_list_clusters(phase_not_in=[V1ClusterState.DELETED]) @@ -218,7 +218,7 @@ def delete(self, cluster_id: str, force: bool = False, do_async: bool = False) - return except KeyboardInterrupt: pass - click.echo("Cluster will be deleted in the background!") + click.echo("\nCluster will be deleted in the background!") def _wait_for_cluster_state( diff --git a/tests/tests_app/cli/test_cmd_clusters.py b/tests/tests_app/cli/test_cmd_clusters.py index c2b3e7e661060..af62b1c5c3682 100644 --- a/tests/tests_app/cli/test_cmd_clusters.py +++ b/tests/tests_app/cli/test_cmd_clusters.py @@ -88,10 +88,8 @@ def test_delete_cluster_output(self, echo: mock.MagicMock, async_or_interrupt): To delete the cluster run: lightning delete cluster test-cluster - """ - ) - ), - call("Cluster will be created in the background!"), + """)), + call("\nCluster will be created in the background!"), ] for i, (expected, actual) in enumerate(zip(expected_output, echo.call_args_list)): assert expected == actual, f"Call {i} does not match" @@ -165,10 +163,8 @@ def test_delete_cluster_output( You may want to delete it manually using the AWS CLI: aws s3 rb --force s3://test-bucket - """ - ) - ), - call("Cluster will be deleted in the background!"), + """)), + call("\nCluster will be deleted in the background!"), ] for i, (expected, actual) in enumerate(zip(expected_output, echo.call_args_list)): assert expected == actual, f"Call {i} does not match" From 5d8bf349edc0bd5c04d04378ac5c94830635df71 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 5 Nov 2022 22:57:03 +0000 Subject: [PATCH 08/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/tests_app/cli/test_cmd_clusters.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/tests_app/cli/test_cmd_clusters.py b/tests/tests_app/cli/test_cmd_clusters.py index af62b1c5c3682..aeb8ca5d09af9 100644 --- a/tests/tests_app/cli/test_cmd_clusters.py +++ b/tests/tests_app/cli/test_cmd_clusters.py @@ -88,7 +88,9 @@ def test_delete_cluster_output(self, echo: mock.MagicMock, async_or_interrupt): To delete the cluster run: lightning delete cluster test-cluster - """)), + """ + ) + ), call("\nCluster will be created in the background!"), ] for i, (expected, actual) in enumerate(zip(expected_output, echo.call_args_list)): @@ -163,7 +165,9 @@ def test_delete_cluster_output( You may want to delete it manually using the AWS CLI: aws s3 rb --force s3://test-bucket - """)), + """ + ) + ), call("\nCluster will be deleted in the background!"), ] for i, (expected, actual) in enumerate(zip(expected_output, echo.call_args_list)): From a8551342e1de9faa0472c59e9bfe2892c424ee5f Mon Sep 17 00:00:00 2001 From: Luca Furst Date: Sat, 5 Nov 2022 19:02:49 -0400 Subject: [PATCH 09/23] fixup! fixup! Improve user messages and test output --- src/lightning_app/cli/lightning_cli_delete.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning_app/cli/lightning_cli_delete.py b/src/lightning_app/cli/lightning_cli_delete.py index 4b5327266bf15..a2d0d1cdeb1a4 100644 --- a/src/lightning_app/cli/lightning_cli_delete.py +++ b/src/lightning_app/cli/lightning_cli_delete.py @@ -13,7 +13,7 @@ def delete() -> None: @click.argument("cluster", type=str) @click.option( "--async", - "no_wait", + "do_async", type=bool, required=False, default=False, From f469b8d6b3a829f9db0ab31639e8a23d5701957e Mon Sep 17 00:00:00 2001 From: Luca Furst Date: Sat, 5 Nov 2022 19:20:44 -0400 Subject: [PATCH 10/23] fix mypy and flake8 --- src/lightning_app/cli/cmd_clusters.py | 4 ++-- tests/tests_app/cli/test_cmd_clusters.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/lightning_app/cli/cmd_clusters.py b/src/lightning_app/cli/cmd_clusters.py index 2a8dfe1952c8f..e8ce60c744a6d 100644 --- a/src/lightning_app/cli/cmd_clusters.py +++ b/src/lightning_app/cli/cmd_clusters.py @@ -4,7 +4,7 @@ from datetime import datetime from enum import Enum from textwrap import dedent -from typing import Any, List +from typing import Any, List, Union import click import lightning_cloud @@ -347,7 +347,7 @@ def _cluster_status_long(cluster: V1GetClusterResponse, desired_state: V1Cluster raise click.ClickException(f"Unknown cluster desired state {desired_state}") -def _format_elapsed_seconds(seconds: int) -> str: +def _format_elapsed_seconds(seconds: Union[float,int]) -> str: """Turns seconds into a duration string. >>> _format_elapsed_seconds(5) diff --git a/tests/tests_app/cli/test_cmd_clusters.py b/tests/tests_app/cli/test_cmd_clusters.py index aeb8ca5d09af9..1ecb29d7aeda1 100644 --- a/tests/tests_app/cli/test_cmd_clusters.py +++ b/tests/tests_app/cli/test_cmd_clusters.py @@ -76,7 +76,7 @@ def test_delete_cluster_output(self, echo: mock.MagicMock, async_or_interrupt): expected_output = [ call( dedent( - f"""\ + """\ BYOC cluster creation triggered successfully! This can take up to an hour to complete. @@ -157,7 +157,7 @@ def test_delete_cluster_output( expected_output = [ call( dedent( - f"""\ + """\ Cluster deletion triggered successfully For safety purposes we will not delete anything in the S3 bucket associated with the cluster: @@ -358,7 +358,7 @@ def test_echo_state_change_on_desired_deleted(self, echo: MagicMock, spec): call("Cluster test-cluster is being deleted [elapsed=00s]"), call( dedent( - f"""\ + """\ Cluster test-cluster has been successfully deleted, and almost all AWS resources have been removed For safety purposes we kept the S3 bucket associated with the cluster: test-bucket From 9404c32b122ab0a907e8348ab9f34fc6ff21917f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 5 Nov 2022 23:22:08 +0000 Subject: [PATCH 11/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/lightning_app/cli/cmd_clusters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning_app/cli/cmd_clusters.py b/src/lightning_app/cli/cmd_clusters.py index e8ce60c744a6d..25f2bf78207d8 100644 --- a/src/lightning_app/cli/cmd_clusters.py +++ b/src/lightning_app/cli/cmd_clusters.py @@ -347,7 +347,7 @@ def _cluster_status_long(cluster: V1GetClusterResponse, desired_state: V1Cluster raise click.ClickException(f"Unknown cluster desired state {desired_state}") -def _format_elapsed_seconds(seconds: Union[float,int]) -> str: +def _format_elapsed_seconds(seconds: Union[float, int]) -> str: """Turns seconds into a duration string. >>> _format_elapsed_seconds(5) From 7bd79b76b31f9d151b87104483bdf7e122421aa2 Mon Sep 17 00:00:00 2001 From: Luca Furst Date: Mon, 7 Nov 2022 09:36:03 -0500 Subject: [PATCH 12/23] PR feedback --- src/lightning_app/cli/cmd_clusters.py | 14 +++++++------- src/lightning_app/utilities/network.py | 2 +- tests/tests_app/cli/test_cmd_clusters.py | 12 ++++++------ 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/lightning_app/cli/cmd_clusters.py b/src/lightning_app/cli/cmd_clusters.py index 25f2bf78207d8..9accd322a48be 100644 --- a/src/lightning_app/cli/cmd_clusters.py +++ b/src/lightning_app/cli/cmd_clusters.py @@ -225,8 +225,8 @@ def _wait_for_cluster_state( api_client: LightningClient, cluster_id: str, target_state: V1ClusterState, - timeout: int = MAX_CLUSTER_WAIT_TIME, - poll_duration: int = 10, + timeout_seconds: int = MAX_CLUSTER_WAIT_TIME, + poll_duration_seconds: int = 10, ) -> None: """_wait_for_cluster_state waits until the provided cluster has reached a desired state, or failed. @@ -237,20 +237,20 @@ def _wait_for_cluster_state( api_client: LightningClient used for polling cluster_id: Specifies the cluster to wait for target_state: Specifies the desired state the target cluster needs to meet - timeout: Maximum duration to wait (in seconds) - poll_duration: duration between polling for the cluster state (in seconds) + timeout_seconds: Maximum duration to wait + poll_duration_seconds: duration between polling for the cluster state """ start = time.time() elapsed = 0 click.echo(f"Waiting for cluster to be {ClusterState.from_api(target_state)}...") - while elapsed < timeout: + while elapsed < timeout_seconds: try: resp: V1GetClusterResponse = api_client.cluster_service_get_cluster(id=cluster_id) click.echo(_cluster_status_long(cluster=resp, desired_state=target_state, elapsed=elapsed)) if resp.status.phase == target_state: break - time.sleep(poll_duration) + time.sleep(poll_duration_seconds) elapsed = int(time.time() - start) except lightning_cloud.openapi.rest.ApiException as e: if e.status == 404 and target_state == V1ClusterState.DELETED: @@ -261,7 +261,7 @@ def _wait_for_cluster_state( raise click.ClickException( dedent( f"""\ - The cluster has not entered the {state_str} state within {_format_elapsed_seconds(timeout)}. + The cluster has not entered the {state_str} state within {_format_elapsed_seconds(timeout_seconds)}. The cluster may eventually be {state_str} afterwards, please check its status using: lighting list clusters diff --git a/src/lightning_app/utilities/network.py b/src/lightning_app/utilities/network.py index 2ea9de3bae239..ffdda139065ec 100644 --- a/src/lightning_app/utilities/network.py +++ b/src/lightning_app/utilities/network.py @@ -118,7 +118,7 @@ def __new__(mcs, name, bases, dct): return new_class -class LightningClient(GridRestClient): # , metaclass=_MethodsRetryWrapperMeta): +class LightningClient(GridRestClient , metaclass=_MethodsRetryWrapperMeta): """The LightningClient is a wrapper around the GridRestClient. It wraps all methods to monitor connection exceptions and employs a retry strategy. diff --git a/tests/tests_app/cli/test_cmd_clusters.py b/tests/tests_app/cli/test_cmd_clusters.py index 1ecb29d7aeda1..79b8652358e97 100644 --- a/tests/tests_app/cli/test_cmd_clusters.py +++ b/tests/tests_app/cli/test_cmd_clusters.py @@ -216,7 +216,7 @@ def test_happy_path(self, target_state, previous_state, spec): for state in [previous_state, target_state] ] ) - cmd_clusters._wait_for_cluster_state(client, "test-cluster", target_state, poll_duration=0.1) + cmd_clusters._wait_for_cluster_state(client, "test-cluster", target_state, poll_duration_seconds=0.1) assert client.get_call_count == 2 @pytest.mark.parametrize("target_state", [V1ClusterState.RUNNING, V1ClusterState.DELETED]) @@ -232,7 +232,7 @@ def test_times_out(self, target_state, spec): consume=False, ) with pytest.raises(click.ClickException) as e: - cmd_clusters._wait_for_cluster_state(client, "test-cluster", target_state, timeout=0.1, poll_duration=0.1) + cmd_clusters._wait_for_cluster_state(client, "test-cluster", target_state, timeout_seconds=0.1, poll_duration_seconds=0.1) if target_state == V1ClusterState.DELETED: expected_state = "deleted" @@ -281,8 +281,8 @@ def test_echo_state_change_on_desired_running(self, echo: MagicMock, spec): client, "test-cluster", target_state=V1ClusterState.RUNNING, - timeout=0.6, - poll_duration=0.1, + timeout_seconds=0.6, + poll_duration_seconds=0.1, ) assert client.get_call_count == 7 @@ -345,8 +345,8 @@ def test_echo_state_change_on_desired_deleted(self, echo: MagicMock, spec): client, "test-cluster", target_state=V1ClusterState.DELETED, - timeout=0.4, - poll_duration=0.1, + timeout_seconds=0.4, + poll_duration_seconds=0.1, ) assert client.get_call_count == 4 From 5d9b2f0e589223edb47e46a8947fccdbb11a8dfd Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 7 Nov 2022 14:37:48 +0000 Subject: [PATCH 13/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/lightning_app/utilities/network.py | 2 +- tests/tests_app/cli/test_cmd_clusters.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/lightning_app/utilities/network.py b/src/lightning_app/utilities/network.py index ffdda139065ec..f8d80af2dcc0c 100644 --- a/src/lightning_app/utilities/network.py +++ b/src/lightning_app/utilities/network.py @@ -118,7 +118,7 @@ def __new__(mcs, name, bases, dct): return new_class -class LightningClient(GridRestClient , metaclass=_MethodsRetryWrapperMeta): +class LightningClient(GridRestClient, metaclass=_MethodsRetryWrapperMeta): """The LightningClient is a wrapper around the GridRestClient. It wraps all methods to monitor connection exceptions and employs a retry strategy. diff --git a/tests/tests_app/cli/test_cmd_clusters.py b/tests/tests_app/cli/test_cmd_clusters.py index 79b8652358e97..b2871250e5e45 100644 --- a/tests/tests_app/cli/test_cmd_clusters.py +++ b/tests/tests_app/cli/test_cmd_clusters.py @@ -232,7 +232,9 @@ def test_times_out(self, target_state, spec): consume=False, ) with pytest.raises(click.ClickException) as e: - cmd_clusters._wait_for_cluster_state(client, "test-cluster", target_state, timeout_seconds=0.1, poll_duration_seconds=0.1) + cmd_clusters._wait_for_cluster_state( + client, "test-cluster", target_state, timeout_seconds=0.1, poll_duration_seconds=0.1 + ) if target_state == V1ClusterState.DELETED: expected_state = "deleted" From 3331cc1b368d095e5382a4fea21ce13159ca02e8 Mon Sep 17 00:00:00 2001 From: Luca Furst Date: Mon, 7 Nov 2022 09:42:02 -0500 Subject: [PATCH 14/23] format doctest --- src/lightning_app/cli/cmd_clusters.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lightning_app/cli/cmd_clusters.py b/src/lightning_app/cli/cmd_clusters.py index 9accd322a48be..7f82f7f693f51 100644 --- a/src/lightning_app/cli/cmd_clusters.py +++ b/src/lightning_app/cli/cmd_clusters.py @@ -351,9 +351,9 @@ def _format_elapsed_seconds(seconds: Union[float, int]) -> str: """Turns seconds into a duration string. >>> _format_elapsed_seconds(5) - 05s + '05s' >>> _format_elapsed_seconds(60) - 01m00s + '01m00s' """ minutes, seconds = divmod(seconds, 60) return (f"{minutes:02}m" if minutes else "") + f"{seconds:02}s" From 36a966cd63a68d105bea2b61b68d83b09581df6f Mon Sep 17 00:00:00 2001 From: Luca Furst Date: Mon, 7 Nov 2022 09:58:16 -0500 Subject: [PATCH 15/23] Fix tests in other files --- tests/tests_app/cli/test_cli.py | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/tests/tests_app/cli/test_cli.py b/tests/tests_app/cli/test_cli.py index f5d3a31b9240c..3a69c8883c2b4 100644 --- a/tests/tests_app/cli/test_cli.py +++ b/tests/tests_app/cli/test_cli.py @@ -100,14 +100,7 @@ def test_main_lightning_cli_help(): @mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) @mock.patch("lightning_app.cli.cmd_clusters.AWSClusterManager.create") -@pytest.mark.parametrize( - "extra_arguments,expected_cost_savings_mode", - [ - ([], True), - (["--enable-performance"], False), - ], -) -def test_create_cluster(create_command: mock.MagicMock, extra_arguments, expected_cost_savings_mode): +def test_create_cluster(create_command: mock.MagicMock): runner = CliRunner() runner.invoke( create_cluster, @@ -120,7 +113,6 @@ def test_create_cluster(create_command: mock.MagicMock, extra_arguments, expecte "--role-arn", "arn:aws:iam::1234567890:role/lai-byoc", ] - + extra_arguments, ) create_command.assert_called_once_with( @@ -129,8 +121,8 @@ def test_create_cluster(create_command: mock.MagicMock, extra_arguments, expecte role_arn="arn:aws:iam::1234567890:role/lai-byoc", external_id="dummy", edit_before_creation=False, - cost_savings=expected_cost_savings_mode, - wait=True, + cost_savings=True, + do_async=False, ) @@ -158,7 +150,7 @@ def test_delete_cluster(delete: mock.MagicMock): runner = CliRunner() runner.invoke(delete_cluster, ["test-7"]) - delete.assert_called_once_with(cluster_id="test-7", force=False, wait=True) + delete.assert_called_once_with(cluster_id="test-7", force=False, do_async=False) @mock.patch("lightning_app.utilities.login.Auth._run_server") From d71da41148263eaf72c118c4c17b9eced5aaeaea Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 7 Nov 2022 15:00:17 +0000 Subject: [PATCH 16/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- tests/tests_app/cli/test_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/tests_app/cli/test_cli.py b/tests/tests_app/cli/test_cli.py index 3a69c8883c2b4..171dc3fb6e575 100644 --- a/tests/tests_app/cli/test_cli.py +++ b/tests/tests_app/cli/test_cli.py @@ -112,7 +112,7 @@ def test_create_cluster(create_command: mock.MagicMock): "dummy", "--role-arn", "arn:aws:iam::1234567890:role/lai-byoc", - ] + ], ) create_command.assert_called_once_with( From 7a2ebba532ebc21603782854307980603fa56cd5 Mon Sep 17 00:00:00 2001 From: Luca Furst Date: Mon, 7 Nov 2022 10:46:02 -0500 Subject: [PATCH 17/23] Fixup param name --- docs/source-app/workflows/byoc/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source-app/workflows/byoc/index.rst b/docs/source-app/workflows/byoc/index.rst index 1134301e6dd43..c1dae7859e795 100644 --- a/docs/source-app/workflows/byoc/index.rst +++ b/docs/source-app/workflows/byoc/index.rst @@ -78,7 +78,7 @@ Parameters +------------------------+----------------------------------------------------------------------------------------------------+ | region | AWS region containing compute resources | +------------------------+----------------------------------------------------------------------------------------------------+ -| async | Cluster creation will happen in the background. | +| do-async | Cluster creation will happen in the background. | +------------------------+----------------------------------------------------------------------------------------------------+ ---- From cdab08455899f5f3915a981349c188d5c2045796 Mon Sep 17 00:00:00 2001 From: Luca Furst Date: Wed, 9 Nov 2022 11:51:22 -0500 Subject: [PATCH 18/23] Address some feedback --- docs/source-app/workflows/byoc/index.rst | 2 +- src/lightning_app/cli/cmd_clusters.py | 20 +++++++++++--------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/docs/source-app/workflows/byoc/index.rst b/docs/source-app/workflows/byoc/index.rst index c1dae7859e795..1134301e6dd43 100644 --- a/docs/source-app/workflows/byoc/index.rst +++ b/docs/source-app/workflows/byoc/index.rst @@ -78,7 +78,7 @@ Parameters +------------------------+----------------------------------------------------------------------------------------------------+ | region | AWS region containing compute resources | +------------------------+----------------------------------------------------------------------------------------------------+ -| do-async | Cluster creation will happen in the background. | +| async | Cluster creation will happen in the background. | +------------------------+----------------------------------------------------------------------------------------------------+ ---- diff --git a/src/lightning_app/cli/cmd_clusters.py b/src/lightning_app/cli/cmd_clusters.py index 7f82f7f693f51..a9e6e5aa7f7fc 100644 --- a/src/lightning_app/cli/cmd_clusters.py +++ b/src/lightning_app/cli/cmd_clusters.py @@ -116,7 +116,7 @@ def create( region: AWS region containing compute resources external_id: AWS IAM Role external ID edit_before_creation: Enables interactive editing of requests before submitting it to Lightning AI. - wait: Waits for the cluster to be in a RUNNING state. Only use this for debugging. + do_async: Triggers cluster creation in the background and exits """ performance_profile = V1ClusterPerformanceProfile.DEFAULT if cost_savings: @@ -166,13 +166,14 @@ def create( """ ) ) - if not do_async: + background_message = "\nCluster will be created in the background!" + if do_async: + click.echo(background_message) + else: try: _wait_for_cluster_state(self.api_client, resp.id, V1ClusterState.RUNNING) - return except KeyboardInterrupt: - pass - click.echo("\nCluster will be created in the background!") + click.echo(background_message) def get_clusters(self) -> ClusterList: resp = self.api_client.cluster_service_list_clusters(phase_not_in=[V1ClusterState.DELETED]) @@ -212,13 +213,14 @@ def delete(self, cluster_id: str, force: bool = False, do_async: bool = False) - ) ) - if not do_async: + background_message = "\nCluster will be deleted in the background!" + if do_async: + click.echo(background_message) + else: try: _wait_for_cluster_state(self.api_client, cluster_id, V1ClusterState.DELETED) - return except KeyboardInterrupt: - pass - click.echo("\nCluster will be deleted in the background!") + click.echo(background_message) def _wait_for_cluster_state( From 5c12f39c1f115cc108da0301db0d598a5587d6e3 Mon Sep 17 00:00:00 2001 From: Luca Furst Date: Wed, 9 Nov 2022 17:28:59 -0500 Subject: [PATCH 19/23] Remove frivolous tests --- tests/tests_app/cli/test_cmd_clusters.py | 279 +++-------------------- 1 file changed, 35 insertions(+), 244 deletions(-) diff --git a/tests/tests_app/cli/test_cmd_clusters.py b/tests/tests_app/cli/test_cmd_clusters.py index b2871250e5e45..a39610f0bb4a8 100644 --- a/tests/tests_app/cli/test_cmd_clusters.py +++ b/tests/tests_app/cli/test_cmd_clusters.py @@ -1,6 +1,5 @@ -from textwrap import dedent from unittest import mock -from unittest.mock import call, MagicMock +from unittest.mock import MagicMock import click import pytest @@ -13,7 +12,6 @@ V1ClusterStatus, V1ClusterType, V1CreateClusterRequest, - V1CreateClusterResponse, V1GetClusterResponse, V1KubernetesClusterDriver, ) @@ -56,76 +54,36 @@ def cluster_service_get_cluster(self, id: str): return self.get_responses[0] -class Test_create: - @mock.patch("click.echo") - @mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) - @mock.patch( - "lightning_app.utilities.network.LightningClient.cluster_service_create_cluster", - MagicMock(return_value=V1CreateClusterResponse(id="test-cluster")), +@mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) +@mock.patch("lightning_app.utilities.network.LightningClient.cluster_service_create_cluster") +def test_create_cluster_api(api: mock.MagicMock, async_or_interrupt): + cluster_manager = AWSClusterManager() + cluster_manager.create( + cluster_name="test-7", + external_id="dummy", + role_arn="arn:aws:iam::1234567890:role/lai-byoc", + region="us-west-2", + do_async=async_or_interrupt, ) - def test_delete_cluster_output(self, echo: mock.MagicMock, async_or_interrupt): - cluster_manager = AWSClusterManager() - cluster_manager.create( - cluster_name="test-cluster", - external_id="dummy", - role_arn="arn:aws:iam::1234567890:role/lai-byoc", - region="us-west-2", - do_async=async_or_interrupt, - ) - - expected_output = [ - call( - dedent( - """\ - BYOC cluster creation triggered successfully! - This can take up to an hour to complete. - - To view the status of your clusters use: - lightning list clusters - - To view cluster logs use: - lightning show cluster logs test-cluster - - To delete the cluster run: - lightning delete cluster test-cluster - """ - ) - ), - call("\nCluster will be created in the background!"), - ] - for i, (expected, actual) in enumerate(zip(expected_output, echo.call_args_list)): - assert expected == actual, f"Call {i} does not match" - - @mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) - @mock.patch("lightning_app.utilities.network.LightningClient.cluster_service_create_cluster") - def test_create_cluster_api(self, api: mock.MagicMock, async_or_interrupt): - cluster_manager = AWSClusterManager() - cluster_manager.create( - cluster_name="test-7", - external_id="dummy", - role_arn="arn:aws:iam::1234567890:role/lai-byoc", - region="us-west-2", - do_async=async_or_interrupt, - ) - api.assert_called_once_with( - body=V1CreateClusterRequest( - name="test-7", - spec=V1ClusterSpec( - cluster_type=V1ClusterType.BYOC, - performance_profile=V1ClusterPerformanceProfile.DEFAULT, - driver=V1ClusterDriver( - kubernetes=V1KubernetesClusterDriver( - aws=V1AWSClusterDriverSpec( - region="us-west-2", - role_arn="arn:aws:iam::1234567890:role/lai-byoc", - external_id="dummy", - ) + api.assert_called_once_with( + body=V1CreateClusterRequest( + name="test-7", + spec=V1ClusterSpec( + cluster_type=V1ClusterType.BYOC, + performance_profile=V1ClusterPerformanceProfile.DEFAULT, + driver=V1ClusterDriver( + kubernetes=V1KubernetesClusterDriver( + aws=V1AWSClusterDriverSpec( + region="us-west-2", + role_arn="arn:aws:iam::1234567890:role/lai-byoc", + external_id="dummy", ) - ), + ) ), - ) + ), ) + ) @mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) @@ -137,51 +95,15 @@ def test_list_clusters(api: mock.MagicMock): api.assert_called_once_with(phase_not_in=[V1ClusterState.DELETED]) -class Test_delete: - @mock.patch("click.echo") - @mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) - @mock.patch("lightning_app.utilities.network.LightningClient.cluster_service_delete_cluster", MagicMock()) - @mock.patch("lightning_app.utilities.network.LightningClient.cluster_service_get_cluster") - def test_delete_cluster_output( - self, - api_get: mock.MagicMock, - echo: mock.MagicMock, - async_or_interrupt, - spec, - ): - api_get.return_value = V1GetClusterResponse(spec=spec) - - cluster_manager = AWSClusterManager() - cluster_manager.delete(cluster_id="test-7", do_async=async_or_interrupt) - - expected_output = [ - call( - dedent( - """\ - Cluster deletion triggered successfully - - For safety purposes we will not delete anything in the S3 bucket associated with the cluster: - test-bucket - - You may want to delete it manually using the AWS CLI: - aws s3 rb --force s3://test-bucket - """ - ) - ), - call("\nCluster will be deleted in the background!"), - ] - for i, (expected, actual) in enumerate(zip(expected_output, echo.call_args_list)): - assert expected == actual, f"Call {i} does not match" - - @mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) - @mock.patch("lightning_app.utilities.network.LightningClient.cluster_service_delete_cluster") - @mock.patch("lightning_app.utilities.network.LightningClient.cluster_service_get_cluster") - def test_delete_cluster_api(self, api_get: mock.MagicMock, api_delete: mock.MagicMock, async_or_interrupt, spec): - api_get.return_value = V1GetClusterResponse(spec=spec) - cluster_manager = AWSClusterManager() - cluster_manager.delete(cluster_id="test-7", do_async=async_or_interrupt) +@mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) +@mock.patch("lightning_app.utilities.network.LightningClient.cluster_service_delete_cluster") +@mock.patch("lightning_app.utilities.network.LightningClient.cluster_service_get_cluster") +def test_delete_cluster_api(api_get: mock.MagicMock, api_delete: mock.MagicMock, async_or_interrupt, spec): + api_get.return_value = V1GetClusterResponse(spec=spec) + cluster_manager = AWSClusterManager() + cluster_manager.delete(cluster_id="test-7", do_async=async_or_interrupt) - api_delete.assert_called_once_with(id="test-7", force=False) + api_delete.assert_called_once_with(id="test-7", force=False) class Test_check_cluster_name_is_valid: @@ -241,135 +163,4 @@ def test_times_out(self, target_state, spec): if target_state == V1ClusterState.RUNNING: expected_state = "running" - assert str(e.value) == dedent( - f"""\ - The cluster has not entered the {expected_state} state within 0.1s. - - The cluster may eventually be {expected_state} afterwards, please check its status using: - lighting list clusters - - To view cluster logs use: - lightning show cluster logs test-cluster - - Contact support@lightning.ai for additional help - """ - ) - - @mock.patch("click.echo") - def test_echo_state_change_on_desired_running(self, echo: MagicMock, spec): - client = FakeLightningClient( - get_responses=[ - V1GetClusterResponse( - name="test-cluster", - status=V1ClusterStatus( - phase=state, - reason=reason, - ), - spec=spec, - ) - for state, reason in [ - (V1ClusterState.QUEUED, ""), - (V1ClusterState.PENDING, ""), - (V1ClusterState.PENDING, ""), - (V1ClusterState.PENDING, ""), - (V1ClusterState.FAILED, "some error"), - (V1ClusterState.PENDING, "retrying failure"), - (V1ClusterState.RUNNING, ""), - ] - ], - ) - - cmd_clusters._wait_for_cluster_state( - client, - "test-cluster", - target_state=V1ClusterState.RUNNING, - timeout_seconds=0.6, - poll_duration_seconds=0.1, - ) - - assert client.get_call_count == 7 - assert echo.call_count == 8 - expected_calls = [ - call("Waiting for cluster to be running..."), - call("Cluster test-cluster is being created [elapsed=00s]"), - call("Cluster test-cluster is being created [elapsed=00s]"), - call("Cluster test-cluster is being created [elapsed=00s]"), - call("Cluster test-cluster is being created [elapsed=00s]"), - call( - dedent( - """\ - The requested cluster operation for cluster test-cluster has errors: - some error - - --- - We are automatically retrying, and an automated alert has been created - - WARNING: Any non-deleted cluster may be using resources. - To avoid incuring cost on your cloud provider, delete the cluster using the following command: - lightning delete cluster test-cluster - - Contact support@lightning.ai for additional help - """ - ) - ), - call("Cluster test-cluster is being created [elapsed=00s]"), - call( - dedent( - """\ - Cluster test-cluster is now running and ready to use. - To launch an app on this cluster use: lightning run app app.py --cloud --cluster-id test-cluster - """ - ) - ), - ] - for i, (expected, actual) in enumerate(zip(expected_calls, echo.call_args_list)): - assert expected == actual, f"call {i} did not match" - - @mock.patch("click.echo") - def test_echo_state_change_on_desired_deleted(self, echo: MagicMock, spec): - client = FakeLightningClient( - get_responses=[ - V1GetClusterResponse( - name="test-cluster", - status=V1ClusterStatus(phase=state), - spec=spec, - ) - for state in [ - V1ClusterState.RUNNING, - V1ClusterState.RUNNING, - V1ClusterState.RUNNING, - V1ClusterState.DELETED, - ] - ], - ) - - cmd_clusters._wait_for_cluster_state( - client, - "test-cluster", - target_state=V1ClusterState.DELETED, - timeout_seconds=0.4, - poll_duration_seconds=0.1, - ) - - assert client.get_call_count == 4 - assert echo.call_count == 5 - expected_calls = [ - call("Waiting for cluster to be deleted..."), - call("Cluster test-cluster is being deleted [elapsed=00s]"), - call("Cluster test-cluster is being deleted [elapsed=00s]"), - call("Cluster test-cluster is being deleted [elapsed=00s]"), - call( - dedent( - """\ - Cluster test-cluster has been successfully deleted, and almost all AWS resources have been removed - - For safety purposes we kept the S3 bucket associated with the cluster: test-bucket - - You may want to delete it manually using the AWS CLI: - aws s3 rb --force s3://test-bucket - """ - ) - ), - ] - for i, (expected, actual) in enumerate(zip(expected_calls, echo.call_args_list)): - assert expected == actual, f"Call {i} did not match" + assert e.match(f"The cluster has not entered the {expected_state} state") From 51a9732430dbc64de5f4444c0d63e01722366551 Mon Sep 17 00:00:00 2001 From: Luca Furst Date: Thu, 17 Nov 2022 09:49:42 +0100 Subject: [PATCH 20/23] Fixup changelog --- src/lightning_app/CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 3f274cd8f2826..c45b7d538b832 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -20,10 +20,14 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Changed +- + + - Changed the root directory of the app (which gets uploaded) to be the folder containing the app file, rather than any parent folder containing a `.lightning` file ([#15654](https://github.com/Lightning-AI/lightning/pull/15654)) - Cluster creation and deletion now waits by default [#15458](https://github.com/Lightning-AI/lightning/pull/15458) + ### Deprecated - From e6cec27edab2280a5b54f5cc5f158bb2a3a87be8 Mon Sep 17 00:00:00 2001 From: Luca Furst Date: Mon, 21 Nov 2022 10:28:59 -0500 Subject: [PATCH 21/23] Use StrEnum from lightning_utilities --- src/lightning_app/cli/cmd_clusters.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lightning_app/cli/cmd_clusters.py b/src/lightning_app/cli/cmd_clusters.py index ec3cdbe484169..273beaeabc7c5 100644 --- a/src/lightning_app/cli/cmd_clusters.py +++ b/src/lightning_app/cli/cmd_clusters.py @@ -2,7 +2,7 @@ import re import time from datetime import datetime -from enum import Enum +from lightning_utilities.core.enums import StrEnum from textwrap import dedent from typing import Any, List, Union @@ -31,7 +31,7 @@ MAX_CLUSTER_WAIT_TIME = 5400 -class ClusterState(Enum): +class ClusterState(StrEnum): UNSPECIFIED = "unspecified" QUEUED = "queued" PENDING = "pending" @@ -45,7 +45,7 @@ def __str__(self) -> str: @classmethod def from_api(cls, status: V1ClusterState) -> "ClusterState": parsed = str(status).lower().split("_", maxsplit=2)[-1] - return cls(parsed) + return cls.from_str(parsed) class ClusterList(Formatable): From 355b6a14a0e13bcc553afd6b1405f383ab0e6096 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 21 Nov 2022 15:32:17 +0000 Subject: [PATCH 22/23] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/lightning_app/cli/cmd_clusters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning_app/cli/cmd_clusters.py b/src/lightning_app/cli/cmd_clusters.py index 273beaeabc7c5..01175a3202343 100644 --- a/src/lightning_app/cli/cmd_clusters.py +++ b/src/lightning_app/cli/cmd_clusters.py @@ -2,7 +2,6 @@ import re import time from datetime import datetime -from lightning_utilities.core.enums import StrEnum from textwrap import dedent from typing import Any, List, Union @@ -20,6 +19,7 @@ V1GetClusterResponse, V1KubernetesClusterDriver, ) +from lightning_utilities.core.enums import StrEnum from rich.console import Console from rich.table import Table from rich.text import Text From 42a6b4a7d5e453718f8682345ccfecd9f7bd0ced Mon Sep 17 00:00:00 2001 From: Luca Furst Date: Mon, 28 Nov 2022 10:08:14 -0500 Subject: [PATCH 23/23] Rename failed to error --- src/lightning_app/cli/cmd_clusters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning_app/cli/cmd_clusters.py b/src/lightning_app/cli/cmd_clusters.py index 01175a3202343..c1baf11c5e273 100644 --- a/src/lightning_app/cli/cmd_clusters.py +++ b/src/lightning_app/cli/cmd_clusters.py @@ -36,7 +36,7 @@ class ClusterState(StrEnum): QUEUED = "queued" PENDING = "pending" RUNNING = "running" - FAILED = "failed" + FAILED = "error" DELETED = "deleted" def __str__(self) -> str: