Skip to content

Commit

Permalink
Wait by default and notify on state changes
Browse files Browse the repository at this point in the history
  • Loading branch information
luca3rd authored and nmiculinic committed Nov 4, 2022
1 parent 921dc1c commit d3215ad
Show file tree
Hide file tree
Showing 7 changed files with 214 additions and 30 deletions.
8 changes: 2 additions & 6 deletions docs/source-app/workflows/byoc/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ Parameters
^^^^^^^^^^

+------------------------+----------------------------------------------------------------------------------------------------+
|Parameter | Descritption |
|Parameter | Description |
+========================+====================================================================================================+
| provider | The cloud provider where your cluster is located. |
| | |
Expand All @@ -78,18 +78,14 @@ Parameters
+------------------------+----------------------------------------------------------------------------------------------------+
| region | AWS region containing compute resources |
+------------------------+----------------------------------------------------------------------------------------------------+
| instance-types | Instance types that you want to support, for computer jobs within the cluster. |
| | |
| | For now, this is the AWS instance types supported by the cluster. |
+------------------------+----------------------------------------------------------------------------------------------------+
| enable-performance | Specifies if the cluster uses cost savings mode. |
| | |
| | In cost saving mode the number of compute nodes is reduced to one, reducing the cost for clusters |
| | with low utilization. |
+------------------------+----------------------------------------------------------------------------------------------------+
| edit-before-creation | Enables interactive editing of requests before submitting it to Lightning AI. |
+------------------------+----------------------------------------------------------------------------------------------------+
| wait | Waits for the cluster to be in a RUNNING state. Only use this for debugging. |
| no-wait | Cluster creation will happen in the background. |
+------------------------+----------------------------------------------------------------------------------------------------+

----
Expand Down
1 change: 1 addition & 0 deletions src/lightning_app/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

### Changed

- Cluster creation and deletion now waits by default [#15458](https://github.com/Lightning-AI/lightning/pull/15458)
- Changed the `flow.flows` to be recursive wont to align the behavior with the `flow.works` ([#15466](https://github.com/Lightning-AI/lightning/pull/15466))

-
Expand Down
87 changes: 82 additions & 5 deletions src/lightning_app/cli/cmd_clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import re
import time
from datetime import datetime
from enum import Enum
from textwrap import dedent
from typing import Any, List

Expand Down Expand Up @@ -29,6 +30,23 @@
MAX_CLUSTER_WAIT_TIME = 5400


class ClusterState(Enum):
UNSPECIFIED = "unspecified"
QUEUED = "queued"
PENDING = "pending"
RUNNING = "running"
FAILED = "failed"
DELETED = "deleted"

def __str__(self) -> str:
return str(self.value)

@classmethod
def from_api(cls, status: V1ClusterState) -> "ClusterState":
parsed = str(status).lower().split("_", maxsplit=2)[-1]
return cls(parsed)


class ClusterList(Formatable):
def __init__(self, clusters: List[Externalv1Cluster]):
self.clusters = clusters
Expand Down Expand Up @@ -130,9 +148,6 @@ def create(
click.echo("cluster unchanged")

resp = self.api_client.cluster_service_create_cluster(body=new_body)
if wait:
_wait_for_cluster_state(self.api_client, resp.id, V1ClusterState.RUNNING)

click.echo(
dedent(
f"""\
Expand All @@ -146,6 +161,13 @@ def create(
"""
)
)
if wait:
click.echo("Waiting for cluster to enter state running...")
click.echo(
"Canceling this operation will NOT stop the cluster from creating"
f"(use `lightning delete cluster {resp.id}`)"
)
_wait_for_cluster_state(self.api_client, resp.id, V1ClusterState.RUNNING)

def get_clusters(self) -> ClusterList:
resp = self.api_client.cluster_service_list_clusters(phase_not_in=[V1ClusterState.DELETED])
Expand All @@ -171,6 +193,8 @@ def delete(self, cluster_id: str, force: bool = False, wait: bool = False) -> No
click.echo("Cluster deletion triggered successfully")

if wait:
click.echo("Waiting for cluster to delete...")
click.echo("Canceling the operation will NOT stop the cluster from deleting")
_wait_for_cluster_state(self.api_client, cluster_id, V1ClusterState.DELETED)


Expand All @@ -183,6 +207,8 @@ def _wait_for_cluster_state(
) -> None:
"""_wait_for_cluster_state waits until the provided cluster has reached a desired state, or failed.
Messages will be displayed to the user as the cluster changes state.
Args:
api_client: LightningClient used for polling
cluster_id: Specifies the cluster to wait for
Expand All @@ -192,6 +218,7 @@ def _wait_for_cluster_state(
"""
start = time.time()
elapsed = 0

while elapsed < max_wait_time:
cluster_resp = api_client.cluster_service_list_clusters()
new_cluster = None
Expand All @@ -200,10 +227,14 @@ def _wait_for_cluster_state(
new_cluster = clust
break
if new_cluster is not None:
echo_cluster_status_long(
cluster_id=cluster_id,
current_state=new_cluster.status.phase,
current_reason=new_cluster.status.reason,
desired_state=target_state,
)
if new_cluster.status.phase == target_state:
break
elif new_cluster.status.phase == V1ClusterState.FAILED:
raise click.ClickException(f"Cluster {cluster_id} is in failed state.")
time.sleep(check_timeout)
elapsed = int(time.time() - start)
else:
Expand All @@ -219,3 +250,49 @@ def _check_cluster_name_is_valid(_ctx: Any, _param: Any, value: str) -> str:
Provide a cluster name using valid characters and try again."""
)
return value


def echo_cluster_status_long(
cluster_id: str,
current_state: V1ClusterState,
current_reason: str,
desired_state: V1ClusterState,
) -> None:
"""Echos a long-form status message to the user about the cluster state.
Args:
cluster_id: The name of the cluster
current_state: The cluster's current state
reason: The reason for the cluster's state
"""

state_str = ClusterState.from_api(current_state)

message = f"Cluster {cluster_id} is now {state_str}" + (
f" with the following reason: {current_reason}" if current_reason else ""
)

if current_state == V1ClusterState.RUNNING and desired_state == V1ClusterState.RUNNING:
message = "\n".join(
[
f"Cluster {cluster_id} is now running and ready to use.",
f"To launch an app on this cluster use `lightning run app app.py --cloud --cluster-id {cluster_id}`",
]
)
if current_state == V1ClusterState.RUNNING and desired_state == V1ClusterState.DELETED:
message = f"Cluster {cluster_id} is terminating"
if current_state == V1ClusterState.FAILED:
message = "\n".join(
[
message,
"We are automatically retrying cluster creation.",
"In case you want to delete this cluster:",
"1. Stop this command",
f"2. Run `lightning delete cluster {cluster_id}",
"WARNING: Any non-deleted cluster can consume cloud resources and incur cost to you.",
]
)
if current_state == V1ClusterState.DELETED:
message = f"Cluster {cluster_id} has been deleted."

click.echo(message)
12 changes: 7 additions & 5 deletions src/lightning_app/cli/lightning_cli_create.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def create() -> None:
type=bool,
required=False,
default=False,
hidden=True,
is_flag=True,
help=""""Use this flag to ensure that the cluster is created with a profile that is optimized for performance.
This makes runs more expensive but start-up times decrease.""",
Expand All @@ -41,16 +42,17 @@ def create() -> None:
"--edit-before-creation",
default=False,
is_flag=True,
hidden=True,
help="Edit the cluster specs before submitting them to the API server.",
)
@click.option(
"--wait",
"wait",
"--async",
"no_wait",
type=bool,
required=False,
default=False,
is_flag=True,
help="Enabling this flag makes the CLI wait until the cluster is running.",
help="This flag makes the CLI return immediately and lets the cluster creation happen in the background.",
)
def create_cluster(
cluster_name: str,
Expand All @@ -60,7 +62,7 @@ def create_cluster(
provider: str,
edit_before_creation: bool,
enable_performance: bool,
wait: bool,
no_wait: bool,
**kwargs: Any,
) -> None:
"""Create a Lightning AI BYOC compute cluster with your cloud provider credentials."""
Expand All @@ -75,5 +77,5 @@ def create_cluster(
external_id=external_id,
edit_before_creation=edit_before_creation,
cost_savings=not enable_performance,
wait=wait,
wait=not no_wait,
)
10 changes: 5 additions & 5 deletions src/lightning_app/cli/lightning_cli_delete.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,15 +24,15 @@ def delete() -> None:
WARNING: You should NOT use this under normal circumstances.""",
)
@click.option(
"--wait",
"wait",
"--no-wait",
"no_wait",
type=bool,
required=False,
default=False,
is_flag=True,
help="Enabling this flag makes the CLI wait until the cluster is deleted.",
help="This flag makes the CLI return immediately and lets the cluster deletion happen in the background",
)
def delete_cluster(cluster: str, force: bool = False, wait: bool = False) -> None:
def delete_cluster(cluster: str, force: bool = False, no_wait: bool = False) -> None:
"""Delete a Lightning AI BYOC compute cluster and all associated cloud provider resources.
Deleting a run also deletes all Runs and Experiments that were started on the cluster.
Expand All @@ -46,4 +46,4 @@ def delete_cluster(cluster: str, force: bool = False, wait: bool = False) -> Non
All object stores, container registries, logs, compute nodes, volumes, etc. are deleted and cannot be recovered.
"""
cluster_manager = AWSClusterManager()
cluster_manager.delete(cluster_id=cluster, force=force, wait=wait)
cluster_manager.delete(cluster_id=cluster, force=force, wait=not no_wait)
4 changes: 2 additions & 2 deletions tests/tests_app/cli/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def test_create_cluster(create_command: mock.MagicMock, extra_arguments, expecte
external_id="dummy",
edit_before_creation=False,
cost_savings=expected_cost_savings_mode,
wait=False,
wait=True,
)


Expand Down Expand Up @@ -158,7 +158,7 @@ def test_delete_cluster(delete: mock.MagicMock):
runner = CliRunner()
runner.invoke(delete_cluster, ["test-7"])

delete.assert_called_once_with(cluster_id="test-7", force=False, wait=False)
delete.assert_called_once_with(cluster_id="test-7", force=False, wait=True)


@mock.patch("lightning_app.utilities.login.Auth._run_server")
Expand Down
Loading

0 comments on commit d3215ad

Please sign in to comment.