From c5d3bba7643645c7d319ea7c8f865e4069390ce0 Mon Sep 17 00:00:00 2001 From: Luca Furst Date: Wed, 30 Nov 2022 11:33:57 -0500 Subject: [PATCH] [App] Raise error when launching app on multiple clusters (#15484) * Error when running on multiple clusters * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Revert this in separate PR: keep this focused * Improve testing * fixup! Improve testing * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * pass flake8 * Update changelog * Address PR feedback * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove unused import * Reword error message * Error if running on cluster that doesn't exist * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixup! Error if running on cluster that doesn't exist * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Remove unsued import Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com> --- src/lightning_app/CHANGELOG.md | 2 + src/lightning_app/runners/cloud.py | 22 +++-- tests/tests_app/runners/test_cloud.py | 120 ++++++++++++++++++-------- 3 files changed, 101 insertions(+), 43 deletions(-) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index cf115dd5e285f..59acf63a1fee0 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -94,6 +94,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ## [1.8.1] - 2022-11-10 +- Fixed bug when launching apps on multiple clusters ([#15484](https://github.com/Lightning-AI/lightning/pull/15484)) + ### Added diff --git a/src/lightning_app/runners/cloud.py b/src/lightning_app/runners/cloud.py index 0752a3b5be8a8..d3613975f6076 100644 --- a/src/lightning_app/runners/cloud.py +++ b/src/lightning_app/runners/cloud.py @@ -6,15 +6,16 @@ import time from dataclasses import dataclass from pathlib import Path +from textwrap import dedent from typing import Any, Callable, List, Optional, Union -import click from lightning_cloud.openapi import ( Body3, Body4, Body7, Body8, Body9, + Externalv1LightningappInstance, Gridv1ImageSpec, V1BuildSpec, V1DependencyFileInfo, @@ -336,6 +337,7 @@ def dispatch( elif CLOUD_QUEUE_TYPE == "redis": queue_server_type = V1QueueServerType.REDIS + existing_instance: Optional[Externalv1LightningappInstance] = None if find_instances_resp.lightningapps: existing_instance = find_instances_resp.lightningapps[0] @@ -374,12 +376,20 @@ def dispatch( f"Your app last ran on cluster {app_config.cluster_id}, but that cluster " "doesn't exist anymore." ) - click.confirm( - f"{msg} Do you want to run on Lightning Cloud instead?", - abort=True, - default=True, + raise ValueError(msg) + if existing_instance and existing_instance.spec.cluster_id != app_config.cluster_id: + raise ValueError( + dedent( + f"""\ + An app names {app_config.name} is already running on cluster {existing_instance.spec.cluster_id}, and you requested it to run on cluster {app_config.cluster_id}. + + In order to proceed, please either: + a. rename the app to run on {app_config.cluster_id} with the --name option + lightning run app {app_entrypoint_file} --name (new name) --cloud --cluster-id {app_config.cluster_id} + b. delete the app running on {existing_instance.spec.cluster_id} in the UI before running this command. + """ # noqa: E501 + ) ) - app_config.cluster_id = None if app_config.cluster_id is not None: self._ensure_cluster_project_binding(project.project_id, app_config.cluster_id) diff --git a/tests/tests_app/runners/test_cloud.py b/tests/tests_app/runners/test_cloud.py index d1a6f9daaffe1..1f525341224eb 100644 --- a/tests/tests_app/runners/test_cloud.py +++ b/tests/tests_app/runners/test_cloud.py @@ -2,6 +2,7 @@ import os import re import sys +from contextlib import nullcontext as does_not_raise from copy import copy from pathlib import Path from unittest import mock @@ -101,32 +102,100 @@ def get_cloud_runtime_request_body(**kwargs) -> "Body8": return Body8(**default_request_body) +@pytest.fixture +def cloud_backend(monkeypatch): + cloud_backend = mock.MagicMock() + monkeypatch.setattr(cloud, "LocalSourceCodeDir", mock.MagicMock()) + monkeypatch.setattr(cloud, "_prepare_lightning_wheels_and_requirements", mock.MagicMock()) + monkeypatch.setattr(backends, "CloudBackend", mock.MagicMock(return_value=cloud_backend)) + return cloud_backend + + +@pytest.fixture +def project_id(): + return "test-project-id" + + +DEFAULT_CLUSTER = "litng-ai-03" + + class TestAppCreationClient: """Testing the calls made using GridRestClient to create the app.""" + def test_run_on_deleted_cluster(self, cloud_backend): + app_name = "test-app" + + mock_client = mock.MagicMock() + mock_client.projects_service_list_memberships.return_value = V1ListMembershipsResponse( + memberships=[V1Membership(name="Default Project", project_id=project_id)] + ) + + mock_client.cluster_service_list_clusters.return_value = V1ListClustersResponse( + [ + Externalv1Cluster(id=DEFAULT_CLUSTER), + ] + ) + cloud_backend.client = mock_client + + app = mock.MagicMock() + app.flows = [] + app.frontend = {} + + existing_instance = MagicMock() + existing_instance.status.phase = V1LightningappInstanceState.STOPPED + existing_instance.spec.cluster_id = DEFAULT_CLUSTER + mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = ( + V1ListLightningappInstancesResponse(lightningapps=[existing_instance]) + ) + + cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file="entrypoint.py") + cloud_runtime._check_uploaded_folder = mock.MagicMock() + + with pytest.raises(ValueError, match="that cluster doesn't exist"): + cloud_runtime.dispatch(name=app_name, cluster_id="unknown-cluster") + # TODO: remove this test once there is support for multiple instances - @mock.patch("lightning_app.runners.backends.cloud.LightningClient", mock.MagicMock()) - def test_new_instance_on_different_cluster(self, monkeypatch): - app_name = "test-app-name" - original_cluster = "cluster-001" - new_cluster = "cluster-002" + @pytest.mark.parametrize( + "old_cluster,new_cluster,expected_raise", + [ + ( + "test", + "other", + pytest.raises( + ValueError, + match="already running on cluster", + ), + ), + ("test", "test", does_not_raise()), + (None, None, does_not_raise()), + (None, "litng-ai-03", does_not_raise()), + ("litng-ai-03", None, does_not_raise()), + ], + ) + def test_new_instance_on_different_cluster( + self, cloud_backend, project_id, old_cluster, new_cluster, expected_raise + ): + app_name = "test-app" mock_client = mock.MagicMock() mock_client.projects_service_list_memberships.return_value = V1ListMembershipsResponse( - memberships=[V1Membership(name="Default Project", project_id="default-project-id")] + memberships=[V1Membership(name="Default Project", project_id=project_id)] ) mock_client.lightningapp_v2_service_create_lightningapp_release.return_value = V1LightningappRelease( cluster_id=new_cluster ) + + # Note: + # backend converts "None" cluster to "litng-ai-03" + # dispatch should receive None, but API calls should return "litng-ai-03" mock_client.cluster_service_list_clusters.return_value = V1ListClustersResponse( - [Externalv1Cluster(id=original_cluster), Externalv1Cluster(id=new_cluster)] + [ + Externalv1Cluster(id=old_cluster or DEFAULT_CLUSTER), + Externalv1Cluster(id=new_cluster or DEFAULT_CLUSTER), + ] ) - cloud_backend = mock.MagicMock() cloud_backend.client = mock_client - monkeypatch.setattr(cloud, "LocalSourceCodeDir", mock.MagicMock()) - monkeypatch.setattr(cloud, "_prepare_lightning_wheels_and_requirements", mock.MagicMock()) - monkeypatch.setattr(backends, "CloudBackend", mock.MagicMock(return_value=cloud_backend)) app = mock.MagicMock() app.flows = [] @@ -134,7 +203,7 @@ def test_new_instance_on_different_cluster(self, monkeypatch): existing_instance = MagicMock() existing_instance.status.phase = V1LightningappInstanceState.STOPPED - existing_instance.spec.cluster_id = original_cluster + existing_instance.spec.cluster_id = old_cluster or DEFAULT_CLUSTER mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = ( V1ListLightningappInstancesResponse(lightningapps=[existing_instance]) ) @@ -142,34 +211,11 @@ def test_new_instance_on_different_cluster(self, monkeypatch): cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file="entrypoint.py") cloud_runtime._check_uploaded_folder = mock.MagicMock() - # without requirements file - # setting is_file to False so requirements.txt existence check will return False - monkeypatch.setattr(Path, "is_file", lambda *args, **kwargs: False) - monkeypatch.setattr(cloud, "Path", Path) - # This is the main assertion: # we have an existing instance on `cluster-001` # but we want to run this app on `cluster-002` - cloud_runtime.dispatch(name=app_name, cluster_id=new_cluster) - - body = Body8( - cluster_id=new_cluster, - app_entrypoint_file=mock.ANY, - enable_app_server=True, - flow_servers=[], - image_spec=None, - works=[], - local_source=True, - dependency_cache_key=mock.ANY, - user_requested_flow_compute_config=mock.ANY, - ) - cloud_runtime.backend.client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with( - project_id="default-project-id", app_id=mock.ANY, body=body - ) - cloud_runtime.backend.client.projects_service_create_project_cluster_binding.assert_called_once_with( - project_id="default-project-id", - body=V1ProjectClusterBinding(cluster_id=new_cluster, project_id="default-project-id"), - ) + with expected_raise: + cloud_runtime.dispatch(name=app_name, cluster_id=new_cluster) @pytest.mark.parametrize("flow_cloud_compute", [None, CloudCompute(name="t2.medium")]) @mock.patch("lightning_app.runners.backends.cloud.LightningClient", mock.MagicMock())