From 5c05719f27b160a7f6db9345542182986d4feb27 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 8 Aug 2022 08:15:54 +0200 Subject: [PATCH 01/59] Freeze requirements for CI (#14007) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * free requirements * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * typo * typo * ui * mypy * todo * mypy Co-authored-by: Carlos Mocholí * mypy Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Carlos Mocholí Co-authored-by: Akihiro Nitta --- .github/workflows/code-checks.yml | 1 + requirements/app/base.txt | 7 +++---- requirements/app/cloud.txt | 3 +-- requirements/app/docs.txt | 17 ++++++++--------- requirements/app/test.txt | 15 +++++---------- requirements/app/ui.txt | 2 +- requirements/pytorch/docs.txt | 16 ++++++++-------- requirements/pytorch/test.txt | 27 +++++++++++++-------------- 8 files changed, 40 insertions(+), 48 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index ed9cd46adbe44..7b5f3f26602e8 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -34,6 +34,7 @@ jobs: run: | pip install torch==1.11 --find-links https://download.pytorch.org/whl/cpu/torch_stable.html python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt + # todo: adjust requirements for both code-bases pip install -r requirements/pytorch/devel.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html pip list diff --git a/requirements/app/base.txt b/requirements/app/base.txt index 0a0b9cdb4719d..02eeb04bfa218 100644 --- a/requirements/app/base.txt +++ b/requirements/app/base.txt @@ -1,9 +1,8 @@ -py lightning-cloud==0.5.0 packaging -deepdiff>=5.7.0 +deepdiff>=5.7.0, <=5.8.1 starsessions -fsspec>=2022.01.0 -s3fs>=2022.1.0 +fsspec>=2022.01.0, <=2022.7.1 +s3fs>=2022.1.0, <=2022.7.1 
croniter # for now until we found something more robust. traitlets<5.2.0 # Traitlets 5.2.X fails: https://github.com/ipython/traitlets/issues/741 diff --git a/requirements/app/cloud.txt b/requirements/app/cloud.txt index 5f8bf0c48692f..ff18d47b44565 100644 --- a/requirements/app/cloud.txt +++ b/requirements/app/cloud.txt @@ -1,5 +1,4 @@ starsessions redis>=4.0.0, <=4.2.4 docker==5.0.3 -setuptools==59.5.0 -s3fs==2022.1.0 +# setuptools==59.5.0 diff --git a/requirements/app/docs.txt b/requirements/app/docs.txt index b35cc585b40c7..bf22aef2c2d92 100644 --- a/requirements/app/docs.txt +++ b/requirements/app/docs.txt @@ -1,18 +1,17 @@ sphinx>=4.0,<5.0 -myst-parser>=0.15 -nbsphinx>=0.8.5 +myst-parser>=0.15,<0.17 +nbsphinx>=0.8.5, <=0.8.9 ipython[notebook] ipython_genutils -pandoc>=1.0 -docutils>=0.16 -sphinxcontrib-fulltoc>=1.0 +pandoc>=1.0, <=2.2 +docutils>=0.16, <0.19 +sphinxcontrib-fulltoc>=1.0, <=1.2.0 sphinxcontrib-mockautodoc https://storage.googleapis.com/grid-packages/lightning-ai-sphinx-theme/build-31.3.zip sphinx-autodoc-typehints>=1.0,<1.15 # v1.15 failing on master (#11405) -sphinx-paramlinks>=0.5.1 -sphinx-togglebutton>=0.2 -sphinx-copybutton>=0.3 +sphinx-paramlinks>=0.5.1, <=0.5.4 +sphinx-togglebutton>=0.2, <=0.3.2 +sphinx-copybutton>=0.3, <=0.5.0 sphinx-autobuild -typing-extensions # already in `requirements.txt` but the docs CI job does not install it jinja2>=3.0.0,<3.1.0 diff --git a/requirements/app/test.txt b/requirements/app/test.txt index 9d2ed0af910ca..ab5ef8f1e85ac 100644 --- a/requirements/app/test.txt +++ b/requirements/app/test.txt @@ -1,15 +1,10 @@ -coverage>=5.0 -codecov>=2.1 -pytest>=5.0 -pytest-timeout -pytest-cov +coverage>=6.4, <=6.4.2 +codecov>=2.1, <=2.1.12 +pytest>=7.0, <=7.1.2 +pytest-timeout <=2.1.0 +pytest-cov <=3.0.0 playwright==1.22.0 # pytest-flake8 -flake8>=3.0 -check-manifest -twine>=3.2 -isort>=5.0 -mypy>=0.720 httpx trio pympler diff --git a/requirements/app/ui.txt b/requirements/app/ui.txt index 28df7f9c2ffe0..f0e4b2cdef471 
100644 --- a/requirements/app/ui.txt +++ b/requirements/app/ui.txt @@ -1 +1 @@ -streamlit>=1.3.1 +streamlit>=1.3.1, <=1.11.1 diff --git a/requirements/pytorch/docs.txt b/requirements/pytorch/docs.txt index e6fbbe322b6bf..50e7c2049f6f6 100644 --- a/requirements/pytorch/docs.txt +++ b/requirements/pytorch/docs.txt @@ -1,16 +1,16 @@ sphinx>=4.0,<5.0 myst-parser>=0.15,<0.17 -nbsphinx>=0.8.5 +nbsphinx>=0.8.5, <=0.8.9 ipython[notebook] -pandoc>=1.0 -docutils>=0.16 -sphinxcontrib-fulltoc>=1.0 +pandoc>=1.0, <=2.2 +docutils>=0.16, <0.19 +sphinxcontrib-fulltoc>=1.0, <=1.2.0 sphinxcontrib-mockautodoc pt-lightning-sphinx-theme @ https://github.com/Lightning-AI/lightning_sphinx_theme/archive/master.zip -sphinx-autodoc-typehints>=1.11,<1.15 # v1.15 failing on master (#11405) -sphinx-paramlinks>=0.5.1 -sphinx-togglebutton>=0.2 -sphinx-copybutton>=0.3 +sphinx-autodoc-typehints>=1.11,<1.15 # strict; v1.15 failing on master (#11405) +sphinx-paramlinks>=0.5.1, <=0.5.4 +sphinx-togglebutton>=0.2, <=0.3.2 +sphinx-copybutton>=0.3, <=0.5.0 typing-extensions # already in `requirements.txt` but the docs CI job does not install it jinja2>=3.0.0,<3.1.0 diff --git a/requirements/pytorch/test.txt b/requirements/pytorch/test.txt index ce54cd087b1de..c155400a3d35f 100644 --- a/requirements/pytorch/test.txt +++ b/requirements/pytorch/test.txt @@ -1,18 +1,17 @@ -coverage>=6.4 -codecov>=2.1 -pytest>=7.0 -pytest-cov -pytest-forked +coverage>=6.4, <=6.4.2 +codecov>=2.1, <=2.1.12 +pytest>=7.0, <=7.1.2 +pytest-cov <=3.0.0 +pytest-forked <=1.4.0 pytest-rerunfailures>=10.2 -mypy>=0.920 -flake8>=3.9.2 pre-commit>=1.0 +mypy==0.971 # needed in tests -cloudpickle>=1.3 -scikit-learn>0.22.1 -onnxruntime -psutil # for `DeviceStatsMonitor` -pandas # needed in benchmarks -fastapi -uvicorn +cloudpickle>=1.3, <=2.1.0 +scikit-learn>0.22.1, <=1.1.1 +onnxruntime<=1.12.0 +psutil<=5.9.1 # for `DeviceStatsMonitor` +pandas>1.0, <=1.4.3 # needed in benchmarks +fastapi<=0.79.0 +uvicorn<=0.18.2 From 
76836a33cdfa63e2c85c6f4ea9b2a1f174c973e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 8 Aug 2022 10:06:41 +0200 Subject: [PATCH 02/59] Run mypy with PyTorch 1.12 (#14044) --- .github/workflows/code-checks.yml | 2 +- pyproject.toml | 1 - .../plugins/precision/fully_sharded_native_amp.py | 2 +- .../strategies/fully_sharded_native.py | 2 +- .../strategies/launchers/multiprocessing.py | 2 +- src/pytorch_lightning/utilities/cloud_io.py | 11 ++++++----- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 7b5f3f26602e8..15bd5e9911740 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -32,7 +32,7 @@ jobs: - name: Install dependencies run: | - pip install torch==1.11 --find-links https://download.pytorch.org/whl/cpu/torch_stable.html + pip install torch==1.12 --find-links https://download.pytorch.org/whl/cpu/torch_stable.html python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt # todo: adjust requirements for both code-bases pip install -r requirements/pytorch/devel.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html diff --git a/pyproject.toml b/pyproject.toml index 5473e73c52e19..9b8400ba27577 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,6 @@ module = [ "pytorch_lightning.callbacks.progress.rich_progress", "pytorch_lightning.callbacks.quantization", "pytorch_lightning.core.datamodule", - "pytorch_lightning.core.decorators", "pytorch_lightning.core.module", "pytorch_lightning.core.saving", "pytorch_lightning.demos.boring_classes", diff --git a/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py b/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py index 8c693f2975bbd..60e53b880c84d 100644 --- a/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py +++ 
b/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py @@ -23,7 +23,7 @@ if _TORCH_GREATER_EQUAL_1_12: from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision else: - MixedPrecision = None + MixedPrecision = None # type: ignore[misc,assignment] class FullyShardedNativeMixedPrecisionPlugin(ShardedNativeMixedPrecisionPlugin): diff --git a/src/pytorch_lightning/strategies/fully_sharded_native.py b/src/pytorch_lightning/strategies/fully_sharded_native.py index 4c351f26fa3b9..d92931fb5cdb2 100644 --- a/src/pytorch_lightning/strategies/fully_sharded_native.py +++ b/src/pytorch_lightning/strategies/fully_sharded_native.py @@ -51,7 +51,7 @@ ) from torch.distributed.fsdp.wrap import enable_wrap else: - MixedPrecision = None + MixedPrecision = None # type: ignore[misc,assignment] BackwardPrefetch = None # type: ignore[misc,assignment] CPUOffload = None # type: ignore[misc,assignment] diff --git a/src/pytorch_lightning/strategies/launchers/multiprocessing.py b/src/pytorch_lightning/strategies/launchers/multiprocessing.py index 39bba092e9c60..2617e5fe27b10 100644 --- a/src/pytorch_lightning/strategies/launchers/multiprocessing.py +++ b/src/pytorch_lightning/strategies/launchers/multiprocessing.py @@ -144,7 +144,7 @@ def _recover_results_in_main_process(self, worker_output: "_WorkerOutput", train # load last weights if worker_output.weights_path is not None: ckpt = self._strategy.checkpoint_io.load_checkpoint(worker_output.weights_path) - trainer.lightning_module.load_state_dict(ckpt) # type: ignore[arg-type] + trainer.lightning_module.load_state_dict(ckpt) self._strategy.checkpoint_io.remove_checkpoint(worker_output.weights_path) trainer.state = worker_output.trainer_state diff --git a/src/pytorch_lightning/utilities/cloud_io.py b/src/pytorch_lightning/utilities/cloud_io.py index 81482a8ab24f9..ee3358be59541 100644 --- a/src/pytorch_lightning/utilities/cloud_io.py +++ b/src/pytorch_lightning/utilities/cloud_io.py @@ -22,14 +22,12 @@ from 
fsspec.core import url_to_fs from fsspec.implementations.local import AbstractFileSystem -from pytorch_lightning.utilities.types import _PATH +from pytorch_lightning.utilities.types import _DEVICE, _PATH def load( path_or_url: Union[IO, _PATH], - map_location: Optional[ - Union[str, Callable, torch.device, Dict[Union[str, torch.device], Union[str, torch.device]]] - ] = None, + map_location: Optional[Union[_DEVICE, Callable[[_DEVICE], _DEVICE], Dict[_DEVICE, _DEVICE]]] = None, ) -> Any: """Loads a checkpoint. @@ -41,7 +39,10 @@ def load( # any sort of BytesIO or similar return torch.load(path_or_url, map_location=map_location) if str(path_or_url).startswith("http"): - return torch.hub.load_state_dict_from_url(str(path_or_url), map_location=map_location) + return torch.hub.load_state_dict_from_url( + str(path_or_url), + map_location=map_location, # type: ignore[arg-type] # upstream annotation is not correct + ) fs = get_filesystem(path_or_url) with fs.open(path_or_url, "rb") as f: return torch.load(f, map_location=map_location) From aaeff90254aa0a1b91aaed759d15e66123533618 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 8 Aug 2022 10:07:54 +0200 Subject: [PATCH 03/59] Remove deprecated `DistributedType` and `DeviceType` enum classes (#14045) --- src/pytorch_lightning/CHANGELOG.md | 8 +- src/pytorch_lightning/utilities/__init__.py | 1 - src/pytorch_lightning/utilities/enums.py | 95 +------------------ .../deprecated_api/test_remove_1-8.py | 13 --- 4 files changed, 8 insertions(+), 109 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 5d77a3ad293b9..565ef0e8438b5 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -30,7 +30,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Deprecated `amp_level` from `Trainer` in favour of passing it explictly via precision plugin ([#13898](https://github.com/Lightning-AI/lightning/pull/13898)) -- +- Deprecated the calls to `pytorch_lightning.utiltiies.meta` functions in favor of built-in https://github.com/pytorch/torchdistx support ([#13868](https://github.com/Lightning-AI/lightning/pull/13868)) ### Removed @@ -44,6 +44,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed the deprecated `DDP2Strategy` ([#14026](https://github.com/Lightning-AI/lightning/pull/14026)) +- Removed the deprecated `DistributedType` and `DeviceType` enum classes ([#14045](https://github.com/Lightning-AI/lightning/pull/14045)) + + +- Removed the experimental `pytorch_lightning.utiltiies.meta` functions in favor of built-in https://github.com/pytorch/torchdistx support ([#13868](https://github.com/Lightning-AI/lightning/pull/13868)) + + ### Fixed - Casted only floating point tensors to fp16 with IPUs ([#13983](https://github.com/Lightning-AI/lightning/pull/13983)) diff --git a/src/pytorch_lightning/utilities/__init__.py b/src/pytorch_lightning/utilities/__init__.py index df5084dd85490..c849ba0a05d68 100644 --- a/src/pytorch_lightning/utilities/__init__.py +++ b/src/pytorch_lightning/utilities/__init__.py @@ -21,7 +21,6 @@ _AcceleratorType, _StrategyType, AMPType, - DistributedType, GradClipAlgorithmType, LightningEnum, ) diff --git a/src/pytorch_lightning/utilities/enums.py b/src/pytorch_lightning/utilities/enums.py index e687d3f9f046b..06d616f87259f 100644 --- a/src/pytorch_lightning/utilities/enums.py +++ b/src/pytorch_lightning/utilities/enums.py @@ -15,11 +15,9 @@ from __future__ import annotations import os -from enum import Enum, EnumMeta -from typing import Any +from enum import Enum from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.warnings import rank_zero_deprecation class LightningEnum(str, Enum): @@ -43,37 
+41,6 @@ def __hash__(self) -> int: return hash(self.value.lower()) -class _DeprecatedEnumMeta(EnumMeta): - """Enum that calls `deprecate()` whenever a member is accessed. - - Adapted from: https://stackoverflow.com/a/62309159/208880 - """ - - def __getattribute__(cls, name: str) -> Any: - obj = super().__getattribute__(name) - # ignore __dunder__ names -- prevents potential recursion errors - if not (name.startswith("__") and name.endswith("__")) and isinstance(obj, Enum): - obj.deprecate() - return obj - - def __getitem__(cls, name: str) -> Any: - member: _DeprecatedEnumMeta = super().__getitem__(name) - member.deprecate() - return member - - def __call__(cls, *args: Any, **kwargs: Any) -> Any: - obj = super().__call__(*args, **kwargs) - if isinstance(obj, Enum): - obj.deprecate() - return obj - - -class _DeprecatedEnum(LightningEnum, metaclass=_DeprecatedEnumMeta): - """_DeprecatedEnum calls an enum's `deprecate()` method on member access.""" - - pass - - class AMPType(LightningEnum): """Type of Automatic Mixed Precission used for training. @@ -110,66 +77,6 @@ def supported_types() -> list[str]: return [x.value for x in PrecisionType] -class DistributedType(_DeprecatedEnum): - """Define type of training strategy. - - Deprecated since v1.6.0 and will be removed in v1.8.0. - - Use `_StrategyType` instead. 
- """ - - DP = "dp" - DDP = "ddp" - DDP_SPAWN = "ddp_spawn" - TPU_SPAWN = "tpu_spawn" - DEEPSPEED = "deepspeed" - HOROVOD = "horovod" - DDP_SHARDED = "ddp_sharded" - DDP_SHARDED_SPAWN = "ddp_sharded_spawn" - DDP_FULLY_SHARDED = "ddp_fully_sharded" - HPU_PARALLEL = "hpu_parallel" - - @staticmethod - def interactive_compatible_types() -> list[DistributedType]: - """Returns a list containing interactive compatible DistributeTypes.""" - return [ - DistributedType.DP, - DistributedType.DDP_SPAWN, - DistributedType.DDP_SHARDED_SPAWN, - DistributedType.TPU_SPAWN, - ] - - def is_interactive_compatible(self) -> bool: - """Returns whether self is interactive compatible.""" - return self in DistributedType.interactive_compatible_types() - - def deprecate(self) -> None: - rank_zero_deprecation( - "`DistributedType` Enum has been deprecated in v1.6 and will be removed in v1.8." - f" Use the string value `{self.value!r}` instead." - ) - - -class DeviceType(_DeprecatedEnum): - """Define Device type by its nature - accelerators. - - Deprecated since v1.6.0 and will be removed in v1.8.0. - - Use `_AcceleratorType` instead. - """ - - CPU = "CPU" - GPU = "GPU" - IPU = "IPU" - TPU = "TPU" - - def deprecate(self) -> None: - rank_zero_deprecation( - "`DeviceType` Enum has been deprecated in v1.6 and will be removed in v1.8." - f" Use the string value `{self.value!r}` instead." - ) - - class GradClipAlgorithmType(LightningEnum): """Define gradient_clip_algorithm types - training-tricks. NORM type means "clipping gradients by norm". This computed over all model parameters together. 
diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index aa6c1a615f9d2..91be34c55078f 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -36,7 +36,6 @@ from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.apply_func import move_data_to_device -from pytorch_lightning.utilities.enums import DeviceType, DistributedType from pytorch_lightning.utilities.imports import _TORCHTEXT_LEGACY from pytorch_lightning.utilities.rank_zero import rank_zero_only, rank_zero_warn from tests_pytorch.deprecated_api import no_deprecated_call @@ -44,18 +43,6 @@ from tests_pytorch.helpers.torchtext_utils import get_dummy_torchtext_data_iterator -def test_v1_8_0_deprecated_distributed_type_enum(): - - with pytest.deprecated_call(match="has been deprecated in v1.6 and will be removed in v1.8."): - _ = DistributedType.DDP - - -def test_v1_8_0_deprecated_device_type_enum(): - - with pytest.deprecated_call(match="has been deprecated in v1.6 and will be removed in v1.8."): - _ = DeviceType.CPU - - @pytest.mark.skipif(not _TORCHTEXT_LEGACY, reason="torchtext.legacy is deprecated.") def test_v1_8_0_deprecated_torchtext_batch(): From 355fda3702d640330fddbe25ad127879b0a7cbfa Mon Sep 17 00:00:00 2001 From: Dan Dale Date: Mon, 8 Aug 2022 01:16:53 -0700 Subject: [PATCH 04/59] Add Promoted CLI to API Reference Section (#14072) --- docs/source-pytorch/api_references.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/source-pytorch/api_references.rst b/docs/source-pytorch/api_references.rst index db4fc1e2c4cf8..8daed5ddcaf41 100644 --- a/docs/source-pytorch/api_references.rst +++ b/docs/source-pytorch/api_references.rst @@ -47,6 +47,20 @@ callbacks Timer TQDMProgressBar +cli +----- + +.. currentmodule:: pytorch_lightning.cli + +.. 
autosummary:: + :toctree: api + :nosignatures: + :template: classtemplate.rst + + LightningCLI + LightningArgumentParser + SaveConfigCallback + core ---- From 5271ed93e6823178d1698d150b7146fe7a288695 Mon Sep 17 00:00:00 2001 From: Krishna Kalyan Date: Mon, 8 Aug 2022 10:03:52 +0100 Subject: [PATCH 05/59] Fix mypy errors attributed to `pytorch_lightning.trainer.connectors.callback_connector.py` (#13750) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Apply suggestions from code review Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: otaj <6065855+otaj@users.noreply.github.com> Co-authored-by: Adrian Wälchli Co-authored-by: Rohit Gupta --- pyproject.toml | 1 - .../trainer/connectors/callback_connector.py | 26 +++++++++++-------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9b8400ba27577..2f0e290440f44 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,7 +63,6 @@ module = [ "pytorch_lightning.strategies.sharded", "pytorch_lightning.strategies.sharded_spawn", "pytorch_lightning.trainer.callback_hook", - "pytorch_lightning.trainer.connectors.callback_connector", "pytorch_lightning.trainer.connectors.data_connector", "pytorch_lightning.trainer.supporters", "pytorch_lightning.trainer.trainer", diff --git a/src/pytorch_lightning/trainer/connectors/callback_connector.py b/src/pytorch_lightning/trainer/connectors/callback_connector.py index 83881905beeb1..bb7f912420256 100644 --- a/src/pytorch_lightning/trainer/connectors/callback_connector.py +++ b/src/pytorch_lightning/trainer/connectors/callback_connector.py @@ -17,6 +17,7 @@ from datetime import timedelta from typing import Dict, List, Optional, Sequence, Union +import pytorch_lightning as pl from pytorch_lightning.callbacks import ( Callback, Checkpoint, @@ -37,7 +38,7 @@ class CallbackConnector: - def __init__(self, trainer): + def __init__(self, trainer: 
"pl.Trainer"): self.trainer = trainer def on_trainer_init( @@ -50,7 +51,7 @@ def on_trainer_init( enable_model_summary: bool, max_time: Optional[Union[str, timedelta, Dict[str, int]]] = None, accumulate_grad_batches: Optional[Union[int, Dict[int, int]]] = None, - ): + ) -> None: # init folder paths for checkpoint + weights save callbacks self.trainer._default_root_dir = default_root_dir or os.getcwd() if weights_save_path: @@ -95,16 +96,18 @@ def on_trainer_init( def _configure_accumulated_gradients( self, accumulate_grad_batches: Optional[Union[int, Dict[int, int]]] = None ) -> None: - grad_accum_callback = [cb for cb in self.trainer.callbacks if isinstance(cb, GradientAccumulationScheduler)] + grad_accum_callbacks: List[GradientAccumulationScheduler] = [ + cb for cb in self.trainer.callbacks if isinstance(cb, GradientAccumulationScheduler) + ] - if grad_accum_callback: + if grad_accum_callbacks: if accumulate_grad_batches is not None: raise MisconfigurationException( "You have set both `accumulate_grad_batches` and passed an instance of " "`GradientAccumulationScheduler` inside callbacks. Either remove `accumulate_grad_batches` " "from trainer or remove `GradientAccumulationScheduler` from callbacks list." 
) - grad_accum_callback = grad_accum_callback[0] + grad_accum_callback = grad_accum_callbacks[0] else: if accumulate_grad_batches is None: accumulate_grad_batches = 1 @@ -148,6 +151,7 @@ def _configure_model_summary_callback(self, enable_model_summary: bool) -> None: progress_bar_callback = self.trainer.progress_bar_callback is_progress_bar_rich = isinstance(progress_bar_callback, RichProgressBar) + model_summary: ModelSummary if progress_bar_callback is not None and is_progress_bar_rich: model_summary = RichModelSummary() else: @@ -188,7 +192,7 @@ def _configure_timer_callback(self, max_time: Optional[Union[str, timedelta, Dic timer = Timer(duration=max_time, interval="step") self.trainer.callbacks.append(timer) - def _configure_fault_tolerance_callbacks(self): + def _configure_fault_tolerance_callbacks(self) -> None: from pytorch_lightning.callbacks.fault_tolerance import _FaultToleranceCheckpoint if any(isinstance(cb, _FaultToleranceCheckpoint) for cb in self.trainer.callbacks): @@ -196,7 +200,7 @@ def _configure_fault_tolerance_callbacks(self): # don't use `log_dir` to minimize the chances of failure self.trainer.callbacks.append(_FaultToleranceCheckpoint(dirpath=self.trainer.default_root_dir)) - def _attach_model_logging_functions(self): + def _attach_model_logging_functions(self) -> None: lightning_module = self.trainer.lightning_module for callback in self.trainer.callbacks: callback.log = lightning_module.log @@ -243,7 +247,7 @@ def _reorder_callbacks(callbacks: List[Callback]) -> List[Callback]: A new list in which the last elements are Checkpoint if there were any present in the input. 
""" - checkpoints = [c for c in callbacks if isinstance(c, Checkpoint)] + checkpoints: List[Callback] = [c for c in callbacks if isinstance(c, Checkpoint)] not_checkpoints = [c for c in callbacks if not isinstance(c, Checkpoint)] return not_checkpoints + checkpoints @@ -263,12 +267,12 @@ def _configure_external_callbacks() -> List[Callback]: else: from pkg_resources import iter_entry_points - factories = iter_entry_points("pytorch_lightning.callbacks_factory") + factories = iter_entry_points("pytorch_lightning.callbacks_factory") # type: ignore[assignment] - external_callbacks = [] + external_callbacks: List[Callback] = [] for factory in factories: callback_factory = factory.load() - callbacks_list: List[Callback] = callback_factory() + callbacks_list: Union[List[Callback], Callback] = callback_factory() callbacks_list = [callbacks_list] if isinstance(callbacks_list, Callback) else callbacks_list _log.info( f"Adding {len(callbacks_list)} callbacks from entry point '{factory.name}':" From 5c9b352eea38d39360324f9740e119dc42b2078e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 8 Aug 2022 09:25:15 +0000 Subject: [PATCH 06/59] Update wandb requirement from <0.12.20,>=0.10.22 to >=0.10.22,<0.13.2 in /requirements (#14080) --- requirements/pytorch/loggers.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/pytorch/loggers.txt b/requirements/pytorch/loggers.txt index 48a15c30f842f..df83a077f8457 100644 --- a/requirements/pytorch/loggers.txt +++ b/requirements/pytorch/loggers.txt @@ -7,4 +7,4 @@ neptune-client>=0.10.0, <0.16.4 comet-ml>=3.1.12, <3.31.8 mlflow>=1.0.0, <1.28.0 test_tube>=0.7.5, <=0.7.5 -wandb>=0.10.22, <0.12.20 +wandb>=0.10.22, <0.13.2 From b4ade232c8d8889fcadbf9b7b49380a3690f8acd Mon Sep 17 00:00:00 2001 From: Rick Izzo Date: Mon, 8 Aug 2022 07:13:25 -0400 Subject: [PATCH 07/59] Fix: Start Lightning App on Cloud if Repo Begins With Name "Lightning" (#14025) --- 
.../utilities/packaging/lightning_utils.py | 7 ++++++- .../utilities/packaging/test_lightning_utils.py | 16 ++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/lightning_app/utilities/packaging/lightning_utils.py b/src/lightning_app/utilities/packaging/lightning_utils.py index 37f4ff22988eb..073d4d7ab613a 100644 --- a/src/lightning_app/utilities/packaging/lightning_utils.py +++ b/src/lightning_app/utilities/packaging/lightning_utils.py @@ -89,8 +89,13 @@ def get_dist_path_if_editable_install(project_name) -> str: def _prepare_lightning_wheels_and_requirements(root: Path) -> Optional[Callable]: + """This function determines if lightning is installed in editable mode (for developers) and packages the + current lightning source along with the app. - if "site-packages" in _PROJECT_ROOT: + For normal users who install via PyPi or Conda, then this function does not do anything. + """ + + if not get_dist_path_if_editable_install("lightning"): return # Packaging the Lightning codebase happens only inside the `lightning` repo. 
diff --git a/tests/tests_app/utilities/packaging/test_lightning_utils.py b/tests/tests_app/utilities/packaging/test_lightning_utils.py index b34e3162d5a0c..8f30aa21dd396 100644 --- a/tests/tests_app/utilities/packaging/test_lightning_utils.py +++ b/tests/tests_app/utilities/packaging/test_lightning_utils.py @@ -1,4 +1,5 @@ import os +from unittest import mock import pytest @@ -21,6 +22,21 @@ def test_prepare_lightning_wheels_and_requirement(tmpdir): assert os.listdir(tmpdir) == [] +def _mocked_get_dist_path_if_editable_install(*args, **kwargs): + return None + + +@mock.patch( + "lightning_app.utilities.packaging.lightning_utils.get_dist_path_if_editable_install", + new=_mocked_get_dist_path_if_editable_install, +) +def test_prepare_lightning_wheels_and_requirement_for_packages_installed_in_editable_mode(tmpdir): + """This test ensures the source does not get packaged inside the lightning repo if not installed in editable + mode.""" + cleanup_handle = _prepare_lightning_wheels_and_requirements(tmpdir) + assert cleanup_handle is None + + @pytest.mark.skip(reason="TODO: Find a way to check for the latest version") @RunIf(skip_windows=True) def test_verify_lightning_version(monkeypatch): From d072e4451a73f8fc2d7886086a220fbaf614b49e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 8 Aug 2022 13:35:06 +0200 Subject: [PATCH 08/59] Fix dtype inference during gradient norm computation (#14051) --- src/pytorch_lightning/CHANGELOG.md | 3 +++ src/pytorch_lightning/utilities/grads.py | 6 +++--- tests/tests_pytorch/utilities/test_grads.py | 14 ++++++++++++++ 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 565ef0e8438b5..915436e5a0bcf 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -67,6 +67,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed MPS device being unrecognized ([#13992](https://github.com/Lightning-AI/lightning/pull/13992)) +- Fixed dtype inference during gradient norm computation ([#14051](https://github.com/Lightning-AI/lightning/pull/14051)) + + ## [1.7.0] - 2022-08-02 ### Added diff --git a/src/pytorch_lightning/utilities/grads.py b/src/pytorch_lightning/utilities/grads.py index 66c1b7d988522..76c3f39bdc013 100644 --- a/src/pytorch_lightning/utilities/grads.py +++ b/src/pytorch_lightning/utilities/grads.py @@ -41,12 +41,12 @@ def grad_norm(module: Module, norm_type: Union[float, int, str], group_separator raise ValueError(f"`norm_type` must be a positive number or 'inf' (infinity norm). Got {norm_type}") norms = { - f"grad_{norm_type}_norm{group_separator}{name}": p.grad.data.norm(norm_type).item() + f"grad_{norm_type}_norm{group_separator}{name}": p.grad.data.norm(norm_type) for name, p in module.named_parameters() if p.grad is not None } if norms: - total_norm = torch.tensor(list(norms.values())).norm(norm_type).item() + total_norm = torch.tensor(list(norms.values())).norm(norm_type) norms[f"grad_{norm_type}_norm_total"] = total_norm - norms = {k: round(v, 4) for k, v in norms.items()} + norms = {k: round(v.item(), 4) for k, v in norms.items()} return norms diff --git a/tests/tests_pytorch/utilities/test_grads.py b/tests/tests_pytorch/utilities/test_grads.py index a548de66ab85d..49aab76403847 100644 --- a/tests/tests_pytorch/utilities/test_grads.py +++ b/tests/tests_pytorch/utilities/test_grads.py @@ -76,3 +76,17 @@ def __init__(self): def test_grad_norm_invalid_norm_type(norm_type): with pytest.raises(ValueError, match="`norm_type` must be a positive number or 'inf'"): grad_norm(Mock(), norm_type) + + +def test_grad_norm_with_double_dtype(): + class Model(nn.Module): + def __init__(self): + super().__init__() + dtype = torch.double + self.param = nn.Parameter(torch.tensor(1.0, dtype=dtype)) + # grad norm of this would become infinite + self.param.grad = torch.tensor(1e23, 
dtype=dtype) + + model = Model() + norms = grad_norm(model, 2) + assert all(torch.isfinite(torch.tensor(v)) for v in norms.values()), norms From 61a9f3a9bc19272ed8117a9e4dd25bd9c0608105 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 8 Aug 2022 12:53:57 +0000 Subject: [PATCH 09/59] Update tqdm requirement from <=4.63.0,>=4.57.0 to >=4.57.0,<4.65.0 in /requirements (#13875) Update tqdm requirement in /requirements Updates the requirements on [tqdm](https://github.com/tqdm/tqdm) to permit the latest version. - [Release notes](https://github.com/tqdm/tqdm/releases) - [Commits](https://github.com/tqdm/tqdm/compare/v4.57.0...v4.64.0) --- updated-dependencies: - dependency-name: tqdm dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Jirka Borovec --- requirements/pytorch/base.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index e8743b18c73b0..49e2243319206 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -3,7 +3,7 @@ numpy>=1.17.2, <1.23.1 torch>=1.9.*, <=1.12.0 -tqdm>=4.57.0, <=4.63.0 +tqdm>=4.57.0, <4.65.0 PyYAML>=5.4, <=6.0 fsspec[http]>=2021.05.0, !=2021.06.0, <2022.6.0 tensorboard>=2.9.1, <2.10.0 From 890156a0163668149a47943907694c40cad153d3 Mon Sep 17 00:00:00 2001 From: JongMok Lee Date: Mon, 8 Aug 2022 22:16:56 +0900 Subject: [PATCH 10/59] Fix mypy errors in `pytorch_lightning/strategies/ddp.py` (#13885) Co-authored-by: awaelchli --- pyproject.toml | 1 - .../overrides/distributed.py | 2 - src/pytorch_lightning/strategies/ddp.py | 69 +++++++++++++------ src/pytorch_lightning/strategies/ddp_spawn.py | 3 +- src/pytorch_lightning/strategies/deepspeed.py | 4 +- 5 files changed, 51 insertions(+), 28 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 
2f0e290440f44..761c7be04cc0e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,7 +59,6 @@ module = [ "pytorch_lightning.profilers.base", "pytorch_lightning.profilers.pytorch", "pytorch_lightning.profilers.simple", - "pytorch_lightning.strategies.ddp", "pytorch_lightning.strategies.sharded", "pytorch_lightning.strategies.sharded_spawn", "pytorch_lightning.trainer.callback_hook", diff --git a/src/pytorch_lightning/overrides/distributed.py b/src/pytorch_lightning/overrides/distributed.py index f09a7b9e3ae08..929d1ed486f4a 100644 --- a/src/pytorch_lightning/overrides/distributed.py +++ b/src/pytorch_lightning/overrides/distributed.py @@ -45,8 +45,6 @@ def _find_tensors( # https://github.com/pytorch/pytorch/blob/v1.7.1/torch/nn/parallel/distributed.py#L626-L638 def prepare_for_backward(model: DistributedDataParallel, output: Any) -> None: # `prepare_for_backward` is `DistributedDataParallel` specific. - if not isinstance(model, DistributedDataParallel): - return if torch.is_grad_enabled() and model.require_backward_grad_sync: model.require_forward_param_sync = True # type: ignore[assignment] # We'll return the output object verbatim since it is a freeform diff --git a/src/pytorch_lightning/strategies/ddp.py b/src/pytorch_lightning/strategies/ddp.py index 922730df35269..57ab3a151b011 100644 --- a/src/pytorch_lightning/strategies/ddp.py +++ b/src/pytorch_lightning/strategies/ddp.py @@ -32,6 +32,7 @@ import pytorch_lightning as pl from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.overrides import LightningDistributedModule +from pytorch_lightning.overrides.base import _LightningPrecisionModuleWrapperBase from pytorch_lightning.overrides.distributed import prepare_for_backward from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment @@ -39,6 +40,7 @@ from pytorch_lightning.plugins.precision import PrecisionPlugin from 
pytorch_lightning.strategies.launchers.subprocess_script import _SubprocessScriptLauncher from pytorch_lightning.strategies.parallel import ParallelStrategy +from pytorch_lightning.strategies.strategy import TBroadcast from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.distributed import ( _get_process_group_backend_from_env, @@ -57,7 +59,7 @@ from pytorch_lightning.utilities.optimizer import optimizers_to_device from pytorch_lightning.utilities.rank_zero import rank_zero_info, rank_zero_only, rank_zero_warn from pytorch_lightning.utilities.seed import reset_seed -from pytorch_lightning.utilities.types import STEP_OUTPUT +from pytorch_lightning.utilities.types import PredictStep, STEP_OUTPUT, TestStep, ValidationStep if _FAIRSCALE_AVAILABLE: from fairscale.optim import OSS @@ -83,12 +85,12 @@ def __init__( checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None, ddp_comm_state: Optional[object] = None, - ddp_comm_hook: Optional[callable] = None, - ddp_comm_wrapper: Optional[callable] = None, + ddp_comm_hook: Optional[Callable] = None, + ddp_comm_wrapper: Optional[Callable] = None, model_averaging_period: Optional[int] = None, process_group_backend: Optional[str] = None, timeout: Optional[timedelta] = default_pg_timeout, - **kwargs: Union[Any, Dict[str, Any]], + **kwargs: Any, ) -> None: super().__init__( accelerator=accelerator, @@ -105,7 +107,7 @@ def __init__( self._ddp_comm_wrapper = ddp_comm_wrapper self._model_averaging_period = model_averaging_period self._model_averager: Optional[ModelAverager] = None - self._pids: Optional[List[int]] = None + self._pids: List[int] = [] self._sync_dir: Optional[str] = None self._rank_0_will_call_children_scripts: bool = False self._process_group_backend: Optional[str] = process_group_backend @@ -117,6 +119,7 @@ def is_distributed(self) -> bool: @property def root_device(self) -> torch.device: + assert self.parallel_devices is not None return 
self.parallel_devices[self.local_rank] @property @@ -129,11 +132,11 @@ def num_nodes(self, num_nodes: int) -> None: self._num_nodes = num_nodes @property - def num_processes(self): + def num_processes(self) -> int: return len(self.parallel_devices) if self.parallel_devices is not None else 0 @property - def distributed_sampler_kwargs(self): + def distributed_sampler_kwargs(self) -> Dict[str, Any]: distributed_sampler_kwargs = dict(num_replicas=(self.num_nodes * self.num_processes), rank=self.global_rank) return distributed_sampler_kwargs @@ -146,6 +149,7 @@ def process_group_backend(self) -> Optional[str]: return self._process_group_backend def _configure_launcher(self) -> None: + assert self.cluster_environment is not None if not self.cluster_environment.creates_processes_externally: self._launcher = _SubprocessScriptLauncher(self.cluster_environment, self.num_processes, self.num_nodes) self._rank_0_will_call_children_scripts = True @@ -156,10 +160,11 @@ def setup_environment(self) -> None: def setup(self, trainer: "pl.Trainer") -> None: # share ddp pids to all processes - self._rank_0_will_call_children_scripts = self.broadcast(self._rank_0_will_call_children_scripts) + self._rank_0_will_call_children_scripts = bool(self.broadcast(self._rank_0_will_call_children_scripts)) if self._should_run_deadlock_detection(): self._share_information_to_prevent_deadlock() + assert self.accelerator is not None self.accelerator.setup(trainer) # move the model to the correct device @@ -170,6 +175,7 @@ def setup(self, trainer: "pl.Trainer") -> None: if trainer_fn == TrainerFn.FITTING: if self._layer_sync: + assert self.model is not None self.model = self._layer_sync.apply(self.model) self.setup_precision_plugin() @@ -193,7 +199,7 @@ def _setup_model(self, model: Module) -> DistributedDataParallel: log.detail(f"setting up DDP model with device ids: {device_ids}, kwargs: {self._ddp_kwargs}") return DistributedDataParallel(module=model, device_ids=device_ids, **self._ddp_kwargs) - 
def setup_distributed(self): + def setup_distributed(self) -> None: log.detail(f"{self.__class__.__name__}: setting up distributed...") reset_seed() @@ -204,6 +210,7 @@ def setup_distributed(self): rank_zero_only.rank = self.global_rank self._process_group_backend = self._get_process_group_backend() + assert self.cluster_environment is not None init_dist_connection(self.cluster_environment, self._process_group_backend, timeout=self._timeout) def _get_process_group_backend(self) -> str: @@ -230,6 +237,7 @@ def pre_configure_ddp(self) -> None: def _register_ddp_hooks(self) -> None: log.detail(f"{self.__class__.__name__}: registering ddp hooks") if self.root_device.type == "cuda" and self._is_single_process_single_device: + assert isinstance(self.model, DistributedDataParallel) register_ddp_comm_hook( model=self.model, ddp_comm_state=self._ddp_comm_state, @@ -262,6 +270,7 @@ def _enable_model_averaging(self) -> None: f"{optimizer.__class__.__name__}." ) + assert self._ddp_comm_state is not None self._model_averager = torch.distributed.algorithms.model_averaging.averagers.PeriodicModelAverager( period=self._model_averaging_period, warmup_steps=self._ddp_comm_state.start_localSGD_iter ) @@ -296,15 +305,16 @@ def optimizer_step( def configure_ddp(self) -> None: log.detail(f"{self.__class__.__name__}: configuring DistributedDataParallel") self.pre_configure_ddp() + assert isinstance(self.model, (pl.LightningModule, _LightningPrecisionModuleWrapperBase)) self.model = self._setup_model(LightningDistributedModule(self.model)) self._register_ddp_hooks() - def determine_ddp_device_ids(self): + def determine_ddp_device_ids(self) -> Optional[List[int]]: if self.root_device.type == "cpu": return None return [self.root_device.index] - def barrier(self, *args, **kwargs) -> None: + def barrier(self, *args: Any, **kwargs: Any) -> None: if not distributed_available(): return if torch.distributed.get_backend() == "nccl": @@ -312,23 +322,29 @@ def barrier(self, *args, **kwargs) -> None: 
else: torch.distributed.barrier() - def broadcast(self, obj: object, src: int = 0) -> object: + def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast: obj = [obj] if self.global_rank != src: - obj = [None] + obj = [None] # type: ignore[list-item] torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD) return obj[0] def pre_backward(self, closure_loss: Tensor) -> None: """Run before precision plugin executes backward.""" + if not isinstance(self.model, DistributedDataParallel): + return + assert self.lightning_module is not None if not self.lightning_module.automatic_optimization: prepare_for_backward(self.model, closure_loss) - def model_to_device(self): + def model_to_device(self) -> None: log.detail(f"{self.__class__.__name__}: moving model to device [{self.root_device}]...") + assert self.model is not None self.model.to(self.root_device) - def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Union[ReduceOp, str] = "mean") -> Tensor: + def reduce( + self, tensor: Tensor, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = "mean" + ) -> Tensor: """Reduces a tensor from several distributed processes to one aggregated tensor. 
Args: @@ -344,30 +360,38 @@ def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Union[ReduceOp, tensor = sync_ddp_if_available(tensor, group, reduce_op=reduce_op) return tensor - def training_step(self, *args, **kwargs) -> STEP_OUTPUT: + def training_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: + assert self.model is not None with self.precision_plugin.train_step_context(): return self.model(*args, **kwargs) - def validation_step(self, *args, **kwargs) -> Optional[STEP_OUTPUT]: + def validation_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: with self.precision_plugin.val_step_context(): + assert self.lightning_module is not None + assert self.model is not None if self.lightning_module.trainer.state.fn == TrainerFn.FITTING: # used when calling `trainer.fit` return self.model(*args, **kwargs) else: # used when calling `trainer.validate` + assert isinstance(self.model, ValidationStep) return self.model.validation_step(*args, **kwargs) - def test_step(self, *args, **kwargs) -> Optional[STEP_OUTPUT]: + def test_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: with self.precision_plugin.test_step_context(): + assert isinstance(self.model, TestStep) return self.model.test_step(*args, **kwargs) - def predict_step(self, *args, **kwargs) -> STEP_OUTPUT: + def predict_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: with self.precision_plugin.predict_step_context(): + assert isinstance(self.model, PredictStep) return self.model.predict_step(*args, **kwargs) - def post_training_step(self): + def post_training_step(self) -> None: + assert self.lightning_module is not None if not self.lightning_module.automatic_optimization: - self.model.require_backward_grad_sync = True + assert self.model is not None + self.model.require_backward_grad_sync = True # type: ignore[assignment] @classmethod def register_strategies(cls, strategy_registry: Dict) -> None: @@ -458,7 +482,7 @@ def teardown(self) -> None: if ( 
_TORCH_GREATER_EQUAL_1_11 and not self.model.static_graph - and self.model._get_ddp_logging_data().get("can_set_static_graph") + and self.model._get_ddp_logging_data().get("can_set_static_graph") # type: ignore[operator] ): rank_zero_info( "Your model can run with static graph optimizations. For future training runs, we suggest you" @@ -475,6 +499,7 @@ def teardown(self) -> None: and pl_module._trainer.state.fn == TrainerFn.FITTING and self._layer_sync ): + assert self.model is not None self.model = self._layer_sync.revert(self.model) super().teardown() diff --git a/src/pytorch_lightning/strategies/ddp_spawn.py b/src/pytorch_lightning/strategies/ddp_spawn.py index 30bcef457c44a..21602e60a5754 100644 --- a/src/pytorch_lightning/strategies/ddp_spawn.py +++ b/src/pytorch_lightning/strategies/ddp_spawn.py @@ -254,9 +254,10 @@ def model_to_device(self) -> None: def pre_backward(self, closure_loss: Tensor) -> None: """Run before precision plugin executes backward.""" + if not isinstance(self.model, DistributedDataParallel): + return assert self.lightning_module is not None if not self.lightning_module.automatic_optimization: - assert isinstance(self.model, DistributedDataParallel) prepare_for_backward(self.model, closure_loss) def reduce( diff --git a/src/pytorch_lightning/strategies/deepspeed.py b/src/pytorch_lightning/strategies/deepspeed.py index b0b55374ba1a9..3c31aeb7a7657 100644 --- a/src/pytorch_lightning/strategies/deepspeed.py +++ b/src/pytorch_lightning/strategies/deepspeed.py @@ -19,7 +19,7 @@ import platform from collections import OrderedDict from pathlib import Path -from typing import Any, cast, Dict, Generator, List, Mapping, Optional, Tuple, Union +from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple, Union import torch from torch import Tensor @@ -831,7 +831,7 @@ def load_checkpoint(self, checkpoint_path: _PATH) -> Dict[str, Any]: if self.load_full_weights and self.zero_stage_3: # Broadcast to ensure we load from the rank 0 checkpoint 
# This doesn't have to be the case when using deepspeed sharded checkpointing - checkpoint_path = cast(_PATH, self.broadcast(checkpoint_path)) + checkpoint_path = self.broadcast(checkpoint_path) return super().load_checkpoint(checkpoint_path) # Rely on deepspeed to load the checkpoint and necessary information From 7439f5d7491a87fbbb33f47ac18fa4ff8c7eeb23 Mon Sep 17 00:00:00 2001 From: Sean Naren Date: Mon, 8 Aug 2022 14:23:07 +0100 Subject: [PATCH 11/59] Update CODEOWNERS (remove myself from defaults + some specifics) (#14084) Update CODEOWNERS --- .github/CODEOWNERS | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index e40828557c2cf..05f7e91104589 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -5,7 +5,7 @@ # the repo. Unless a later match takes precedence, # @global-owner1 and @global-owner2 will be requested for # review when someone opens a pull request. -* @williamfalcon @borda @tchaton @SeanNaren @carmocca @awaelchli @justusschock @kaushikb11 @rohitgr7 +* @williamfalcon @borda @tchaton @carmocca @awaelchli @justusschock @kaushikb11 @rohitgr7 # CI/CD and configs /.github/ @borda @carmocca @akihironitta @tchaton @@ -28,22 +28,22 @@ # Packages /src/pytorch_lightning/accelerators @williamfalcon @tchaton @SeanNaren @awaelchli @justusschock @kaushikb11 /src/pytorch_lightning/callbacks @williamfalcon @tchaton @carmocca @borda @kaushikb11 -/src/pytorch_lightning/core @tchaton @SeanNaren @borda @carmocca @justusschock @kaushikb11 +/src/pytorch_lightning/core @tchaton @borda @carmocca @justusschock @kaushikb11 /src/pytorch_lightning/distributed @williamfalcon @tchaton @awaelchli @kaushikb11 /src/pytorch_lightning/lite @tchaton @awaelchli @carmocca /src/pytorch_lightning/loggers @tchaton @awaelchli @borda /src/pytorch_lightning/loggers/wandb.py @borisdayma /src/pytorch_lightning/loggers/neptune.py @shnela @HubertJaworski @pkasprzyk @pitercl @Raalsky @aniezurawski @kamil-kaczmarek 
/src/pytorch_lightning/loops @tchaton @awaelchli @justusschock @carmocca -/src/pytorch_lightning/overrides @tchaton @SeanNaren @borda -/src/pytorch_lightning/plugins @tchaton @SeanNaren @awaelchli @justusschock +/src/pytorch_lightning/overrides @tchaton @borda +/src/pytorch_lightning/plugins @tchaton @awaelchli @justusschock /src/pytorch_lightning/profilers @williamfalcon @tchaton @borda @carmocca /src/pytorch_lightning/profilers/pytorch.py @nbcsm @guotuofeng /src/pytorch_lightning/strategies @tchaton @SeanNaren @awaelchli @justusschock @kaushikb11 -/src/pytorch_lightning/trainer @williamfalcon @borda @tchaton @SeanNaren @carmocca @awaelchli @justusschock @kaushikb11 -/src/pytorch_lightning/trainer/connectors @tchaton @SeanNaren @carmocca @borda +/src/pytorch_lightning/trainer @williamfalcon @borda @tchaton @carmocca @awaelchli @justusschock @kaushikb11 +/src/pytorch_lightning/trainer/connectors @tchaton @carmocca @borda /src/pytorch_lightning/tuner @SkafteNicki @borda @awaelchli -/src/pytorch_lightning/utilities @borda @tchaton @SeanNaren @carmocca +/src/pytorch_lightning/utilities @borda @tchaton @carmocca /src/lightning_app @tchaton @awaelchli @manskx @hhsecond From 55ae812dbf11f6568c73d5743aef0745715fb9fd Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 8 Aug 2022 15:48:50 +0200 Subject: [PATCH 12/59] Resolve increased time. (#14074) --- src/lightning_app/CHANGELOG.md | 2 ++ src/lightning_app/utilities/proxies.py | 2 +- tests/tests_app/core/test_lightning_app.py | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 07927a1b01f87..78a4e370e76ee 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -27,3 +27,5 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Fixed - Resolved a bug where the work statuses will grow quickly and be duplicated ([#13970](https://github.com/Lightning-AI/lightning/pull/13970)) + +- Resolved a bug about a race condition when sending the work state through the caller_queue ([#14074](https://github.com/Lightning-AI/lightning/pull/14074)) diff --git a/src/lightning_app/utilities/proxies.py b/src/lightning_app/utilities/proxies.py index 2c93a6c89f38c..99ad6e2aad0cf 100644 --- a/src/lightning_app/utilities/proxies.py +++ b/src/lightning_app/utilities/proxies.py @@ -74,7 +74,7 @@ def _send_data_to_caller_queue(work: "LightningWork", caller_queue: "BaseQueue", data.update({"state": work_state}) logger.debug(f"Sending to {work.name}: {data}") - caller_queue.put(data) + caller_queue.put(deepcopy(data)) # Reset the calls entry. work_state["calls"] = calls diff --git a/tests/tests_app/core/test_lightning_app.py b/tests/tests_app/core/test_lightning_app.py index a3a15085b98e3..e6c715f87ef03 100644 --- a/tests/tests_app/core/test_lightning_app.py +++ b/tests/tests_app/core/test_lightning_app.py @@ -896,6 +896,7 @@ def __init__(self, **kwargs): def run(self, signal: int): self.counter += 1 + assert len(self._calls) == 2 class SizeFlow(LightningFlow): From 34afde742ebe3acb56f5d0f14cd79d589d9771e0 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 9 Aug 2022 00:00:46 +0900 Subject: [PATCH 13/59] CI: Enable Python 3.10 in full CPU testing (#13829) * Update docker images to build --- .github/workflows/README.md | 2 +- .github/workflows/ci-pytorch_test-full.yml | 8 ++++++-- .github/workflows/ci-pytorch_test-slow.yml | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 8b9e7d173b03c..f559551e1237f 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -6,7 +6,7 @@ | workflow name | workflow file | action | accelerator\* | (Python, PyTorch) | OS | | -------------------------- | 
----------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | ------------------------------------------------ | ------------------- | -| Test full | .github/workflows/ci_test-full.yml | Run all tests except for accelerator-specific, standalone and slow tests. | CPU | (3.7, 1.8), (3.7, 1.11), (3.9, 1.8), (3.9, 1.12) | linux, mac, windows | +| Test full | .github/workflows/ci_test-full.yml | Run all tests except for accelerator-specific, standalone and slow tests. | CPU | (3.7, 1.9), (3.7, 1.12), (3.10, 1.12) | linux, mac, windows | | Test with Conda | .github/workflows/ci_test-conda.yml | Same as ci_test-full.yml but with dependencies installed with conda. | CPU | (3.8, 1.8), (3.8, 1.9), (3.8, 1.10), (3.9, 1.12) | linux | | Test slow | .github/workflows/ci_test-slow.yml | Run only slow tests. Slow tests usually need to spawn threads and cannot be speed up or simplified. | CPU | (3.7, 1.8) | linux, mac, windows | | pytorch-lightning (IPUs) | .azure-pipelines/ipu-tests.yml | Run only IPU-specific tests. | IPU | (3.8, 1.9) | linux | diff --git a/.github/workflows/ci-pytorch_test-full.yml b/.github/workflows/ci-pytorch_test-full.yml index fb6916d1414fe..445707d340c4b 100644 --- a/.github/workflows/ci-pytorch_test-full.yml +++ b/.github/workflows/ci-pytorch_test-full.yml @@ -21,9 +21,13 @@ jobs: fail-fast: false matrix: os: [ubuntu-20.04, windows-2019, macOS-11] - python-version: ["3.7", "3.9"] # minimum, maximum + python-version: ["3.7", "3.10"] # minimum, maximum requires: ["oldest", "latest"] release: ["stable"] + exclude: + # There's no distribution of the oldest PyTorch 1.9 for Python 3.10. + # TODO: Remove the exclusion when dropping PyTorch 1.9 support. 
+ - {python-version: "3.10", requires: "oldest"} # TODO: re-enable RC testing # include: # - {os: ubuntu-20.04, python-version: "3.10", requires: "latest", release: "pre"} @@ -41,7 +45,7 @@ jobs: id: skip shell: bash -l {0} run: | - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' + FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.github/workflows/ci-pytorch_test-full.yml' echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt MATCHES=$(cat changed_files.txt | grep -E $FILTER) echo $MATCHES diff --git a/.github/workflows/ci-pytorch_test-slow.yml b/.github/workflows/ci-pytorch_test-slow.yml index 905f60aa85699..b3756bbe8c2f7 100644 --- a/.github/workflows/ci-pytorch_test-slow.yml +++ b/.github/workflows/ci-pytorch_test-slow.yml @@ -36,7 +36,7 @@ jobs: id: skip shell: bash -l {0} run: | - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' + FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.github/workflows/ci-pytorch_test-slow.yml' echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt MATCHES=$(cat changed_files.txt | grep -E $FILTER) echo $MATCHES From 82d2d1d85746c7743cca47e760422d3e13af6a6f Mon Sep 17 00:00:00 2001 From: Justin Goheen <26209687+JustinGoheen@users.noreply.github.com> Date: Mon, 8 Aug 2022 16:21:26 -0400 Subject: [PATCH 14/59] Fix mypy errors attributed to `pytorch_lightning.core.saving` (#13932) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli Co-authored-by: otaj <6065855+otaj@users.noreply.github.com> Co-authored-by: Rohit Gupta --- pyproject.toml | 1 - src/pytorch_lightning/core/saving.py | 39 +++++++++++---------- src/pytorch_lightning/utilities/cloud_io.py | 6 ++-- src/pytorch_lightning/utilities/parsing.py | 4 ++- 
src/pytorch_lightning/utilities/types.py | 3 +- 5 files changed, 29 insertions(+), 24 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 761c7be04cc0e..8db782df357d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,7 +53,6 @@ module = [ "pytorch_lightning.callbacks.quantization", "pytorch_lightning.core.datamodule", "pytorch_lightning.core.module", - "pytorch_lightning.core.saving", "pytorch_lightning.demos.boring_classes", "pytorch_lightning.demos.mnist_datamodule", "pytorch_lightning.profilers.base", diff --git a/src/pytorch_lightning/core/saving.py b/src/pytorch_lightning/core/saving.py index da81e4c212560..ffdc0988a1a6e 100644 --- a/src/pytorch_lightning/core/saving.py +++ b/src/pytorch_lightning/core/saving.py @@ -20,10 +20,9 @@ from argparse import Namespace from copy import deepcopy from enum import Enum -from typing import Any, Callable, Dict, IO, MutableMapping, Optional, Union +from typing import Any, Callable, cast, Dict, IO, MutableMapping, Optional, Type, Union from warnings import warn -import torch import yaml import pytorch_lightning as pl @@ -34,7 +33,7 @@ from pytorch_lightning.utilities.migration import pl_legacy_patch from pytorch_lightning.utilities.parsing import parse_class_init_keys from pytorch_lightning.utilities.rank_zero import rank_zero_warn -from pytorch_lightning.utilities.types import _PATH +from pytorch_lightning.utilities.types import _MAP_LOCATION_TYPE, _PATH log = logging.getLogger(__name__) PRIMITIVE_TYPES = (bool, int, float, str) @@ -58,11 +57,11 @@ class ModelIO: def load_from_checkpoint( cls, checkpoint_path: Union[str, IO], - map_location: Optional[Union[Dict[str, str], str, torch.device, int, Callable]] = None, + map_location: _MAP_LOCATION_TYPE = None, hparams_file: Optional[str] = None, strict: bool = True, - **kwargs, - ): + **kwargs: Any, + ) -> Union["pl.LightningModule", "pl.LightningDataModule"]: r""" Primary way of loading a model from a checkpoint. 
When Lightning saves a checkpoint it stores the arguments passed to ``__init__`` in the checkpoint under ``"hyper_parameters"``. @@ -171,15 +170,15 @@ def on_hpc_load(self, checkpoint: Dict[str, Any]) -> None: def _load_from_checkpoint( - cls: Union["pl.LightningModule", "pl.LightningDataModule"], + cls: Union[Type["ModelIO"], Type["pl.LightningModule"], Type["pl.LightningDataModule"]], checkpoint_path: Union[str, IO], - map_location: Optional[Union[Dict[str, str], str, torch.device, int, Callable]] = None, + map_location: _MAP_LOCATION_TYPE = None, hparams_file: Optional[str] = None, - strict: Optional[bool] = None, + strict: bool = True, **kwargs: Any, -) -> Any: +) -> Union["pl.LightningModule", "pl.LightningDataModule"]: if map_location is None: - map_location = lambda storage, loc: storage + map_location = cast(_MAP_LOCATION_TYPE, lambda storage, loc: storage) with pl_legacy_patch(): checkpoint = pl_load(checkpoint_path, map_location=map_location) @@ -202,15 +201,18 @@ def _load_from_checkpoint( if issubclass(cls, pl.LightningDataModule): return _load_state(cls, checkpoint, **kwargs) - return _load_state(cls, checkpoint, strict=strict, **kwargs) + # allow cls to be evaluated as subclassed LightningModule or, + # as LightningModule for internal tests + if issubclass(cls, pl.LightningModule): + return _load_state(cls, checkpoint, strict=strict, **kwargs) def _load_state( - cls: Union["pl.LightningModule", "pl.LightningDataModule"], + cls: Union[Type["pl.LightningModule"], Type["pl.LightningDataModule"]], checkpoint: Dict[str, Any], - strict: Optional[bool] = None, + strict: bool = True, **cls_kwargs_new: Any, -) -> Any: +) -> Union["pl.LightningModule", "pl.LightningDataModule"]: cls_spec = inspect.getfullargspec(cls.__init__) cls_init_args_name = inspect.signature(cls.__init__).parameters.keys() @@ -228,8 +230,7 @@ def _load_state( cls_kwargs_loaded.update(checkpoint.get(_old_hparam_key, {})) # 2. 
Try to restore model hparams from checkpoint using the new key - _new_hparam_key = cls.CHECKPOINT_HYPER_PARAMS_KEY - cls_kwargs_loaded.update(checkpoint.get(_new_hparam_key)) + cls_kwargs_loaded.update(checkpoint.get(cls.CHECKPOINT_HYPER_PARAMS_KEY, {})) # 3. Ensure that `cls_kwargs_old` has the right type, back compatibility between dict and Namespace cls_kwargs_loaded = _convert_loaded_hparams(cls_kwargs_loaded, checkpoint.get(cls.CHECKPOINT_HYPER_PARAMS_TYPE)) @@ -271,7 +272,9 @@ def _load_state( return obj -def _convert_loaded_hparams(model_args: dict, hparams_type: Optional[Union[Callable, str]] = None) -> object: +def _convert_loaded_hparams( + model_args: Dict[str, Any], hparams_type: Optional[Union[Callable, str]] = None +) -> Dict[str, Any]: """Convert hparams according given type in callable or string (past) format.""" # if not hparams type define if not hparams_type: diff --git a/src/pytorch_lightning/utilities/cloud_io.py b/src/pytorch_lightning/utilities/cloud_io.py index ee3358be59541..99629bcda8980 100644 --- a/src/pytorch_lightning/utilities/cloud_io.py +++ b/src/pytorch_lightning/utilities/cloud_io.py @@ -15,19 +15,19 @@ import io from pathlib import Path -from typing import Any, Callable, Dict, IO, Optional, Union +from typing import Any, Dict, IO, Union import fsspec import torch from fsspec.core import url_to_fs from fsspec.implementations.local import AbstractFileSystem -from pytorch_lightning.utilities.types import _DEVICE, _PATH +from pytorch_lightning.utilities.types import _MAP_LOCATION_TYPE, _PATH def load( path_or_url: Union[IO, _PATH], - map_location: Optional[Union[_DEVICE, Callable[[_DEVICE], _DEVICE], Dict[_DEVICE, _DEVICE]]] = None, + map_location: _MAP_LOCATION_TYPE = None, ) -> Any: """Loads a checkpoint. 
diff --git a/src/pytorch_lightning/utilities/parsing.py b/src/pytorch_lightning/utilities/parsing.py index 9f5fe2d6b6841..81877f1dffba7 100644 --- a/src/pytorch_lightning/utilities/parsing.py +++ b/src/pytorch_lightning/utilities/parsing.py @@ -108,7 +108,9 @@ def clean_namespace(hparams: Union[Dict[str, Any], Namespace]) -> None: del hparams_dict[k] -def parse_class_init_keys(cls: Type["pl.LightningModule"]) -> Tuple[str, Optional[str], Optional[str]]: +def parse_class_init_keys( + cls: Union[Type["pl.LightningModule"], Type["pl.LightningDataModule"]] +) -> Tuple[str, Optional[str], Optional[str]]: """Parse key words for standard ``self``, ``*args`` and ``**kwargs``. Examples: diff --git a/src/pytorch_lightning/utilities/types.py b/src/pytorch_lightning/utilities/types.py index f6c14d366805f..18e2db6feb6c6 100644 --- a/src/pytorch_lightning/utilities/types.py +++ b/src/pytorch_lightning/utilities/types.py @@ -19,7 +19,7 @@ from contextlib import contextmanager from dataclasses import dataclass from pathlib import Path -from typing import Any, Dict, Generator, Iterator, List, Mapping, Optional, Sequence, Type, Union +from typing import Any, Callable, Dict, Generator, Iterator, List, Mapping, Optional, Sequence, Type, Union import torch from torch import Tensor @@ -49,6 +49,7 @@ ] EVAL_DATALOADERS = Union[DataLoader, Sequence[DataLoader]] _DEVICE = Union[torch.device, str, int] +_MAP_LOCATION_TYPE = Optional[Union[_DEVICE, Callable[[_DEVICE], _DEVICE], Dict[_DEVICE, _DEVICE]]] @runtime_checkable From 0cfc53d6b423531f598c9cfb386b3febbb8eb333 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 9 Aug 2022 10:26:02 +0200 Subject: [PATCH 15/59] Fix regression on default value for `find_unused_parameters` (#14095) --- src/pytorch_lightning/CHANGELOG.md | 3 +++ src/pytorch_lightning/strategies/ddp_spawn.py | 14 ++++++++++++-- tests/tests_pytorch/strategies/test_ddp.py | 12 ++++++++++++ .../strategies/test_ddp_spawn_strategy.py | 16 ++++++++++++++++ 
.../strategies/test_sharded_strategy.py | 14 ++++++++++++++ 5 files changed, 57 insertions(+), 2 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 915436e5a0bcf..04eddf2c735f4 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -70,6 +70,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed dtype inference during gradient norm computation ([#14051](https://github.com/Lightning-AI/lightning/pull/14051)) +- Fixed a bug that caused `ddp_find_unused_parameters` to be set `False`, whereas the intended default is `True` ([#14095](https://github.com/Lightning-AI/lightning/pull/14095)) + + ## [1.7.0] - 2022-08-02 ### Added diff --git a/src/pytorch_lightning/strategies/ddp_spawn.py b/src/pytorch_lightning/strategies/ddp_spawn.py index 21602e60a5754..de34320f54093 100644 --- a/src/pytorch_lightning/strategies/ddp_spawn.py +++ b/src/pytorch_lightning/strategies/ddp_spawn.py @@ -315,10 +315,20 @@ def post_training_step(self) -> None: def register_strategies(cls, strategy_registry: Dict) -> None: entries = ( ("ddp_spawn", "spawn"), - ("ddp_spawn_find_unused_parameters_false", "spawn"), ("ddp_fork", "fork"), - ("ddp_fork_find_unused_parameters_false", "fork"), ("ddp_notebook", "fork"), + ) + for name, start_method in entries: + strategy_registry.register( + name, + cls, + description=f"DDP strategy with `start_method` '{start_method}'", + start_method=start_method, + ) + + entries = ( + ("ddp_spawn_find_unused_parameters_false", "spawn"), + ("ddp_fork_find_unused_parameters_false", "fork"), ("ddp_notebook_find_unused_parameters_false", "fork"), ) for name, start_method in entries: diff --git a/tests/tests_pytorch/strategies/test_ddp.py b/tests/tests_pytorch/strategies/test_ddp.py index 4610f6153386b..1a2a0475e7ed6 100644 --- a/tests/tests_pytorch/strategies/test_ddp.py +++ b/tests/tests_pytorch/strategies/test_ddp.py @@ -194,3 +194,15 @@ def 
root_device(self): assert strategy._get_process_group_backend() == expected_process_group_backend else: assert strategy._get_process_group_backend() == expected_process_group_backend + + +@pytest.mark.parametrize( + "strategy_name,expected_ddp_kwargs", + [ + ("ddp", {}), + ("ddp_find_unused_parameters_false", {"find_unused_parameters": False}), + ], +) +def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs): + trainer = Trainer(strategy=strategy_name) + assert trainer.strategy._ddp_kwargs == expected_ddp_kwargs diff --git a/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py b/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py index 52427c2c8cc3a..f485060833320 100644 --- a/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py +++ b/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py @@ -178,3 +178,19 @@ def test_ddp_spawn_strategy_set_timeout(mock_init_process_group): mock_init_process_group.assert_called_with( process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta ) + + +@pytest.mark.parametrize( + "strategy_name,expected_ddp_kwargs", + [ + ("ddp_spawn", {}), + ("ddp_fork", {}), + ("ddp_notebook", {}), + ("ddp_spawn_find_unused_parameters_false", {"find_unused_parameters": False}), + ("ddp_fork_find_unused_parameters_false", {"find_unused_parameters": False}), + ("ddp_notebook_find_unused_parameters_false", {"find_unused_parameters": False}), + ], +) +def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs): + trainer = Trainer(strategy=strategy_name) + assert trainer.strategy._ddp_kwargs == expected_ddp_kwargs diff --git a/tests/tests_pytorch/strategies/test_sharded_strategy.py b/tests/tests_pytorch/strategies/test_sharded_strategy.py index a047a10df32e3..ad0673ed1a5fa 100644 --- a/tests/tests_pytorch/strategies/test_sharded_strategy.py +++ b/tests/tests_pytorch/strategies/test_sharded_strategy.py @@ -300,3 +300,17 @@ def test_block_backward_sync(): with 
strategy.block_backward_sync(): pass model.no_sync.assert_called_once() + + +@pytest.mark.parametrize( + "strategy_name,expected_ddp_kwargs", + [ + ("ddp_sharded", {}), + ("ddp_sharded_find_unused_parameters_false", {"find_unused_parameters": False}), + ("ddp_sharded_spawn", {}), + ("ddp_sharded_spawn_find_unused_parameters_false", {"find_unused_parameters": False}), + ], +) +def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs): + trainer = Trainer(strategy=strategy_name) + assert trainer.strategy._ddp_kwargs == expected_ddp_kwargs From d29a552b3c701ebc14d608347c1dbf55c3dfaa6a Mon Sep 17 00:00:00 2001 From: Robert S Lee Date: Tue, 9 Aug 2022 04:27:08 -0400 Subject: [PATCH 16/59] Fix import in doctest example (#14067) --- src/lightning_app/structures/dict.py | 2 +- src/lightning_app/structures/list.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lightning_app/structures/dict.py b/src/lightning_app/structures/dict.py index 93e2b161b2e7a..b414269b93eec 100644 --- a/src/lightning_app/structures/dict.py +++ b/src/lightning_app/structures/dict.py @@ -22,7 +22,7 @@ def __init__(self, **kwargs: T): .. doctest:: >>> from lightning_app import LightningFlow, LightningWork - >>> from lightning_app.core import Dict + >>> from lightning_app.structures import Dict >>> class CounterWork(LightningWork): ... def __init__(self): ... super().__init__() diff --git a/src/lightning_app/structures/list.py b/src/lightning_app/structures/list.py index f5a7c5c9913ad..cf691c98a8c38 100644 --- a/src/lightning_app/structures/list.py +++ b/src/lightning_app/structures/list.py @@ -24,7 +24,7 @@ def __init__(self, *items: T): .. doctest:: >>> from lightning_app import LightningFlow, LightningWork - >>> from lightning_app.core import List + >>> from lightning_app.structures import List >>> class CounterWork(LightningWork): ... def __init__(self): ... 
super().__init__() From c55fe7105b4d00735d22147612434ae9aebee4ab Mon Sep 17 00:00:00 2001 From: Anton Shevtsov <32237302+MrShevan@users.noreply.github.com> Date: Tue, 9 Aug 2022 16:40:30 +0300 Subject: [PATCH 17/59] Prefix seed_everything log messages with rank info (#14031) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Anton Shevtsov Co-authored-by: Rohit Gupta Co-authored-by: Carlos Mocholí --- src/pytorch_lightning/CHANGELOG.md | 2 +- src/pytorch_lightning/utilities/seed.py | 6 ++---- tests/tests_pytorch/utilities/test_seed.py | 18 ++++++++++++++++++ 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 04eddf2c735f4..4cea5685cac6f 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -8,7 +8,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added -- +- Added prefix to log message in `seed_everything` with rank info ([#13290](https://github.com/Lightning-AI/lightning/issues/13290)) - diff --git a/src/pytorch_lightning/utilities/seed.py b/src/pytorch_lightning/utilities/seed.py index 6648b5a56b2b1..8fce6a1debfcf 100644 --- a/src/pytorch_lightning/utilities/seed.py +++ b/src/pytorch_lightning/utilities/seed.py @@ -24,7 +24,7 @@ import numpy as np import torch -from pytorch_lightning.utilities.rank_zero import rank_zero_only, rank_zero_warn +from pytorch_lightning.utilities.rank_zero import _get_rank, rank_zero_only, rank_zero_warn log = logging.getLogger(__name__) @@ -66,9 +66,7 @@ def seed_everything(seed: Optional[int] = None, workers: bool = False) -> int: rank_zero_warn(f"{seed} is not in bounds, numpy accepts from {min_seed_value} to {max_seed_value}") seed = _select_seed_randomly(min_seed_value, max_seed_value) - # using `log.info` instead of `rank_zero_info`, - # so users can verify the seed is properly set in distributed training. 
- log.info(f"Global seed set to {seed}") + log.info(f"[rank: {_get_rank()}] Global seed set to {seed}") os.environ["PL_GLOBAL_SEED"] = str(seed) random.seed(seed) np.random.seed(seed) diff --git a/tests/tests_pytorch/utilities/test_seed.py b/tests/tests_pytorch/utilities/test_seed.py index 7f162bd605640..6908badf1a037 100644 --- a/tests/tests_pytorch/utilities/test_seed.py +++ b/tests/tests_pytorch/utilities/test_seed.py @@ -1,6 +1,8 @@ import os import random +from typing import Mapping from unittest import mock +from unittest.mock import MagicMock import numpy as np import pytest @@ -96,3 +98,19 @@ def test_isolate_rng(): with isolate_rng(): generated = [random.random() for _ in range(3)] assert random.random() == generated[0] + + +@mock.patch("pytorch_lightning.utilities.seed.log.info") +@pytest.mark.parametrize("env_vars", [{"RANK": "0"}, {"RANK": "1"}, {"RANK": "4"}]) +def test_seed_everything_log_info(log_mock: MagicMock, env_vars: Mapping[str, str]): + """Test that log message prefix with correct rank info.""" + with mock.patch.dict(os.environ, env_vars, clear=True): + from pytorch_lightning.utilities.rank_zero import _get_rank + + rank = _get_rank() + + seed_utils.seed_everything(123) + + expected_log = f"[rank: {rank}] Global seed set to 123" + + log_mock.assert_called_once_with(expected_log) From 9722127a741e9e108e49d9cffbc10a2842302c9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 9 Aug 2022 16:03:36 +0200 Subject: [PATCH 18/59] Add missing codeowners for app package (#13542) --- .github/CODEOWNERS | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 05f7e91104589..f83924b9566ce 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -45,7 +45,13 @@ /src/pytorch_lightning/tuner @SkafteNicki @borda @awaelchli /src/pytorch_lightning/utilities @borda @tchaton @carmocca -/src/lightning_app @tchaton @awaelchli @manskx @hhsecond +/src/lightning_app @tchaton 
@manskx +/src/lightning_app/cli/pl-app-template @awaelchli @tchaton @Borda +/src/lightning_app/core @tchaton @awaelchli @manskx +/src/lightning_app/core/queues.py @tchaton @hhsecond @manskx +/src/lightning_app/runners/cloud.py @tchaton @hhsecond +/src/lightning_app/testing @tchaton @manskx +/src/lightning_app/__about__.py @nohalon @edenlightning @lantiga # Examples /examples/app_* @tchaton @awaelchli @manskx @hhsecond From ac369f5570d0a492d08ac9c2ba6622e451d7e131 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Tue, 9 Aug 2022 21:25:23 +0530 Subject: [PATCH 19/59] Fix incorrect `precision="mixed"` being used with `DeepSpeedStrategy` and `IPUStrategy` (#14041) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- src/pytorch_lightning/CHANGELOG.md | 3 +++ .../plugins/precision/deepspeed.py | 2 +- src/pytorch_lightning/plugins/precision/ipu.py | 3 ++- src/pytorch_lightning/strategies/deepspeed.py | 2 +- src/pytorch_lightning/strategies/ipu.py | 2 +- src/pytorch_lightning/strategies/utils.py | 2 +- tests/tests_pytorch/accelerators/test_ipu.py | 2 +- .../strategies/test_deepspeed_strategy.py | 13 ++++++------- 8 files changed, 16 insertions(+), 13 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 4cea5685cac6f..dac5533a6cb17 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -67,6 +67,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed MPS device being unrecognized ([#13992](https://github.com/Lightning-AI/lightning/pull/13992)) +- Fixed incorrect `precision="mixed"` being used with `DeepSpeedStrategy` and `IPUStrategy` ([#14041](https://github.com/Lightning-AI/lightning/pull/14041)) + + - Fixed dtype inference during gradient norm computation ([#14051](https://github.com/Lightning-AI/lightning/pull/14051)) diff --git a/src/pytorch_lightning/plugins/precision/deepspeed.py b/src/pytorch_lightning/plugins/precision/deepspeed.py index 01d3017760b0e..456bba1e77823 100644 --- a/src/pytorch_lightning/plugins/precision/deepspeed.py +++ b/src/pytorch_lightning/plugins/precision/deepspeed.py @@ -60,7 +60,7 @@ def __init__(self, precision: Union[str, int], amp_type: str, amp_level: Optiona amp_level = amp_level or "O2" - supported_precision = (PrecisionType.HALF, PrecisionType.FLOAT, PrecisionType.BFLOAT, PrecisionType.MIXED) + supported_precision = (PrecisionType.HALF, PrecisionType.FLOAT, PrecisionType.BFLOAT) if precision not in supported_precision: raise ValueError( f"`Trainer(strategy='deepspeed', precision={precision!r})` is not supported." 
diff --git a/src/pytorch_lightning/plugins/precision/ipu.py b/src/pytorch_lightning/plugins/precision/ipu.py index 89f544575f63f..67e5e373e9f52 100644 --- a/src/pytorch_lightning/plugins/precision/ipu.py +++ b/src/pytorch_lightning/plugins/precision/ipu.py @@ -19,6 +19,7 @@ import pytorch_lightning as pl from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin from pytorch_lightning.utilities import GradClipAlgorithmType +from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.warnings import WarningCache @@ -35,7 +36,7 @@ class IPUPrecisionPlugin(PrecisionPlugin): """ def __init__(self, precision: int) -> None: - supported_precision_values = (16, 32) + supported_precision_values = (PrecisionType.HALF, PrecisionType.FLOAT) if precision not in supported_precision_values: raise ValueError( f"`Trainer(accelerator='ipu', precision={precision!r})` is not supported." 
diff --git a/src/pytorch_lightning/strategies/deepspeed.py b/src/pytorch_lightning/strategies/deepspeed.py index 3c31aeb7a7657..8acbc80257bd1 100644 --- a/src/pytorch_lightning/strategies/deepspeed.py +++ b/src/pytorch_lightning/strategies/deepspeed.py @@ -696,7 +696,7 @@ def _auto_select_batch_size(self) -> int: def _format_precision_config(self) -> None: assert isinstance(self.config, dict) - if self.precision_plugin.precision in (PrecisionType.HALF, PrecisionType.MIXED): + if self.precision_plugin.precision == PrecisionType.HALF: if "fp16" not in self.config and self.precision_plugin.amp_type == AMPType.NATIVE: # FP16 is a DeepSpeed standalone AMP implementation rank_zero_info("Enabling DeepSpeed FP16.") diff --git a/src/pytorch_lightning/strategies/ipu.py b/src/pytorch_lightning/strategies/ipu.py index c40addd4244b2..4bedbfd6d70fc 100644 --- a/src/pytorch_lightning/strategies/ipu.py +++ b/src/pytorch_lightning/strategies/ipu.py @@ -58,7 +58,7 @@ def __init__( self.precision = precision def forward(self, *inputs: Any, **kwargs: Any) -> Any: - if self.precision in (PrecisionType.MIXED, PrecisionType.HALF): + if self.precision == PrecisionType.HALF: inputs = self._move_float_tensors_to_half(inputs) return super().forward(*inputs, **kwargs) diff --git a/src/pytorch_lightning/strategies/utils.py b/src/pytorch_lightning/strategies/utils.py index b71458bfc30d3..cdae7bf434eca 100644 --- a/src/pytorch_lightning/strategies/utils.py +++ b/src/pytorch_lightning/strategies/utils.py @@ -24,7 +24,7 @@ def on_colab_kaggle() -> bool: def _fp_to_half(tensor: torch.Tensor, precision: PrecisionType) -> torch.Tensor: if torch.is_floating_point(tensor): - if precision in (PrecisionType.MIXED, PrecisionType.HALF): + if precision == PrecisionType.HALF: return tensor.half() if precision == PrecisionType.BFLOAT: return tensor.bfloat16() diff --git a/tests/tests_pytorch/accelerators/test_ipu.py b/tests/tests_pytorch/accelerators/test_ipu.py index 33d59d9a835ca..db3b9d1f91952 100644 --- 
a/tests/tests_pytorch/accelerators/test_ipu.py +++ b/tests/tests_pytorch/accelerators/test_ipu.py @@ -185,7 +185,7 @@ def test_optimization(tmpdir): @RunIf(ipu=True) -def test_mixed_precision(tmpdir): +def test_half_precision(tmpdir): class TestCallback(Callback): def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[str] = None) -> None: assert trainer.strategy.model.precision == 16 diff --git a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py index 4f2cc14b6c62d..272b03a846688 100644 --- a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py +++ b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py @@ -171,12 +171,11 @@ def test_deepspeed_strategy_env(tmpdir, monkeypatch, deepspeed_config): @RunIf(deepspeed=True) @mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=1) -@pytest.mark.parametrize("precision", [16, "mixed"]) @pytest.mark.parametrize( "amp_backend", ["native", pytest.param("apex", marks=RunIf(amp_apex=True))], ) -def test_deepspeed_precision_choice(_, amp_backend, precision, tmpdir): +def test_deepspeed_precision_choice(_, amp_backend, tmpdir): """Test to ensure precision plugin is also correctly chosen. 
DeepSpeed handles precision via Custom DeepSpeedPrecisionPlugin @@ -188,16 +187,16 @@ def test_deepspeed_precision_choice(_, amp_backend, precision, tmpdir): accelerator="gpu", strategy="deepspeed", amp_backend=amp_backend, - precision=precision, + precision=16, ) assert isinstance(trainer.strategy, DeepSpeedStrategy) assert isinstance(trainer.strategy.precision_plugin, DeepSpeedPrecisionPlugin) - assert trainer.strategy.precision_plugin.precision == precision + assert trainer.strategy.precision_plugin.precision == 16 @RunIf(deepspeed=True) -def test_deepspeed_with_invalid_config_path(tmpdir): +def test_deepspeed_with_invalid_config_path(): """Test to ensure if we pass an invalid config path we throw an exception.""" with pytest.raises( @@ -218,7 +217,7 @@ def test_deepspeed_with_env_path(tmpdir, monkeypatch, deepspeed_config): @RunIf(deepspeed=True) -def test_deepspeed_defaults(tmpdir): +def test_deepspeed_defaults(): """Ensure that defaults are correctly set as a config for DeepSpeed if no arguments are passed.""" strategy = DeepSpeedStrategy() assert strategy.config is not None @@ -663,7 +662,7 @@ def training_step(self, batch, batch_idx): @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True) -def test_deepspeed_multigpu_stage_3(tmpdir, deepspeed_config): +def test_deepspeed_multigpu_stage_3(tmpdir): """Test to ensure ZeRO Stage 3 works with a parallel model.""" model = ModelParallelBoringModel() trainer = Trainer( From 56abd60f048f43a7abd036380a6b5297baaa3854 Mon Sep 17 00:00:00 2001 From: Gautier Dagan Date: Tue, 9 Aug 2022 17:32:18 +0100 Subject: [PATCH 20/59] Fix assert wandb Run when mode="disabled" (#14112) --- src/pytorch_lightning/loggers/wandb.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/pytorch_lightning/loggers/wandb.py b/src/pytorch_lightning/loggers/wandb.py index 8e30827759b99..530fb58fabe5e 100644 --- a/src/pytorch_lightning/loggers/wandb.py +++ b/src/pytorch_lightning/loggers/wandb.py @@ -328,7 +328,7 @@ def 
__getstate__(self) -> Dict[str, Any]: @property # type: ignore[misc] @rank_zero_experiment - def experiment(self) -> Run: + def experiment(self) -> Union[Run, RunDisabled]: r""" Actual wandb object. To use wandb features in your @@ -361,11 +361,13 @@ def experiment(self) -> Run: self._experiment = wandb.init(**self._wandb_init) # define default x-axis - if isinstance(self._experiment, Run) and getattr(self._experiment, "define_metric", None): + if isinstance(self._experiment, (Run, RunDisabled)) and getattr( + self._experiment, "define_metric", None + ): self._experiment.define_metric("trainer/global_step") self._experiment.define_metric("*", step_metric="trainer/global_step", step_sync=True) - assert isinstance(self._experiment, Run) + assert isinstance(self._experiment, (Run, RunDisabled)) return self._experiment def watch(self, model: nn.Module, log: str = "gradients", log_freq: int = 100, log_graph: bool = True) -> None: From d85085479d9248d2f6a06821a0cb41c4a2eb02fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Tue, 9 Aug 2022 19:31:11 +0200 Subject: [PATCH 21/59] Reset all results on epoch end (#14061) --- src/pytorch_lightning/CHANGELOG.md | 3 ++ .../logger_connector/logger_connector.py | 3 +- .../logging_/test_train_loop_logging.py | 29 +++++++++++++++++-- 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index dac5533a6cb17..5dfd871f933f5 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -61,6 +61,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed the `NeptuneLogger` dependency being unrecognized ([#13988](https://github.com/Lightning-AI/lightning/pull/13988)) +- Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061)) + + - Fixed an issue where users would be warned about unset `max_epochs` even when `fast_dev_run` was set ([#13262](https://github.com/Lightning-AI/lightning/pull/13262)) diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index ff882912625d0..02e17a8d93494 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -163,8 +163,7 @@ def update_train_epoch_metrics(self) -> None: self.log_metrics(self.metrics["log"]) # reset result collection for next epoch - assert self.trainer._results is not None - self.trainer._results.reset(metrics=True) + self.reset_results() """ Utilities and properties diff --git a/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py b/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py index 5855eba4c86af..d16be306b9365 100644 --- a/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py +++ b/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py @@ -569,11 +569,12 @@ def on_train_epoch_end(self, trainer, pl_module): "accelerator", [ pytest.param("gpu", marks=RunIf(min_cuda_gpus=1)), + "cpu", ], ) def test_metric_are_properly_reduced(tmpdir, accelerator): class TestingModel(BoringModel): - def __init__(self, *args, **kwargs) -> None: + def __init__(self) -> None: super().__init__() self.val_acc = Accuracy() @@ -592,7 +593,6 @@ def validation_step(self, batch, batch_idx): return super().validation_step(batch, batch_idx) early_stop = EarlyStopping(monitor="val_acc", mode="max") - checkpoint = 
ModelCheckpoint(monitor="val_acc", save_last=True, save_top_k=2, mode="max") model = TestingModel() @@ -812,3 +812,28 @@ def training_step(self, batch, batch_idx): call(metrics={"foo_epoch": 0.0, "epoch": 1}, step=3), ] ) + + +@mock.patch("pytorch_lightning.loggers.TensorBoardLogger.log_metrics") +def test_log_on_train_start(mock_log_metrics, tmpdir): + """Tests that logged metrics on_train_start get reset after the first epoch.""" + + class MyModel(BoringModel): + def on_train_start(self): + self.log("foo", 123) + + model = MyModel() + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=1, + limit_val_batches=0, + max_epochs=2, + log_every_n_steps=1, + enable_model_summary=False, + enable_checkpointing=False, + enable_progress_bar=False, + ) + trainer.fit(model) + + assert mock_log_metrics.mock_calls == [call(metrics={"foo": 123.0, "epoch": 0}, step=0)] + assert trainer.max_epochs > 1 From 619c2ff05827872973b2eed18d06651f7cd8bd4e Mon Sep 17 00:00:00 2001 From: Raphael Randschau Date: Tue, 9 Aug 2022 12:17:57 -0700 Subject: [PATCH 22/59] [CLI] fix cluster creation CLI requiring instance-type selection (#14056) fix cluster creation CLI requiring instance-type selection we've marked `instance_types` as `required=False`, but the CLI calls `split` on the value. So if nothing is provided, we'll actually receive a runtime error, effectively rendering the flag as required. 
Co-authored-by: thomas chaton --- src/lightning_app/cli/lightning_cli_create.py | 2 +- tests/tests_app/cli/test_cli.py | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/lightning_app/cli/lightning_cli_create.py b/src/lightning_app/cli/lightning_cli_create.py index 7e45fe7e7c078..d400db4b6f337 100644 --- a/src/lightning_app/cli/lightning_cli_create.py +++ b/src/lightning_app/cli/lightning_cli_create.py @@ -79,7 +79,7 @@ def create_cluster( region=region, role_arn=role_arn, external_id=external_id, - instance_types=instance_types.split(","), + instance_types=instance_types.split(",") if instance_types is not None else None, edit_before_creation=edit_before_creation, cost_savings=cost_savings, wait=wait, diff --git a/tests/tests_app/cli/test_cli.py b/tests/tests_app/cli/test_cli.py index 16e641ac38f23..8cc5dd50f836e 100644 --- a/tests/tests_app/cli/test_cli.py +++ b/tests/tests_app/cli/test_cli.py @@ -70,7 +70,15 @@ def test_main_lightning_cli_help(): @mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) @mock.patch("lightning_app.cli.cmd_clusters.AWSClusterManager.create") -def test_create_cluster(create: mock.MagicMock): +@pytest.mark.parametrize( + "instance_types,expected_instance_types", + [ + (["--instance-types", "t3.xlarge"], ["t3.xlarge"]), + (["--instance-types", "t3.xlarge,t3.2xlarge"], ["t3.xlarge", "t3.2xlarge"]), + ([], None), + ], +) +def test_create_cluster(create_command: mock.MagicMock, instance_types, expected_instance_types): runner = CliRunner() runner.invoke( create_cluster, @@ -82,17 +90,16 @@ def test_create_cluster(create: mock.MagicMock): "dummy", "--role-arn", "arn:aws:iam::1234567890:role/lai-byoc", - "--instance-types", - "t2.small", - ], + ] + + instance_types, ) - create.assert_called_once_with( + create_command.assert_called_once_with( cluster_name="test-7", region="us-east-1", role_arn="arn:aws:iam::1234567890:role/lai-byoc", external_id="dummy", - instance_types=["t2.small"], + 
instance_types=expected_instance_types, edit_before_creation=False, cost_savings=False, wait=False, From 06c255c5c1889e6ecc640dc82a24193ce388511a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 10 Aug 2022 00:54:10 +0200 Subject: [PATCH 23/59] Skip ddp fork tests on windows (#14121) --- .../strategies/test_ddp_spawn_strategy.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py b/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py index f485060833320..7fb22206c45c6 100644 --- a/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py +++ b/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py @@ -184,11 +184,17 @@ def test_ddp_spawn_strategy_set_timeout(mock_init_process_group): "strategy_name,expected_ddp_kwargs", [ ("ddp_spawn", {}), - ("ddp_fork", {}), - ("ddp_notebook", {}), + pytest.param("ddp_fork", {}, marks=RunIf(skip_windows=True)), + pytest.param("ddp_notebook", {}, marks=RunIf(skip_windows=True)), ("ddp_spawn_find_unused_parameters_false", {"find_unused_parameters": False}), - ("ddp_fork_find_unused_parameters_false", {"find_unused_parameters": False}), - ("ddp_notebook_find_unused_parameters_false", {"find_unused_parameters": False}), + pytest.param( + "ddp_fork_find_unused_parameters_false", {"find_unused_parameters": False}, marks=RunIf(skip_windows=True) + ), + pytest.param( + "ddp_notebook_find_unused_parameters_false", + {"find_unused_parameters": False}, + marks=RunIf(skip_windows=True), + ), ], ) def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs): From 975a4fc2f1daf5a1662a0d1f47212e7dcdae8b2b Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Tue, 9 Aug 2022 16:18:21 -0700 Subject: [PATCH 24/59] Support checkpoint save and load with Stochastic Weight Averaging (#9938) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: thomas chaton Co-authored-by: 
pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Adrian Wälchli Co-authored-by: Carlos Mocholi Co-authored-by: Kushashwa Ravi Shrimali Co-authored-by: Jirka Co-authored-by: Rohit Gupta --- src/pytorch_lightning/CHANGELOG.md | 3 + .../callbacks/stochastic_weight_avg.py | 78 ++++++++++- .../callbacks/test_stochastic_weight_avg.py | 128 +++++++++++++++++- 3 files changed, 195 insertions(+), 14 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 5dfd871f933f5..8852367a116f6 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -73,6 +73,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed incorrect `precision="mixed"` being used with `DeepSpeedStrategy` and `IPUStrategy` ([#14041](https://github.com/Lightning-AI/lightning/pull/14041)) +- Fixed resuming from a checkpoint when using Stochastic Weight Averaging (SWA) ([#9938](https://github.com/Lightning-AI/lightning/pull/9938)) + + - Fixed dtype inference during gradient norm computation ([#14051](https://github.com/Lightning-AI/lightning/pull/14051)) diff --git a/src/pytorch_lightning/callbacks/stochastic_weight_avg.py b/src/pytorch_lightning/callbacks/stochastic_weight_avg.py index 20a3dcc3f0f26..6650bb3f0c479 100644 --- a/src/pytorch_lightning/callbacks/stochastic_weight_avg.py +++ b/src/pytorch_lightning/callbacks/stochastic_weight_avg.py @@ -16,7 +16,7 @@ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ """ from copy import deepcopy -from typing import Any, Callable, cast, List, Optional, Union +from typing import Any, Callable, cast, Dict, List, Optional, Union import torch from torch import nn, Tensor @@ -24,6 +24,7 @@ import pytorch_lightning as pl from pytorch_lightning.callbacks.callback import Callback +from pytorch_lightning.strategies import DDPFullyShardedStrategy, DeepSpeedStrategy from pytorch_lightning.utilities.exceptions import 
MisconfigurationException from pytorch_lightning.utilities.rank_zero import rank_zero_info, rank_zero_warn from pytorch_lightning.utilities.types import _LRScheduler, LRSchedulerConfig @@ -112,15 +113,22 @@ def __init__( if device is not None and not isinstance(device, (torch.device, str)): raise MisconfigurationException(f"device is expected to be a torch.device or a str. Found {device}") + self.n_averaged: Optional[torch.Tensor] = None self._swa_epoch_start = swa_epoch_start self._swa_lrs = swa_lrs self._annealing_epochs = annealing_epochs self._annealing_strategy = annealing_strategy self._avg_fn = avg_fn or self.avg_fn self._device = device - self._max_epochs: int - self._model_contains_batch_norm: bool + self._model_contains_batch_norm: Optional[bool] = None self._average_model: "pl.LightningModule" + self._initialized = False + self._swa_scheduler: Optional[_LRScheduler] = None + self._scheduler_state: Optional[Dict] = None + self._init_n_averaged = 0 + self._latest_update_epoch = -1 + self.momenta: Optional[Dict[nn.modules.batchnorm._BatchNorm, float]] = None + self._max_epochs: int @property def swa_start(self) -> int: @@ -147,6 +155,9 @@ def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") - if len(trainer.lr_scheduler_configs) > 1: raise MisconfigurationException("SWA currently not supported for more than 1 `lr_scheduler`.") + if isinstance(trainer.strategy, (DDPFullyShardedStrategy, DeepSpeedStrategy)): + raise MisconfigurationException("SWA does not currently support sharded models.") + if isinstance(self._swa_epoch_start, float): self._swa_epoch_start = int(trainer.max_epochs * self._swa_epoch_start) @@ -158,8 +169,13 @@ def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") - assert trainer.fit_loop.max_epochs is not None trainer.fit_loop.max_epochs += 1 + if self._scheduler_state is not None: + self._clear_schedulers(trainer) + def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: 
"pl.LightningModule") -> None: - if trainer.current_epoch == self.swa_start: + if (not self._initialized) and (self.swa_start <= trainer.current_epoch <= self.swa_end): + self._initialized = True + # move average model to request device. self._average_model = self._average_model.to(self._device or pl_module.device) @@ -180,6 +196,17 @@ def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningMo last_epoch=trainer.max_epochs if self._annealing_strategy == "cos" else -1, ), ) + if self._scheduler_state is not None: + # Restore scheduler state from checkpoint + self._swa_scheduler.load_state_dict(self._scheduler_state) + elif trainer.current_epoch != self.swa_start: + # Log a warning if we're initializing after start without any checkpoint data, + # as behaviour will be different compared to having checkpoint data. + rank_zero_warn( + "SWA is initializing after swa_start without any checkpoint data. " + "This may be caused by loading a checkpoint from an older version of PyTorch Lightning." 
+ ) + # We assert that there is only one optimizer on fit start, so know opt_idx is always 0 default_scheduler_cfg = LRSchedulerConfig(self._swa_scheduler, opt_idx=0) assert default_scheduler_cfg.interval == "epoch" and default_scheduler_cfg.frequency == 1 @@ -196,14 +223,18 @@ def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningMo else: trainer.lr_scheduler_configs.append(default_scheduler_cfg) - self.n_averaged = torch.tensor(0, dtype=torch.long, device=pl_module.device) + if self.n_averaged is None: + self.n_averaged = torch.tensor(self._init_n_averaged, dtype=torch.long, device=pl_module.device) - if self.swa_start <= trainer.current_epoch <= self.swa_end: + if (self.swa_start <= trainer.current_epoch <= self.swa_end) and ( + trainer.current_epoch > self._latest_update_epoch + ): + assert self.n_averaged is not None self.update_parameters(self._average_model, pl_module, self.n_averaged, self._avg_fn) + self._latest_update_epoch = trainer.current_epoch # Note: No > here in case the callback is saved with the model and training continues if trainer.current_epoch == self.swa_end + 1: - # Transfer weights from average model to pl_module self.transfer_weights(self._average_model, pl_module) @@ -265,6 +296,7 @@ def reset_batch_norm_and_save_state(self, pl_module: "pl.LightningModule") -> No def reset_momenta(self) -> None: """Adapted from https://github.com/pytorch/pytorch/blob/v1.7.1/torch/optim/swa_utils.py#L164-L165.""" + assert self.momenta is not None for bn_module in self.momenta: bn_module.momentum = self.momenta[bn_module] @@ -285,3 +317,35 @@ def update_parameters( def avg_fn(averaged_model_parameter: Tensor, model_parameter: Tensor, num_averaged: Tensor) -> Tensor: """Adapted from https://github.com/pytorch/pytorch/blob/v1.7.1/torch/optim/swa_utils.py#L95-L97.""" return averaged_model_parameter + (model_parameter - averaged_model_parameter) / (num_averaged + 1) + + def state_dict(self) -> Dict[str, Any]: + return { + "n_averaged": 
0 if self.n_averaged is None else self.n_averaged.item(), + "latest_update_epoch": self._latest_update_epoch, + "scheduler_state": None if self._swa_scheduler is None else self._swa_scheduler.state_dict(), + "average_model_state": None if self._average_model is None else self._average_model.state_dict(), + } + + def load_state_dict(self, state_dict: Dict[str, Any]) -> None: + self._init_n_averaged = state_dict["n_averaged"] + self._latest_update_epoch = state_dict["latest_update_epoch"] + self._scheduler_state = state_dict["scheduler_state"] + self._load_average_model_state(state_dict["average_model_state"]) + + @staticmethod + def _clear_schedulers(trainer: "pl.Trainer") -> None: + # If we have scheduler state saved, clear the scheduler configs so that we don't try to + # load state into the wrong type of schedulers when restoring scheduler checkpoint state. + # We'll configure the scheduler and re-load its state in on_train_epoch_start. + # Note that this relies on the callback state being restored before the scheduler state is + # restored, and doesn't work if restore_checkpoint_after_setup is True, but at the time of + # writing that is only True for deepspeed which is already not supported by SWA. + # See https://github.com/PyTorchLightning/pytorch-lightning/issues/11665 for background. 
+ if trainer.lr_scheduler_configs: + assert len(trainer.lr_scheduler_configs) == 1 + trainer.lr_scheduler_configs.clear() + + def _load_average_model_state(self, model_state: Any) -> None: + if self._average_model is None: + return + self._average_model.load_state_dict(model_state) diff --git a/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py b/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py index 859cf2fa98c0c..65a0fea2fb4a5 100644 --- a/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py +++ b/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py @@ -12,11 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging +import os +from pathlib import Path +from typing import ContextManager, Optional from unittest import mock import pytest import torch from torch import nn +from torch.optim.lr_scheduler import LambdaLR from torch.optim.swa_utils import SWALR from torch.utils.data import DataLoader @@ -30,7 +34,9 @@ class SwaTestModel(BoringModel): - def __init__(self, batchnorm: bool = True, interval: str = "epoch", iterable_dataset: bool = False): + def __init__( + self, batchnorm: bool = True, interval: str = "epoch", iterable_dataset: bool = False, crash_on_epoch=None + ): super().__init__() layers = [nn.Linear(32, 32)] if batchnorm: @@ -39,17 +45,18 @@ def __init__(self, batchnorm: bool = True, interval: str = "epoch", iterable_dat self.layer = nn.Sequential(*layers) self.interval = interval self.iterable_dataset = iterable_dataset + self.crash_on_epoch = crash_on_epoch def training_step(self, batch, batch_idx): + if self.crash_on_epoch and self.trainer.current_epoch >= self.crash_on_epoch: + raise Exception("SWA crash test") output = self.forward(batch) loss = self.loss(batch, output) return {"loss": loss} def train_dataloader(self): - dset_cls = RandomIterableDataset if self.iterable_dataset else RandomDataset dset = dset_cls(32, 64) - return DataLoader(dset, 
batch_size=2) def configure_optimizers(self): @@ -66,6 +73,8 @@ def configure_optimizers(self): class SwaTestCallback(StochasticWeightAveraging): update_parameters_calls: int = 0 transfer_weights_calls: int = 0 + # Record the first epoch, as if we are resuming from a checkpoint this may not be equal to 0 + first_epoch: Optional[int] = None def update_parameters(self, *args, **kwargs): self.update_parameters_calls += 1 @@ -77,6 +86,11 @@ def transfer_weights(self, *args, **kwargs): def on_train_epoch_start(self, trainer, *args): super().on_train_epoch_start(trainer, *args) + if self.first_epoch is None and not trainer.fit_loop.restarting: + # since the checkpoint loaded was saved `on_train_epoch_end`, the first `FitLoop` iteration will + # not update the model and just call the epoch-level hooks, for that reason, we check that we are not + # restarting before choosing the first epoch + self.first_epoch = trainer.current_epoch assert trainer.fit_loop._skip_backward == (trainer.current_epoch > self.swa_end) if self.swa_start <= trainer.current_epoch: assert isinstance(trainer.lr_scheduler_configs[0].scheduler, SWALR) @@ -88,6 +102,7 @@ def on_train_epoch_end(self, trainer, *args): if self.swa_start <= trainer.current_epoch <= self.swa_end: swa_epoch = trainer.current_epoch - self.swa_start assert self.n_averaged == swa_epoch + 1 + assert self._swa_scheduler is not None # Scheduler is stepped once on initialization and then at the end of each epoch assert self._swa_scheduler._step_count == swa_epoch + 2 elif trainer.current_epoch > self.swa_end: @@ -103,10 +118,13 @@ def on_train_end(self, trainer, pl_module): if not isinstance(trainer.strategy, DDPSpawnStrategy): # check backward call count. 
the batchnorm update epoch should not backward - assert trainer.strategy.backward.call_count == trainer.max_epochs * trainer.limit_train_batches + assert trainer.strategy.backward.call_count == ( + (trainer.max_epochs - self.first_epoch) * trainer.limit_train_batches + ) # check call counts - assert self.update_parameters_calls == trainer.max_epochs - (self._swa_epoch_start - 1) + first_swa_epoch = max(self.first_epoch, self.swa_start) + assert self.update_parameters_calls == trainer.max_epochs - first_swa_epoch assert self.transfer_weights_calls == 1 @@ -140,7 +158,7 @@ def train_with_swa( devices=devices, ) - with mock.patch.object(Strategy, "backward", wraps=trainer.strategy.backward): + with _backward_patch(trainer): trainer.fit(model) # check the model is the expected @@ -226,9 +244,10 @@ def test_swa_multiple_lrs(tmpdir): class TestModel(BoringModel): def __init__(self): - super(BoringModel, self).__init__() + super().__init__() self.layer1 = torch.nn.Linear(32, 32) self.layer2 = torch.nn.Linear(32, 2) + self.on_train_epoch_start_called = False def forward(self, x): x = self.layer1(x) @@ -255,3 +274,98 @@ def on_train_epoch_start(self): ) trainer.fit(model) assert model.on_train_epoch_start_called + + +def _swa_resume_training_from_checkpoint(tmpdir, model, resume_model, ddp=False): + swa_start = 3 + trainer_kwargs = { + "default_root_dir": tmpdir, + "max_epochs": 5, + "accelerator": "cpu", + "strategy": "ddp_spawn_find_unused_parameters_false" if ddp else None, + "devices": 2 if ddp else 1, + "limit_train_batches": 5, + "limit_val_batches": 0, + "accumulate_grad_batches": 2, + "enable_progress_bar": False, + } + trainer = Trainer(callbacks=SwaTestCallback(swa_epoch_start=swa_start, swa_lrs=0.1), **trainer_kwargs) + + with _backward_patch(trainer), pytest.raises(Exception, match="SWA crash test"): + trainer.fit(model) + + checkpoint_dir = Path(tmpdir) / "lightning_logs" / "version_0" / "checkpoints" + checkpoint_files = os.listdir(checkpoint_dir) + assert 
len(checkpoint_files) == 1 + ckpt_path = str(checkpoint_dir / checkpoint_files[0]) + + trainer = Trainer(callbacks=SwaTestCallback(swa_epoch_start=swa_start, swa_lrs=0.1), **trainer_kwargs) + + with _backward_patch(trainer): + trainer.fit(resume_model, ckpt_path=ckpt_path) + + +class CustomSchedulerModel(SwaTestModel): + def configure_optimizers(self): + optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) + + def lr_lambda(current_step: int): + return 0.1 + + scheduler = LambdaLR(optimizer, lr_lambda, -1) + return { + "optimizer": optimizer, + "lr_scheduler": { + "scheduler": scheduler, + "interval": self.interval, + }, + } + + +@pytest.mark.parametrize("crash_on_epoch", [1, 3]) +def test_swa_resume_training_from_checkpoint(tmpdir, crash_on_epoch): + model = SwaTestModel(crash_on_epoch=crash_on_epoch) + resume_model = SwaTestModel() + _swa_resume_training_from_checkpoint(tmpdir, model, resume_model) + + +@pytest.mark.parametrize("crash_on_epoch", [1, 3]) +def test_swa_resume_training_from_checkpoint_custom_scheduler(tmpdir, crash_on_epoch): + # Reproduces the bug reported in https://github.com/PyTorchLightning/pytorch-lightning/issues/11665 + model = CustomSchedulerModel(crash_on_epoch=crash_on_epoch) + resume_model = CustomSchedulerModel() + _swa_resume_training_from_checkpoint(tmpdir, model, resume_model) + + +@RunIf(skip_windows=True) +def test_swa_resume_training_from_checkpoint_ddp(tmpdir): + model = SwaTestModel(crash_on_epoch=3) + resume_model = SwaTestModel() + _swa_resume_training_from_checkpoint(tmpdir, model, resume_model, ddp=True) + + +@pytest.mark.parametrize( + "strategy", + [ + pytest.param("fsdp", marks=RunIf(fairscale_fully_sharded=True, min_cuda_gpus=1)), + pytest.param("deepspeed", marks=RunIf(deepspeed=True, min_cuda_gpus=1)), + ], +) +def test_misconfiguration_error_with_sharded_model(tmpdir, strategy: str): + model = SwaTestModel() + swa_callback = SwaTestCallback(swa_epoch_start=2, swa_lrs=0.1) + trainer = Trainer( + 
default_root_dir=tmpdir, + enable_progress_bar=False, + max_epochs=5, + callbacks=[swa_callback], + strategy=strategy, + accelerator="gpu", + devices=1, + ) + with pytest.raises(MisconfigurationException, match="SWA does not currently support sharded models"): + trainer.fit(model) + + +def _backward_patch(trainer: Trainer) -> ContextManager: + return mock.patch.object(Strategy, "backward", wraps=trainer.strategy.backward) From 8fa9e8651d044e6122a4380f570fd2451a665f8c Mon Sep 17 00:00:00 2001 From: Luca Medeiros <67411094+luca-medeiros@users.noreply.github.com> Date: Wed, 10 Aug 2022 09:38:18 +0900 Subject: [PATCH 25/59] Update collect env details and issue template (#14017) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- .github/ISSUE_TEMPLATE/bug_report.md | 8 ++++++ requirements/collect_env_details.py | 41 ++++++++++------------------ 2 files changed, 22 insertions(+), 27 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index f08865180ba1d..de4eacde1f39e 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -41,8 +41,16 @@ You can get the script and run it with: ```bash wget https://raw.githubusercontent.com/Lightning-AI/lightning/master/requirements/collect_env_details.py python collect_env_details.py + ``` + +
+ Details + Paste the output here and move this toggle outside of the comment block. +
+ + You can also fill out the list below manually. --> diff --git a/requirements/collect_env_details.py b/requirements/collect_env_details.py index 1d65753a55553..b0c47efc43859 100644 --- a/requirements/collect_env_details.py +++ b/requirements/collect_env_details.py @@ -20,27 +20,17 @@ import platform import sys -import numpy +import pkg_resources import torch -import tqdm sys.path += [os.path.abspath(".."), os.path.abspath("")] -import pytorch_lightning # noqa: E402 -try: - import lightning -except ModuleNotFoundError: - pass -try: - import lightning_app -except ModuleNotFoundError: - pass LEVEL_OFFSET = "\t" KEY_PADDING = 20 -def info_system(): +def info_system() -> dict: return { "OS": platform.system(), "architecture": platform.architecture(), @@ -50,28 +40,24 @@ def info_system(): } -def info_cuda(): +def info_cuda() -> dict: return { - "GPU": [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())], - # 'nvidia_driver': get_nvidia_driver_version(run_lambda), + "GPU": [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())] or None, "available": torch.cuda.is_available(), "version": torch.version.cuda, } -def info_packages(): - return { - "numpy": numpy.__version__, - "pyTorch_version": torch.__version__, - "pyTorch_debug": torch.version.debug, - "pytorch-lightning": pytorch_lightning.__version__, - "lightning": lightning.__version__ if "lightning" in sys.modules else None, - "lightning_app": lightning_app.__version__ if "lightning_app" in sys.modules else None, - "tqdm": tqdm.__version__, - } +def info_packages() -> dict: + """Get name and version of all installed packages.""" + packages = {} + for dist in pkg_resources.working_set: + package = dist.as_requirement() + packages[package.key] = package.specs[0][1] + return packages -def nice_print(details, level=0): +def nice_print(details: dict, level: int = 0) -> list: lines = [] for k in sorted(details): key = f"* {k}:" if level == 0 else f"- {k}:" @@ -88,8 +74,9 @@ def 
nice_print(details, level=0): return lines -def main(): +def main() -> None: details = {"System": info_system(), "CUDA": info_cuda(), "Packages": info_packages()} + details["Lightning"] = {k: v for k, v in details["Packages"].items() if "torch" in k or "lightning" in k} lines = nice_print(details) text = os.linesep.join(lines) print(text) From d211d46e1db3ca4c4c938cafbfe51704f51b8ab4 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 10 Aug 2022 11:35:41 +0900 Subject: [PATCH 26/59] CI: Replace `_` of in GHA workflow filenames with `-` (#13917) * Rename workflow files * Update docs * Fix azure badges * Update the main readme * bad rebase * Update doc --- .actions/setup_tools.py | 5 ++- .github/workflows/README.md | 34 +++++++++---------- ...e2e_test.yml => ci-app-cloud-e2e-test.yml} | 0 ...i-app_examples.yml => ci-app-examples.yml} | 0 .../{ci-app_tests.yml => ci-app-tests.yml} | 0 ...{ci_pkg-install.yml => ci-pkg-install.yml} | 0 ...pr-gatekeeper.yml => ci-pr-gatekeeper.yml} | 0 ...st-conda.yml => ci-pytorch-test-conda.yml} | 0 ...test-full.yml => ci-pytorch-test-full.yml} | 0 ...test-slow.yml => ci-pytorch-test-slow.yml} | 0 .../{ci_schema.yml => ci-schema.yml} | 0 ...h_dockers.yml => cicd-pytorch-dockers.yml} | 0 README.md | 25 ++++++++------ src/pytorch_lightning/README.md | 26 +++++++------- 14 files changed, 46 insertions(+), 44 deletions(-) rename .github/workflows/{ci-app_cloud_e2e_test.yml => ci-app-cloud-e2e-test.yml} (100%) rename .github/workflows/{ci-app_examples.yml => ci-app-examples.yml} (100%) rename .github/workflows/{ci-app_tests.yml => ci-app-tests.yml} (100%) rename .github/workflows/{ci_pkg-install.yml => ci-pkg-install.yml} (100%) rename .github/workflows/{ci_pr-gatekeeper.yml => ci-pr-gatekeeper.yml} (100%) rename .github/workflows/{ci-pytorch_test-conda.yml => ci-pytorch-test-conda.yml} (100%) rename .github/workflows/{ci-pytorch_test-full.yml => ci-pytorch-test-full.yml} (100%) rename .github/workflows/{ci-pytorch_test-slow.yml => 
ci-pytorch-test-slow.yml} (100%) rename .github/workflows/{ci_schema.yml => ci-schema.yml} (100%) rename .github/workflows/{cicd-pytorch_dockers.yml => cicd-pytorch-dockers.yml} (100%) diff --git a/.actions/setup_tools.py b/.actions/setup_tools.py index 5088be2020738..a76e81246798c 100644 --- a/.actions/setup_tools.py +++ b/.actions/setup_tools.py @@ -94,11 +94,10 @@ def load_readme_description(path_dir: str, homepage: str, version: str) -> str: text = text.replace("pytorch-lightning.readthedocs.io/en/stable/", f"pytorch-lightning.readthedocs.io/en/{version}") # codecov badge text = text.replace("/branch/master/graph/badge.svg", f"/release/{version}/graph/badge.svg") - # replace github badges for release ones + # github actions badge text = text.replace("badge.svg?branch=master&event=push", f"badge.svg?tag={version}") - # Azure... + # azure pipelines badge text = text.replace("?branchName=master", f"?branchName=refs%2Ftags%2F{version}") - text = re.sub(r"\?definitionId=\d+&branchName=master", f"?definitionId=2&branchName=refs%2Ftags%2F{version}", text) skip_begin = r"" skip_end = r"" diff --git a/.github/workflows/README.md b/.github/workflows/README.md index f559551e1237f..4ed903c0f3a93 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -4,16 +4,16 @@ ## Unit and Integration Testing -| workflow name | workflow file | action | accelerator\* | (Python, PyTorch) | OS | -| -------------------------- | ----------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | ------------------------------------------------ | ------------------- | -| Test full | .github/workflows/ci_test-full.yml | Run all tests except for accelerator-specific, standalone and slow tests. 
| CPU | (3.7, 1.9), (3.7, 1.12), (3.10, 1.12) | linux, mac, windows | -| Test with Conda | .github/workflows/ci_test-conda.yml | Same as ci_test-full.yml but with dependencies installed with conda. | CPU | (3.8, 1.8), (3.8, 1.9), (3.8, 1.10), (3.9, 1.12) | linux | -| Test slow | .github/workflows/ci_test-slow.yml | Run only slow tests. Slow tests usually need to spawn threads and cannot be speed up or simplified. | CPU | (3.7, 1.8) | linux, mac, windows | -| pytorch-lightning (IPUs) | .azure-pipelines/ipu-tests.yml | Run only IPU-specific tests. | IPU | (3.8, 1.9) | linux | -| pytorch-lightning (HPUs) | .azure-pipelines/hpu-tests.yml | Run only HPU-specific tests. | HPU | (3.8, 1.10) | linux | -| pytorch-lightning (GPUs) | .azure-pipelines/gpu-tests.yml | Run all CPU and GPU-specific tests, standalone, and examples. Each standalone test needs to be run in separate processes to avoid unwanted interactions between test cases. | GPU | (3.9, 1.12) | linux | -| PyTorchLightning.Benchmark | .azure-pipelines/gpu-benchmark.yml | Run speed/memory benchmarks for parity with pure PyTorch. | GPU | (3.9, 1.12) | linux | -| test-on-tpus | .circleci/config.yml | Run only TPU-specific tests. | TPU | (3.7, 1.12) | linux | +| workflow name | workflow file | action | accelerator\* | (Python, PyTorch) | OS | +| -------------------------- | ------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | ------------------------------------------------- | ------------------- | +| Test PyTorch full | .github/workflows/ci-pytorch-test-full.yml | Run all tests except for accelerator-specific, standalone and slow tests. 
| CPU | (3.7, 1.9), (3.7, 1.12), (3.9, 1.9), (3.9, 1.12) | linux, mac, windows | +| Test PyTorch with Conda | .github/workflows/ci-pytorch-test-conda.yml | Same as ci-pytorch-test-full.yml but with dependencies installed with conda. | CPU | (3.8, 1.9), (3.8, 1.10), (3.8, 1.11), (3.9, 1.12) | linux | +| Test slow | .github/workflows/ci-pytorch-test-slow.yml | Run only slow tests. Slow tests usually need to spawn threads and cannot be speed up or simplified. | CPU | (3.7, 1.11) | linux, mac, windows | +| pytorch-lightning (IPUs) | .azure-pipelines/ipu-tests.yml | Run only IPU-specific tests. | IPU | (3.8, 1.9) | linux | +| pytorch-lightning (HPUs) | .azure-pipelines/hpu-tests.yml | Run only HPU-specific tests. | HPU | (3.8, 1.10) | linux | +| pytorch-lightning (GPUs) | .azure-pipelines/gpu-tests.yml | Run all CPU and GPU-specific tests, standalone, and examples. Each standalone test needs to be run in separate processes to avoid unwanted interactions between test cases. | GPU | (3.9, 1.12) | linux | +| PyTorchLightning.Benchmark | .azure-pipelines/gpu-benchmark.yml | Run speed/memory benchmarks for parity with pure PyTorch. | GPU | (3.9, 1.12) | linux | +| test-on-tpus | .circleci/config.yml | Run only TPU-specific tests. | TPU | (3.7, 1.12) | linux | - \*Accelerators used in CI - GPU: 2 x NVIDIA Tesla V100 @@ -33,15 +33,15 @@ | --------------------------------- | ----------------------------------------------------------------------------------------- | | .codecov.yml | Measure test coverage with [codecov.io](https://app.codecov.io/gh/Lightning-AI/lightning) | | .github/workflows/code-checks.yml | Check Python typing with [MyPy](https://mypy.readthedocs.io/en/stable/). | -| .github/workflows/ci_schema.yml | Validate the syntax of workflow files. | +| .github/workflows/ci-schema.yml | Validate the syntax of workflow files. 
| ## Others -| workflow file | action | -| -------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| .github/workflows/ci_dockers.yml | Build docker images used for testing in CI without pushing to the [Docker Hub](https://hub.docker.com/r/pytorchlightning/pytorch_lightning). Publishing these built images takes place in `.github/workflows/release-docker.yml` which only runs in master. | -| .github/workflows/ci_pkg-install.yml | Test if pytorch-lightning is successfully installed using pip. | -| .github/workflows/events-recurrent.yml | Terminate TPU jobs that live more than one hour to avoid possible resource exhaustion due to hangs. | +| workflow file | action | +| ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| .github/workflows/cicd-pytorch-dockers.yml | Build docker images used for testing in CI. If run on nightly schedule, push to the [Docker Hub](https://hub.docker.com/r/pytorchlightning/pytorch_lightning). | +| .github/workflows/ci-pkg-install.yml | Test if pytorch-lightning is successfully installed using pip. | +| .github/workflows/events-recurrent.yml | Terminate TPU jobs that live more than one hour to avoid possible resource exhaustion due to hangs. | ## Deployment @@ -60,4 +60,4 @@ | .github/stale.yml | Close inactive issues/PRs sometimes after adding the "won't fix" label to them. | | .github/workflows/probot-auto-cc.yml, .github/lightning-probot.yml | Notify maintainers of interest depending on labels added to an issue We utilize lightning-probot forked from PyTorch’s probot. 
| | .pre-commit-config.yaml | pre-commit.ci runs a set of linters and formatters, such as black, flake8 and isort. When formatting is applied, the bot pushes a commit with its change. This configuration is also used for running pre-commit locally. | -| .github/workflows/ci_pr-gatekeeper.yml | Prevent PRs from merging into master without any Grid.ai employees’ approval. | +| .github/workflows/ci-pr-gatekeeper.yml | Prevent PRs from merging into master without any Grid.ai employees’ approval. | diff --git a/.github/workflows/ci-app_cloud_e2e_test.yml b/.github/workflows/ci-app-cloud-e2e-test.yml similarity index 100% rename from .github/workflows/ci-app_cloud_e2e_test.yml rename to .github/workflows/ci-app-cloud-e2e-test.yml diff --git a/.github/workflows/ci-app_examples.yml b/.github/workflows/ci-app-examples.yml similarity index 100% rename from .github/workflows/ci-app_examples.yml rename to .github/workflows/ci-app-examples.yml diff --git a/.github/workflows/ci-app_tests.yml b/.github/workflows/ci-app-tests.yml similarity index 100% rename from .github/workflows/ci-app_tests.yml rename to .github/workflows/ci-app-tests.yml diff --git a/.github/workflows/ci_pkg-install.yml b/.github/workflows/ci-pkg-install.yml similarity index 100% rename from .github/workflows/ci_pkg-install.yml rename to .github/workflows/ci-pkg-install.yml diff --git a/.github/workflows/ci_pr-gatekeeper.yml b/.github/workflows/ci-pr-gatekeeper.yml similarity index 100% rename from .github/workflows/ci_pr-gatekeeper.yml rename to .github/workflows/ci-pr-gatekeeper.yml diff --git a/.github/workflows/ci-pytorch_test-conda.yml b/.github/workflows/ci-pytorch-test-conda.yml similarity index 100% rename from .github/workflows/ci-pytorch_test-conda.yml rename to .github/workflows/ci-pytorch-test-conda.yml diff --git a/.github/workflows/ci-pytorch_test-full.yml b/.github/workflows/ci-pytorch-test-full.yml similarity index 100% rename from .github/workflows/ci-pytorch_test-full.yml rename to 
.github/workflows/ci-pytorch-test-full.yml diff --git a/.github/workflows/ci-pytorch_test-slow.yml b/.github/workflows/ci-pytorch-test-slow.yml similarity index 100% rename from .github/workflows/ci-pytorch_test-slow.yml rename to .github/workflows/ci-pytorch-test-slow.yml diff --git a/.github/workflows/ci_schema.yml b/.github/workflows/ci-schema.yml similarity index 100% rename from .github/workflows/ci_schema.yml rename to .github/workflows/ci-schema.yml diff --git a/.github/workflows/cicd-pytorch_dockers.yml b/.github/workflows/cicd-pytorch-dockers.yml similarity index 100% rename from .github/workflows/cicd-pytorch_dockers.yml rename to .github/workflows/cicd-pytorch-dockers.yml diff --git a/README.md b/README.md index 2fef343425f17..9c03e3707ec24 100644 --- a/README.md +++ b/README.md @@ -80,21 +80,24 @@ ______________________________________________________________________ ## Continuous Integration -Lightning is rigorously tested across multiple GPUs, TPUs CPUs and against major Python and PyTorch versions. +Lightning is rigorously tested across multiple CPUs, GPUs, TPUs, IPUs, and HPUs and against major Python and PyTorch versions.
Current build statuses
-| System / PyTorch ver. | 1.8 (LTS, min. req.) | 1.9 | 1.10 (latest) | -| :------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| Linux py3.7 \[GPUs\*\*\] | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=6&branchName=master) | - | - | -| Linux py3.7 \[TPUs\*\*\*\] | [![CircleCI](https://circleci.com/gh/Lightning-AI/lightning/tree/master.svg?style=svg)](https://circleci.com/gh/Lightning-AI/lightning/tree/master) | - | - | -| Linux py3.8 (with Conda | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | -| Linux py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) | -| OSX py3.{7,9} | - | - | 
[![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) | -| Windows py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) | +| System / PyTorch ver. | 1.9 | 1.10 | 1.12 (latest) | +| :------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Linux py3.7 \[GPUs\*\*\] | - | - | - | +| Linux py3.7 \[TPUs\*\*\*\] | [![CircleCI](https://circleci.com/gh/Lightning-AI/lightning/tree/master.svg?style=svg)](https://circleci.com/gh/Lightning-AI/lightning/tree/master) | - | - | +| Linux py3.8 \[IPUs\] | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=25&branchName=master) | - | - | +| Linux py3.8 \[HPUs\] | - | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=26&branchName=master) | - | +| Linux py3.8 (with Conda) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | 
[![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | - | +| Linux py3.9 (with Conda) | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | +| Linux py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | +| OSX py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | +| Windows py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | - _\*\* tests run on two NVIDIA P100_ - _\*\*\* tests run on Google GKE TPUv2/3. 
TPU py3.7 means we support Colab and Kaggle env._ @@ -136,8 +139,8 @@ conda install pytorch-lightning -c conda-forge The actual status of 1.7 \[stable\] is the following: -[![Test PyTorch full](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-full.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-full.yml?query=branch%3Arelease%2Fpytorch) -[![Test PyTorch with Conda](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-conda.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-conda.yml?query=branch%3Arelease%2Fpytorch) +[![Test PyTorch full](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml?query=branch%3Arelease%2Fpytorch) +[![Test PyTorch with Conda](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml?query=branch%3Arelease%2Fpytorch) [![TPU tests](https://dl.circleci.com/status-badge/img/gh/Lightning-AI/lightning/tree/release%2Fpytorch.svg?style=shield)](https://dl.circleci.com/status-badge/redirect/gh/Lightning-AI/lightning/tree/release%2Fpytorch) [![Check Docs](https://github.com/Lightning-AI/lightning/actions/workflows/docs-checks.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/docs-checks.yml?query=branch%3Arelease%2Fpytorch) diff --git a/src/pytorch_lightning/README.md b/src/pytorch_lightning/README.md index eb1a42730b5f0..b57aea6fae147 100644 --- a/src/pytorch_lightning/README.md +++ b/src/pytorch_lightning/README.md @@ -78,17 +78,17 @@ Lightning is rigorously 
tested across multiple CPUs, GPUs, TPUs, IPUs, and HPUs
-| System / PyTorch ver. | 1.9 | 1.10 | 1.12 (latest) | -| :------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| Linux py3.7 \[GPUs\*\*\] | - | - | - | -| Linux py3.7 \[TPUs\*\*\*\] | [![CircleCI](https://circleci.com/gh/Lightning-AI/lightning/tree/master.svg?style=svg)](https://circleci.com/gh/Lightning-AI/lightning/tree/master) | - | - | -| Linux py3.8 \[IPUs\] | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=6&branchName=master) | - | - | -| Linux py3.8 \[HPUs\] | - | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=6&branchName=master) | - | -| Linux py3.8 (with Conda) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | - | -| Linux py3.9 (with Conda) | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | -| Linux py3.{7,9} | - | - | 
[![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) | -| OSX py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) | -| Windows py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) | +| System / PyTorch ver. | 1.9 | 1.10 | 1.12 (latest) | +| :------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Linux py3.7 \[GPUs\*\*\] | - | - | - | +| Linux py3.7 \[TPUs\*\*\*\] | [![CircleCI](https://circleci.com/gh/Lightning-AI/lightning/tree/master.svg?style=svg)](https://circleci.com/gh/Lightning-AI/lightning/tree/master) | - | - | +| Linux py3.8 \[IPUs\] | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=25&branchName=master) | - | - | +| Linux py3.8 \[HPUs\] | - | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=26&branchName=master) | - | +| Linux py3.8 (with Conda) | 
[![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | - | +| Linux py3.9 (with Conda) | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | +| Linux py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | +| OSX py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | +| Windows py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | - _\*\* tests run on two NVIDIA P100_ - _\*\*\* tests run on Google GKE TPUv2/3. 
TPU py3.7 means we support Colab and Kaggle env._ @@ -130,8 +130,8 @@ conda install pytorch-lightning -c conda-forge The actual status of stable is the following: -[![Test PyTorch full](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-full.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-full.yml) -[![Test PyTorch with Conda](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-conda.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-conda.yml) +[![Test PyTorch full](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) +[![Test PyTorch with Conda](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) [![GPU]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=24&branchName=release%2Fpytorch) [![TPU](https://dl.circleci.com/status-badge/img/gh/Lightning-AI/lightning/tree/release%2Fpytorch.svg?style=svg)](https://dl.circleci.com/status-badge/redirect/gh/Lightning-AI/lightning/tree/release%2Fpytorch) [![IPU]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=25&branchName=release%2Fpytorch) From dfda3f384e020ab3955a2cf3fe29dcd831d8a969 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 10 Aug 2022 18:02:54 +0900 Subject: [PATCH 27/59] CI: Update Windows version from 2019 to 2022 (#14129) Update windows --- .github/workflows/ci-app-examples.yml | 2 +- .github/workflows/ci-app-tests.yml | 6 +++--- .github/workflows/ci-pkg-install.yml | 6 +++--- .github/workflows/ci-pytorch-test-full.yml | 2 
+- .github/workflows/ci-pytorch-test-slow.yml | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci-app-examples.yml b/.github/workflows/ci-app-examples.yml index ec8becd5f70d1..01570f59c2c77 100644 --- a/.github/workflows/ci-app-examples.yml +++ b/.github/workflows/ci-app-examples.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-20.04, macOS-11, windows-2019] + os: [ubuntu-20.04, macOS-11, windows-2022] python-version: [3.8] requires: ["oldest", "latest"] diff --git a/.github/workflows/ci-app-tests.yml b/.github/workflows/ci-app-tests.yml index 1678dab257301..fe3cc36dc16d3 100644 --- a/.github/workflows/ci-app-tests.yml +++ b/.github/workflows/ci-app-tests.yml @@ -21,7 +21,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-20.04, macOS-11, windows-2019] + os: [ubuntu-20.04, macOS-11, windows-2022] python-version: [3.8] requires: ["oldest", "latest"] @@ -126,7 +126,7 @@ jobs: # - name: Clone Quick Start Example Repo # uses: actions/checkout@v3 # # TODO: this needs to be git submodule -# if: matrix.os == 'windows-2019' # because the install doesn't work on windows +# if: matrix.os == 'windows-2022' # because the install doesn't work on windows # with: # repository: Lightning-AI/lightning-quick-start # ref: 'main' @@ -134,6 +134,6 @@ jobs: # # - name: Lightning Install quick-start # shell: bash -# if: matrix.os != 'windows-2019' # because the install doesn't work on windows +# if: matrix.os != 'windows-2022' # because the install doesn't work on windows # run: | # python -m lightning install app lightning/quick-start -y diff --git a/.github/workflows/ci-pkg-install.yml b/.github/workflows/ci-pkg-install.yml index 342e027b07cfe..a9fdd36693a67 100644 --- a/.github/workflows/ci-pkg-install.yml +++ b/.github/workflows/ci-pkg-install.yml @@ -33,7 +33,7 @@ jobs: fail-fast: true max-parallel: 1 matrix: - os: [ubuntu-20.04, macOS-11, windows-2019] + os: [ubuntu-20.04, macOS-11, windows-2022] pkg: 
["app", "pytorch"] python-version: [3.8] # , 3.9 @@ -67,7 +67,7 @@ jobs: fail-fast: false # max-parallel: 1 matrix: - os: [ubuntu-20.04, macOS-11, windows-2019] + os: [ubuntu-20.04, macOS-11, windows-2022] pkg: ["", "lightning"] python-version: [3.8] # , 3.9 @@ -100,7 +100,7 @@ jobs: fail-fast: false # max-parallel: 1 matrix: - os: [ubuntu-20.04, macOS-11, windows-2019] + os: [ubuntu-20.04, macOS-11, windows-2022] python-version: [3.8] # , 3.9 steps: diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 445707d340c4b..7409ce25a5128 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -20,7 +20,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-20.04, windows-2019, macOS-11] + os: [ubuntu-20.04, windows-2022, macOS-11] python-version: ["3.7", "3.10"] # minimum, maximum requires: ["oldest", "latest"] release: ["stable"] diff --git a/.github/workflows/ci-pytorch-test-slow.yml b/.github/workflows/ci-pytorch-test-slow.yml index b3756bbe8c2f7..36007d3311451 100644 --- a/.github/workflows/ci-pytorch-test-slow.yml +++ b/.github/workflows/ci-pytorch-test-slow.yml @@ -19,7 +19,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-20.04, windows-2019, macOS-11] + os: [ubuntu-20.04, windows-2022, macOS-11] # same config as '.azure-pipelines/gpu-tests.yml' python-version: ["3.7"] pytorch-version: ["1.11"] From dc8ff5ed2699b2ab9d21ee1ea6270191e290f620 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 10 Aug 2022 11:23:20 +0200 Subject: [PATCH 28/59] Fix device placement when `.cuda()` called without specifying index (#14128) --- src/pytorch_lightning/CHANGELOG.md | 3 +++ .../core/mixins/device_dtype_mixin.py | 10 ++++---- .../utilities/test_dtype_device_mixin.py | 24 ++++++++++++++++++- 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 
8852367a116f6..b405665b9df88 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -82,6 +82,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed a bug that caused `ddp_find_unused_parameters` to be set `False`, whereas the intended default is `True` ([#14095](https://github.com/Lightning-AI/lightning/pull/14095)) +- Fixed the device placement when `LightningModule.cuda()` gets called without specifying a device index and the current cuda device was not 0 ([#14128](https://github.com/Lightning-AI/lightning/pull/14128)) + + ## [1.7.0] - 2022-08-02 ### Added diff --git a/src/pytorch_lightning/core/mixins/device_dtype_mixin.py b/src/pytorch_lightning/core/mixins/device_dtype_mixin.py index 62e81e4839da6..2916d8b07cb4e 100644 --- a/src/pytorch_lightning/core/mixins/device_dtype_mixin.py +++ b/src/pytorch_lightning/core/mixins/device_dtype_mixin.py @@ -116,14 +116,16 @@ def cuda(self, device: Optional[Union[torch.device, int]] = None) -> Self: # ty while being optimized. Arguments: - device: if specified, all parameters will be - copied to that device + device: If specified, all parameters will be copied to that device. If `None`, the current CUDA device + index will be used. 
Returns: Module: self """ - if device is None or isinstance(device, int): - device = torch.device("cuda", index=(device or 0)) + if device is None: + device = torch.device("cuda", torch.cuda.current_device()) + elif isinstance(device, int): + device = torch.device("cuda", index=device) self.__update_properties(device=device) return super().cuda(device=device) diff --git a/tests/tests_pytorch/utilities/test_dtype_device_mixin.py b/tests/tests_pytorch/utilities/test_dtype_device_mixin.py index 38f72b555d52d..7c17b3d9f7642 100644 --- a/tests/tests_pytorch/utilities/test_dtype_device_mixin.py +++ b/tests/tests_pytorch/utilities/test_dtype_device_mixin.py @@ -113,7 +113,7 @@ def test_submodules_multi_gpu_ddp_spawn(tmpdir): ], ) @RunIf(min_cuda_gpus=1) -def test_gpu_cuda_device(device): +def test_cuda_device(device): model = TopModule() model.cuda(device) @@ -122,3 +122,25 @@ def test_gpu_cuda_device(device): assert device.type == "cuda" assert device.index is not None assert device.index == torch.cuda.current_device() + + +@RunIf(min_cuda_gpus=2) +def test_cuda_current_device(): + """Test that calling .cuda() moves the model to the correct device and respects current cuda device setting.""" + + class CudaModule(DeviceDtypeModuleMixin): + def __init__(self): + super().__init__() + self.layer = nn.Linear(1, 1) + + model = CudaModule() + + torch.cuda.set_device(0) + model.cuda(1) + assert model.device == torch.device("cuda", 1) + assert model.layer.weight.device == torch.device("cuda", 1) + + torch.cuda.set_device(1) + model.cuda() # model is already on device 1, and calling .cuda() without device index should not move model + assert model.device == torch.device("cuda", 1) + assert model.layer.weight.device == torch.device("cuda", 1) From ddb476d334f501a655586ae3809587e09f71b9c8 Mon Sep 17 00:00:00 2001 From: Adam Bobowski <100693297+adam-lightning@users.noreply.github.com> Date: Wed, 10 Aug 2022 11:48:06 +0200 Subject: [PATCH 29/59] [App] Application logs in CLI (#13634) 
--- src/lightning_app/CHANGELOG.md | 2 + src/lightning_app/cli/lightning_cli.py | 89 +++++++++++++ src/lightning_app/testing/testing.py | 2 +- src/lightning_app/utilities/app_logs.py | 125 ++++++++++++++++++ .../utilities/logs_socket_api.py | 95 +++++++++++++ tests/tests_app/cli/test_cmd_show_logs.py | 61 +++++++++ tests/tests_app_examples/test_boring_app.py | 15 +++ .../test_collect_failures.py | 1 + tests/tests_app_examples/test_commands.py | 1 + .../test_custom_work_dependencies.py | 2 +- tests/tests_app_examples/test_drive.py | 1 + tests/tests_app_examples/test_idle_timeout.py | 1 + tests/tests_app_examples/test_payload.py | 2 +- tests/tests_app_examples/test_quick_start.py | 2 +- .../test_template_react_ui.py | 1 + .../test_template_streamlit_ui.py | 1 + tests/tests_app_examples/test_v0_app.py | 1 + 17 files changed, 398 insertions(+), 4 deletions(-) create mode 100644 src/lightning_app/utilities/app_logs.py create mode 100644 src/lightning_app/utilities/logs_socket_api.py create mode 100644 tests/tests_app/cli/test_cmd_show_logs.py diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 78a4e370e76ee..ba8cdd796c5bb 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -13,6 +13,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Add support to run Lightning apps on Lightning AI BYOC clusters ([#13894](https://github.com/Lightning-AI/lightning/pull/13894)) - Add support for listing Lightning AI apps ([#13987](https://github.com/Lightning-AI/lightning/pull/13987)) - Adds `LightningTrainingComponent`. 
`LightningTrainingComponent` orchestrates multi-node training in the cloud ([#13830](https://github.com/Lightning-AI/lightning/pull/13830)) +- Add support for printing application logs using CLI `lightning show logs [components]` ([#13634](https://github.com/Lightning-AI/lightning/pull/13634)) + ### Changed diff --git a/src/lightning_app/cli/lightning_cli.py b/src/lightning_app/cli/lightning_cli.py index fb4c40330dfd9..45c80d4dcc357 100644 --- a/src/lightning_app/cli/lightning_cli.py +++ b/src/lightning_app/cli/lightning_cli.py @@ -8,7 +8,9 @@ import click import requests +import rich from requests.exceptions import ConnectionError +from rich.color import ANSI_COLOR_NAMES from lightning_app import __version__ as ver from lightning_app.cli import cmd_init, cmd_install, cmd_pl_init, cmd_react_ui_init @@ -18,12 +20,15 @@ from lightning_app.core.constants import get_lightning_cloud_url, LOCAL_LAUNCH_ADMIN_VIEW from lightning_app.runners.runtime import dispatch from lightning_app.runners.runtime_type import RuntimeType +from lightning_app.utilities.app_logs import _app_logs_reader from lightning_app.utilities.cli_helpers import ( _format_input_env_variables, _retrieve_application_url_and_available_commands, ) +from lightning_app.utilities.cloud import _get_project from lightning_app.utilities.install_components import register_all_external_components from lightning_app.utilities.login import Auth +from lightning_app.utilities.network import LightningClient from lightning_app.utilities.state import headers_for logger = logging.getLogger(__name__) @@ -50,9 +55,93 @@ def main(): @click.version_option(ver) def _main(): register_all_external_components() + + +@_main.group() +def show(): + """Show given resource.""" pass +@show.command() +@click.argument("app_name", required=False) +@click.argument("components", nargs=-1, required=False) +@click.option("-f", "--follow", required=False, is_flag=True, help="Wait for new logs, to exit use CTRL+C.") +def logs(app_name: str, 
components: List[str], follow: bool) -> None: + """Show cloud application logs. By default prints logs for all currently available components. + + Example uses: + + Print all application logs: + + $ lightning show logs my-application + + + Print logs only from the flow (no work): + + $ lightning show logs my-application flow + + + Print logs only from selected works: + + $ lightning show logs my-application root.work_a root.work_b + """ + + client = LightningClient() + project = _get_project(client) + + apps = { + app.name: app + for app in client.lightningapp_instance_service_list_lightningapp_instances(project.project_id).lightningapps + } + + if not apps: + raise click.ClickException( + "You don't have any application in the cloud. Please, run an application first with `--cloud`." + ) + + if not app_name: + raise click.ClickException( + f"You have not specified any Lightning App. Please select one of available: [{', '.join(apps.keys())}]" + ) + + if app_name not in apps: + raise click.ClickException( + f"The Lightning App '{app_name}' does not exist. 
Please select one of following: [{', '.join(apps.keys())}]" + ) + + # Fetch all lightning works from given application + # 'Flow' component is somewhat implicit, only one for whole app, + # and not listed in lightningwork API - so we add it directly to the list + works = client.lightningwork_service_list_lightningwork( + project_id=project.project_id, app_id=apps[app_name].id + ).lightningworks + app_component_names = ["flow"] + [f.name for f in apps[app_name].spec.flow_servers] + [w.name for w in works] + + if not components: + components = app_component_names + + for component in components: + if component not in app_component_names: + raise click.ClickException(f"Component '{component}' does not exist in app {app_name}.") + + log_reader = _app_logs_reader( + client=client, + project_id=project.project_id, + app_id=apps[app_name].id, + component_names=components, + follow=follow, + ) + + rich_colors = list(ANSI_COLOR_NAMES) + colors = {c: rich_colors[i + 1] for i, c in enumerate(components)} + + for component_name, log_event in log_reader: + date = log_event.timestamp.strftime("%m/%d/%Y %H:%M:%S") + color = colors[component_name] + rich.print(f"[{color}]{component_name}[/{color}] {date} {log_event.message}") + + @_main.command() def login(): """Log in to your Lightning.ai account.""" diff --git a/src/lightning_app/testing/testing.py b/src/lightning_app/testing/testing.py index e1cc2e180dab5..74d57db38c427 100644 --- a/src/lightning_app/testing/testing.py +++ b/src/lightning_app/testing/testing.py @@ -318,7 +318,7 @@ def fetch_logs() -> str: ) try: - yield admin_page, view_page, fetch_logs + yield admin_page, view_page, fetch_logs, name except KeyboardInterrupt: pass finally: diff --git a/src/lightning_app/utilities/app_logs.py b/src/lightning_app/utilities/app_logs.py new file mode 100644 index 0000000000000..4a7af9b5c5143 --- /dev/null +++ b/src/lightning_app/utilities/app_logs.py @@ -0,0 +1,125 @@ +import json +import queue +import sys +from dataclasses import 
dataclass +from datetime import datetime, timedelta +from json import JSONDecodeError +from threading import Thread +from typing import Iterator, List, Optional, Tuple + +import dateutil.parser +from websocket import WebSocketApp + +from lightning_app.utilities.logs_socket_api import _LightningLogsSocketAPI +from lightning_app.utilities.network import LightningClient + + +@dataclass +class _LogEventLabels: + app: str + container: str + filename: str + job: str + namespace: str + node_name: str + pod: str + stream: Optional[str] = None + + +@dataclass +class _LogEvent: + message: str + timestamp: datetime + labels: _LogEventLabels + + +def _push_logevents_to_read_queue_callback(component_name: str, read_queue: queue.PriorityQueue): + """Pushes _LogEvents from websocket to read_queue. + + Returns callback function used with `on_message_callback` of websocket.WebSocketApp. + """ + + def callback(ws_app: WebSocketApp, msg: str): + # We strongly trust that the contract on API will hold atm :D + event_dict = json.loads(msg) + labels = _LogEventLabels(**event_dict["labels"]) + if "message" in event_dict: + event = _LogEvent( + message=event_dict["message"], + timestamp=dateutil.parser.isoparse(event_dict["timestamp"]), + labels=labels, + ) + read_queue.put((event.timestamp, component_name, event)) + + return callback + + +def _error_callback(ws_app: WebSocketApp, error: Exception): + errors = { + KeyError: "Malformed log message, missing key", + JSONDecodeError: "Malformed log message", + TypeError: "Malformed log format", + ValueError: "Malformed date format", + } + print(f"Error while reading logs ({errors.get(type(error), 'Unknown')})", file=sys.stderr) + ws_app.close() + + +def _app_logs_reader( + client: LightningClient, project_id: str, app_id: str, component_names: List[str], follow: bool +) -> Iterator[Tuple[str, _LogEvent]]: + + read_queue = queue.PriorityQueue() + logs_api_client = _LightningLogsSocketAPI(client.api_client) + + # We will use a socket per 
component + log_sockets = [ + logs_api_client.create_lightning_logs_socket( + project_id=project_id, + app_id=app_id, + component=component_name, + on_message_callback=_push_logevents_to_read_queue_callback(component_name, read_queue), + on_error_callback=_error_callback, + ) + for component_name in component_names + ] + + # And each socket on separate thread pushing log event to print queue + # run_forever() will run until we close() the connection from outside + log_threads = [Thread(target=work.run_forever) for work in log_sockets] + + # Establish connection and begin pushing logs to the print queue + for th in log_threads: + th.start() + + user_log_start = "<<< BEGIN USER_RUN_FLOW SECTION >>>" + start_timestamp = None + + # Print logs from queue when log event is available + try: + while True: + _, component_name, log_event = read_queue.get(timeout=None if follow else 1.0) + log_event: _LogEvent + + if user_log_start in log_event.message: + start_timestamp = log_event.timestamp + timedelta(seconds=0.5) + + if start_timestamp and log_event.timestamp > start_timestamp: + yield component_name, log_event + + except queue.Empty: + # Empty is raised by queue.get if timeout is reached. Follow = False case. + pass + + except KeyboardInterrupt: + # User pressed CTRL+C to exit, we should respect that + pass + + finally: + # Close connections - it will cause run_forever() to finish -> the thread finishes as well + for socket in log_sockets: + socket.close() + + # Because all sockets were closed, we can just wait for threads to finish. 
+ for th in log_threads: + th.join() diff --git a/src/lightning_app/utilities/logs_socket_api.py b/src/lightning_app/utilities/logs_socket_api.py new file mode 100644 index 0000000000000..0ab9a5c24f3e5 --- /dev/null +++ b/src/lightning_app/utilities/logs_socket_api.py @@ -0,0 +1,95 @@ +from typing import Callable, Optional +from urllib.parse import urlparse + +from lightning_cloud.openapi import ApiClient, AuthServiceApi, V1LoginRequest +from websocket import WebSocketApp + +from lightning_app.utilities.login import Auth + + +class _LightningLogsSocketAPI: + def __init__(self, api_client: ApiClient): + self.api_client = api_client + self._auth = Auth() + self._auth.authenticate() + self._auth_service = AuthServiceApi(api_client) + + def _get_api_token(self) -> str: + token_resp = self._auth_service.auth_service_login( + body=V1LoginRequest( + username=self._auth.username, + api_key=self._auth.api_key, + ) + ) + return token_resp.token + + @staticmethod + def _socket_url(host: str, project_id: str, app_id: str, token: str, component: str) -> str: + return ( + f"wss://{host}/v1/projects/{project_id}/appinstances/{app_id}/logs?" + f"token={token}&component={component}&follow=true" + ) + + def create_lightning_logs_socket( + self, + project_id: str, + app_id: str, + component: str, + on_message_callback: Callable[[WebSocketApp, str], None], + on_error_callback: Optional[Callable[[Exception, str], None]] = None, + ) -> WebSocketApp: + """Creates and returns WebSocketApp to listen to lightning app logs. + + .. code-block:: python + # Synchronous reading, run_forever() is blocking + + + def print_log_msg(ws_app, msg): + print(msg) + + + flow_logs_socket = client.create_lightning_logs_socket("project_id", "app_id", "flow", print_log_msg) + flow_logs_socket.run_forever() + + .. 
code-block:: python + # Asynchronous reading (with Threads) + + + def print_log_msg(ws_app, msg): + print(msg) + + + flow_logs_socket = client.create_lightning_logs_socket("project_id", "app_id", "flow", print_log_msg) + work_logs_socket = client.create_lightning_logs_socket("project_id", "app_id", "work_1", print_log_msg) + + flow_logs_thread = Thread(target=flow_logs_socket.run_forever) + work_logs_thread = Thread(target=work_logs_socket.run_forever) + + flow_logs_thread.start() + work_logs_thread.start() + # ....... + + flow_logs_socket.close() + work_logs_socket.close() + + Arguments: + project_id: Project ID. + app_id: Application ID. + component: Component name eg flow. + on_message_callback: Callback object which is called when received data. + on_error_callback: Callback object which is called when we get error. + + Returns: + WebSocketApp of the wanted socket + """ + _token = self._get_api_token() + clean_ws_host = urlparse(self.api_client.configuration.host).netloc + socket_url = self._socket_url( + host=clean_ws_host, + project_id=project_id, + app_id=app_id, + token=_token, + component=component, + ) + + return WebSocketApp(socket_url, on_message=on_message_callback, on_error=on_error_callback) diff --git a/tests/tests_app/cli/test_cmd_show_logs.py b/tests/tests_app/cli/test_cmd_show_logs.py new file mode 100644 index 0000000000000..0dc06025151fa --- /dev/null +++ b/tests/tests_app/cli/test_cmd_show_logs.py @@ -0,0 +1,61 @@ +from unittest import mock + +from click.testing import CliRunner + +from lightning_app.cli.lightning_cli import logs + + +@mock.patch("lightning_app.cli.lightning_cli.LightningClient") +@mock.patch("lightning_app.cli.lightning_cli._get_project") +def test_show_logs_errors(project, client): + """Test that the CLI prints the errors for the show logs command.""" + + runner = CliRunner() + + # Response prep + app = mock.MagicMock() + app.name = "MyFakeApp" + work = mock.MagicMock() + work.name = "MyFakeWork" + flow = mock.MagicMock() + 
flow.name = "MyFakeFlow" + + # No apps ever run + apps = {} + client.return_value.lightningapp_instance_service_list_lightningapp_instances.return_value.lightningapps = apps + + result = runner.invoke(logs, ["NonExistentApp"]) + + assert result.exit_code == 1 + assert "Error: You don't have any application in the cloud" in result.output + + # App not specified + apps = {app} + client.return_value.lightningapp_instance_service_list_lightningapp_instances.return_value.lightningapps = apps + + result = runner.invoke(logs) + + assert result.exit_code == 1 + assert "Please select one of available: [MyFakeApp]" in str(result.output) + + # App does not exist + apps = {app} + client.return_value.lightningapp_instance_service_list_lightningapp_instances.return_value.lightningapps = apps + + result = runner.invoke(logs, ["ThisAppDoesNotExist"]) + + assert result.exit_code == 1 + assert "The Lightning App 'ThisAppDoesNotExist' does not exist." in str(result.output) + + # Component does not exist + apps = {app} + works = {work} + flows = {flow} + client.return_value.lightningapp_instance_service_list_lightningapp_instances.return_value.lightningapps = apps + client.return_value.lightningwork_service_list_lightningwork.return_value.lightningworks = works + app.spec.flow_servers = flows + + result = runner.invoke(logs, ["MyFakeApp", "NonExistentComponent"]) + + assert result.exit_code == 1 + assert "Component 'NonExistentComponent' does not exist in app MyFakeApp." 
in result.output diff --git a/tests/tests_app_examples/test_boring_app.py b/tests/tests_app_examples/test_boring_app.py index 1f681260de5c2..f8143b1db1a88 100644 --- a/tests/tests_app_examples/test_boring_app.py +++ b/tests/tests_app_examples/test_boring_app.py @@ -1,8 +1,10 @@ import os import pytest +from click.testing import CliRunner from tests_app import _PROJECT_ROOT +from lightning_app.cli.lightning_cli import logs from lightning_app.testing.testing import run_app_in_cloud, wait_for @@ -12,6 +14,7 @@ def test_boring_app_example_cloud() -> None: _, view_page, _, + name, ): def check_hello_there(*_, **__): @@ -21,3 +24,15 @@ def check_hello_there(*_, **__): return True wait_for(view_page, check_hello_there) + + runner = CliRunner() + result = runner.invoke(logs, [name]) + lines = result.output.splitlines() + + assert result.exit_code == 0 + assert result.exception is None + assert len(lines) > 1, result.output + # We know that at some point we need to install lightning, so we check for that + assert any( + "Successfully built lightning" in line for line in lines + ), f"Did not find logs with lightning installation: {result.output}" diff --git a/tests/tests_app_examples/test_collect_failures.py b/tests/tests_app_examples/test_collect_failures.py index f263ebb1a9f58..c149211e10774 100644 --- a/tests/tests_app_examples/test_collect_failures.py +++ b/tests/tests_app_examples/test_collect_failures.py @@ -26,6 +26,7 @@ def test_collect_failures_example_cloud() -> None: _, _, fetch_logs, + _, ): last_found_log_index = -1 while len(expected_logs) != 0: diff --git a/tests/tests_app_examples/test_commands.py b/tests/tests_app_examples/test_commands.py index 5116b1b9d54bb..266f0305c7604 100644 --- a/tests/tests_app_examples/test_commands.py +++ b/tests/tests_app_examples/test_commands.py @@ -16,6 +16,7 @@ def test_commands_example_cloud() -> None: admin_page, _, fetch_logs, + _, ): app_id = admin_page.url.split("/")[-1] cmd = f"lightning trigger_with_client_command 
--name=something --app_id {app_id}" diff --git a/tests/tests_app_examples/test_custom_work_dependencies.py b/tests/tests_app_examples/test_custom_work_dependencies.py index 8390233e2eee3..d7c9db5ef610a 100644 --- a/tests/tests_app_examples/test_custom_work_dependencies.py +++ b/tests/tests_app_examples/test_custom_work_dependencies.py @@ -13,7 +13,7 @@ def test_custom_work_dependencies_example_cloud() -> None: with run_app_in_cloud( os.path.join(_PROJECT_ROOT, "tests/tests_app_examples/custom_work_dependencies/"), app_name="app.py", - ) as (_, _, fetch_logs): + ) as (_, _, fetch_logs, _): has_logs = False while not has_logs: for log in fetch_logs(): diff --git a/tests/tests_app_examples/test_drive.py b/tests/tests_app_examples/test_drive.py index 9cebca9cf1072..14efc3458716e 100644 --- a/tests/tests_app_examples/test_drive.py +++ b/tests/tests_app_examples/test_drive.py @@ -13,6 +13,7 @@ def test_drive_example_cloud() -> None: _, view_page, fetch_logs, + _, ): has_logs = False diff --git a/tests/tests_app_examples/test_idle_timeout.py b/tests/tests_app_examples/test_idle_timeout.py index fb58a83aefc93..a39ae3f693f7a 100644 --- a/tests/tests_app_examples/test_idle_timeout.py +++ b/tests/tests_app_examples/test_idle_timeout.py @@ -13,6 +13,7 @@ def test_idle_timeout_example_cloud() -> None: _, _, fetch_logs, + _, ): has_logs = False while not has_logs: diff --git a/tests/tests_app_examples/test_payload.py b/tests/tests_app_examples/test_payload.py index 28d2391c18a2a..58fc28a4a8d3c 100644 --- a/tests/tests_app_examples/test_payload.py +++ b/tests/tests_app_examples/test_payload.py @@ -9,7 +9,7 @@ @pytest.mark.cloud def test_payload_example_cloud() -> None: - with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "examples/app_payload")) as (_, _, fetch_logs): + with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "examples/app_payload")) as (_, _, fetch_logs, _): has_logs = False while not has_logs: diff --git a/tests/tests_app_examples/test_quick_start.py 
b/tests/tests_app_examples/test_quick_start.py index 9db693a5dc3d6..454c1084ca1bb 100644 --- a/tests/tests_app_examples/test_quick_start.py +++ b/tests/tests_app_examples/test_quick_start.py @@ -51,7 +51,7 @@ def test_quick_start_example(caplog, monkeypatch): @pytest.mark.cloud def test_quick_start_example_cloud() -> None: - with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "lightning-quick-start/")) as (_, view_page, _): + with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "lightning-quick-start/")) as (_, view_page, _, _): def click_gradio_demo(*_, **__): button = view_page.locator('button:has-text("Interactive demo")') diff --git a/tests/tests_app_examples/test_template_react_ui.py b/tests/tests_app_examples/test_template_react_ui.py index 2e348035fe6e5..4b4588d2397e5 100644 --- a/tests/tests_app_examples/test_template_react_ui.py +++ b/tests/tests_app_examples/test_template_react_ui.py @@ -14,6 +14,7 @@ def test_template_react_ui_example_cloud() -> None: _, view_page, fetch_logs, + _, ): def click_button(*_, **__): diff --git a/tests/tests_app_examples/test_template_streamlit_ui.py b/tests/tests_app_examples/test_template_streamlit_ui.py index a8ba93794f2a0..e2c33305298f7 100644 --- a/tests/tests_app_examples/test_template_streamlit_ui.py +++ b/tests/tests_app_examples/test_template_streamlit_ui.py @@ -14,6 +14,7 @@ def test_template_streamlit_ui_example_cloud() -> None: _, view_page, fetch_logs, + _, ): def click_button(*_, **__): diff --git a/tests/tests_app_examples/test_v0_app.py b/tests/tests_app_examples/test_v0_app.py index d34a92d6102f8..acc9e285c4d79 100644 --- a/tests/tests_app_examples/test_v0_app.py +++ b/tests/tests_app_examples/test_v0_app.py @@ -74,5 +74,6 @@ def test_v0_app_example_cloud() -> None: _, view_page, fetch_logs, + _, ): run_v0_app(fetch_logs, view_page) From d5f35ece72fd253adeb8e9947fd9be4a5992f8f8 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 10 Aug 2022 19:37:50 +0900 Subject: [PATCH 30/59] CI/CD: Add CUDA version to 
docker image tags (#13831) * append cuda version to tags * revertme: push to hub * Update docker readme * Build base-conda-py3.9-torch1.12-cuda11.3.1 * Use new images in conda tests * revertme: push to hub * Revert "revertme: push to hub" This reverts commit 0f7d534b2ae41e4bd227961a929c333c88e35f59. * Revert "revertme: push to hub" This reverts commit 46a05fccbb9b596aa98d5d68424917b5811c5b4f. * Run conda if workflow edited * Run gpu testing if workflow edited * Use new tags in release/Dockerfile * Build base-cuda and PL release images with all combinations * Update release docker * Update conda from py3.9-torch1.12 to py3.10-torch.1.12 * Fix ubuntu version * Revert conda * revertme: push to hub * Don't build Python 3.10 for now... * Fix pl release builder * updating version contribute to the error? https://github.com/docker/buildx/issues/456 * Update actions' versions * Update slack user to notify * Don't use 11.6.0 to avoid bagua incompatibility * Don't use 11.1, and use 11.1.1 * Update .github/workflows/ci-pytorch_test-conda.yml Co-authored-by: Luca Medeiros <67411094+luca-medeiros@users.noreply.github.com> * Update trigger * Ignore artfacts from tutorials * Trim docker images to distribute * Add an image for tutorials * Update conda image 3.8x1.10 * Try different conda variants * No need to set cuda for conda jobs * Update who to notify ipu failure * Don't push * update filenaem Co-authored-by: Luca Medeiros <67411094+luca-medeiros@users.noreply.github.com> --- .azure/gpu-benchmark.yml | 2 +- .azure/gpu-tests.yml | 4 +- .github/workflows/ci-pytorch-test-conda.yml | 4 +- .github/workflows/cicd-pytorch-dockers.yml | 80 +++++++++++---------- .github/workflows/release-docker.yml | 31 +++++--- .gitignore | 6 ++ dockers/README.md | 45 +++--------- dockers/release/Dockerfile | 3 +- 8 files changed, 87 insertions(+), 88 deletions(-) diff --git a/.azure/gpu-benchmark.yml b/.azure/gpu-benchmark.yml index ac5ca6f60a6b4..0de590f2c54a6 100644 --- a/.azure/gpu-benchmark.yml 
+++ b/.azure/gpu-benchmark.yml @@ -28,7 +28,7 @@ jobs: cancelTimeoutInMinutes: "2" pool: azure-jirka-spot container: - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.3.1" options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g" workspace: clean: all diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index f37c17613affc..68ba6974a3527 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -26,7 +26,7 @@ jobs: strategy: matrix: 'PyTorch - stable': - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.3.1" # how long to run the job before automatically cancelling timeoutInMinutes: "80" # how much time to give 'run always even if cancelled tasks' before stopping them @@ -44,7 +44,7 @@ jobs: - bash: | CHANGED_FILES=$(git diff --name-status origin/master -- . | awk '{print $2}') - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.azure/*' + FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.azure/gpu-tests.yml' echo $CHANGED_FILES > changed_files.txt MATCHES=$(cat changed_files.txt | grep -E $FILTER) echo $MATCHES diff --git a/.github/workflows/ci-pytorch-test-conda.yml b/.github/workflows/ci-pytorch-test-conda.yml index 777ec2af759a0..2bbdb699c2c1e 100644 --- a/.github/workflows/ci-pytorch-test-conda.yml +++ b/.github/workflows/ci-pytorch-test-conda.yml @@ -22,13 +22,11 @@ jobs: strategy: fail-fast: false matrix: - # nightly: add when there's a release candidate include: - {python-version: "3.8", pytorch-version: "1.9"} - {python-version: "3.8", pytorch-version: "1.10"} - {python-version: "3.9", pytorch-version: "1.11"} - {python-version: "3.9", pytorch-version: "1.12"} - timeout-minutes: 30 steps: @@ -45,7 +43,7 @@ jobs: id: skip shell: bash -l {0} run: | - 
FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' + FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.github/workflows/ci-pytorch-test-conda.yml' echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt MATCHES=$(cat changed_files.txt | grep -E $FILTER) echo $MATCHES diff --git a/.github/workflows/cicd-pytorch-dockers.yml b/.github/workflows/cicd-pytorch-dockers.yml index a6ba2ac4aa5f4..84051cafd82d8 100644 --- a/.github/workflows/cicd-pytorch-dockers.yml +++ b/.github/workflows/cicd-pytorch-dockers.yml @@ -29,17 +29,22 @@ jobs: strategy: fail-fast: false matrix: - # the config used in '.azure-pipelines/gpu-tests.yml' since the Dockerfile uses the cuda image - python_version: ["3.9"] - pytorch_version: ["1.12"] + include: + # We only release one docker image per PyTorch version. + # The matrix here is the same as the one in release-docker.yml. + - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} + - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} + CUDA_VERSION=${{ matrix.cuda_version }} file: dockers/release/Dockerfile push: false # pushed in release-docker.yml only when PL is released timeout-minutes: 50 @@ -53,14 +58,14 @@ jobs: python_version: ["3.7"] xla_version: ["1.12"] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 - - uses: docker/login-action@v1 + - uses: docker/login-action@v2 if: env.PUSH_TO_HUB == 'true' 
with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} @@ -85,30 +90,31 @@ jobs: fail-fast: false matrix: include: - # the config used in '.azure-pipelines/gpu-tests.yml' - - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1", ubuntu_version: "20.04"} - # latest (used in Tutorials) - - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1", ubuntu_version: "20.04"} - - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.1.1", ubuntu_version: "20.04"} - - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"} + # These are the base images for PL release docker images, + # so include at least all of the combinations in release-dockers.yml. + - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} + - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"} + # Used in Lightning-AI/tutorials + - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 - - uses: docker/login-action@v1 + - uses: docker/login-action@v2 if: env.PUSH_TO_HUB == 'true' with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} CUDA_VERSION=${{ matrix.cuda_version }} - UBUNTU_VERSION=${{ matrix.ubuntu_version }} file: dockers/base-cuda/Dockerfile push: ${{ env.PUSH_TO_HUB }} - tags: 
pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }} timeout-minutes: 95 - uses: ravsamhq/notify-slack-action@v1 if: failure() && env.PUSH_TO_HUB == 'true' @@ -126,25 +132,23 @@ jobs: fail-fast: false matrix: include: - - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"} - - {python_version: "3.8", pytorch_version: "1.10", cuda_version: "11.1.1"} - - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} - # nightly: add when there's a release candidate - # - {python_version: "3.9", pytorch_version: "1.12"} + - {python_version: "3.8", pytorch_version: "1.9"} + - {python_version: "3.8", pytorch_version: "1.10"} + - {python_version: "3.9", pytorch_version: "1.11"} + - {python_version: "3.9", pytorch_version: "1.12"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 - - uses: docker/login-action@v1 + - uses: docker/login-action@v2 if: env.PUSH_TO_HUB == 'true' with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} - CUDA_VERSION=${{ matrix.cuda_version }} file: dockers/base-conda/Dockerfile push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} @@ -168,14 +172,14 @@ jobs: # the config used in 'dockers/ci-runner-ipu/Dockerfile' - {python_version: "3.9", pytorch_version: "1.9"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 - - uses: docker/login-action@v1 + - uses: docker/login-action@v2 if: env.PUSH_TO_HUB == 
'true' with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} @@ -184,7 +188,7 @@ jobs: push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:base-ipu-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} timeout-minutes: 100 - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} @@ -199,7 +203,7 @@ jobs: status: ${{ job.status }} token: ${{ secrets.GITHUB_TOKEN }} notification_title: ${{ format('IPU; {0} py{1} for *{2}*', runner.os, matrix.python_version, matrix.pytorch_version) }} - message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U01BULUS2BG>' # SeanNaren + message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U01GD29QCAV>' # kaushikb11 env: SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} @@ -212,14 +216,14 @@ jobs: # the config used in 'dockers/ci-runner-hpu/Dockerfile' - {gaudi_version: "1.5.0", pytorch_version: "1.11.0"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 - - uses: docker/login-action@v1 + - uses: docker/login-action@v2 if: env.PUSH_TO_HUB == 'true' with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | DIST=latest @@ -243,10 +247,10 @@ jobs: runs-on: ubuntu-20.04 steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Build Conda Docker # publish master/release - uses: docker/build-push-action@v2 + uses: docker/build-push-action@v3 with: file: dockers/nvidia/Dockerfile push: false diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml index 
9d87f1a582fb1..6901a24204683 100644 --- a/.github/workflows/release-docker.yml +++ b/.github/workflows/release-docker.yml @@ -1,6 +1,5 @@ name: Docker -# https://www.docker.com/blog/first-docker-github-action-is-here -# https://github.com/docker/build-push-action + on: push: branches: [master, "release/*"] @@ -15,8 +14,12 @@ jobs: strategy: fail-fast: false matrix: - python_version: ["3.7", "3.8", "3.9"] - pytorch_version: ["1.9", "1.10"] + include: + # We only release one docker image per PyTorch version. + - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} + - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"} steps: - name: Checkout uses: actions/checkout@v2 @@ -32,19 +35,29 @@ jobs: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} dockerfile: dockers/release/Dockerfile - build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }} - tags: "${{ steps.get_version.outputs.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }},latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}" + build_args: | + PYTHON_VERSION=${{ matrix.python_version }} + PYTORCH_VERSION=${{ matrix.pytorch_version }} + CUDA_VERSION=${{ matrix.cuda_version }} + LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }} + tags: | + ${{ steps.get_version.outputs.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }} + latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }} timeout-minutes: 55 - name: Publish Latest to Docker uses: docker/build-push-action@v1.1.0 - # only on releases and latest Python 
and PyTorch - if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.10' + # Only latest Python and PyTorch + if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.12' with: repository: pytorchlightning/pytorch_lightning username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} dockerfile: dockers/release/Dockerfile - build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }} + build_args: | + PYTHON_VERSION=${{ matrix.python_version }} + PYTORCH_VERSION=${{ matrix.pytorch_version }} + CUDA_VERSION=${{ matrix.cuda_version }} + LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }} tags: "latest" timeout-minutes: 55 diff --git a/.gitignore b/.gitignore index 719f291a492ca..259d9f271189c 100644 --- a/.gitignore +++ b/.gitignore @@ -165,3 +165,9 @@ hars* artifacts/* *docs/examples* *docs/source-app/api* + +# tutorials +our_model.tar +test.png +saved_models +data/ diff --git a/dockers/README.md b/dockers/README.md index 533c85739f528..b1ff9826b6c1f 100644 --- a/dockers/README.md +++ b/dockers/README.md @@ -1,36 +1,17 @@ # Docker images -## Builds images form attached Dockerfiles +## Build images from Dockerfiles You can build it on your own, note it takes lots of time, be prepared. ```bash -git clone -docker image build -t pytorch-lightning:latest -f dockers/conda/Dockerfile . -``` - -or with specific arguments - -```bash -git clone -docker image build \ - -t pytorch-lightning:base-cuda-py3.9-pt1.10 \ - -f dockers/base-cuda/Dockerfile \ - --build-arg PYTHON_VERSION=3.9 \ - --build-arg PYTORCH_VERSION=1.10 \ - . -``` +git clone https://github.com/Lightning-AI/lightning.git -or nightly version from Conda +# build with the default arguments +docker image build -t pytorch-lightning:latest -f dockers/base-cuda/Dockerfile . 
-```bash -git clone -docker image build \ - -t pytorch-lightning:base-conda-py3.9-pt1.11 \ - -f dockers/base-conda/Dockerfile \ - --build-arg PYTHON_VERSION=3.9 \ - --build-arg PYTORCH_VERSION=1.11 \ - . +# build with specific arguments +docker image build -t pytorch-lightning:base-cuda-py3.9-torch1.11-cuda11.3.1 -f dockers/base-cuda/Dockerfile --build-arg PYTHON_VERSION=3.9 --build-arg PYTORCH_VERSION=1.11 --build-arg CUDA_VERSION=11.3.1 . ``` To run your docker use @@ -49,7 +30,7 @@ docker image rm pytorch-lightning:latest ## Run docker image with GPUs -To run docker image with access to you GPUs you need to install +To run docker image with access to your GPUs, you need to install ```bash # Add the package repositories @@ -61,10 +42,10 @@ sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit sudo systemctl restart docker ``` -and later run the docker image with `--gpus all` so for example +and later run the docker image with `--gpus all`. For example, ``` -docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.10 +docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11-cuda11.3.1 ``` ## Run Jupyter server @@ -73,15 +54,11 @@ Inspiration comes from https://u.group/thinking/how-to-put-jupyter-notebooks-in- 1. Build the docker image: ```bash - docker image build \ - -t pytorch-lightning:v1.3.1 \ - -f dockers/nvidia/Dockerfile \ - --build-arg LIGHTNING_VERSION=1.3.1 \ - . + docker image build -t pytorch-lightning:v1.6.5 -f dockers/nvidia/Dockerfile --build-arg LIGHTNING_VERSION=1.6.5 . ``` 1. start the server and map ports: ```bash - docker run --rm -it --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all -p 8888:8888 pytorch-lightning:v1.3.1 + docker run --rm -it --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all -p 8888:8888 pytorch-lightning:v1.6.5 ``` 1. Connect in local browser: - copy the generated path e.g. 
`http://hostname:8888/?token=0719fa7e1729778b0cec363541a608d5003e26d4910983c6` diff --git a/dockers/release/Dockerfile b/dockers/release/Dockerfile index cb393c91dfbe0..c39e66509188c 100644 --- a/dockers/release/Dockerfile +++ b/dockers/release/Dockerfile @@ -14,8 +14,9 @@ ARG PYTHON_VERSION=3.9 ARG PYTORCH_VERSION=1.11 +ARG CUDA_VERSION=11.3.1 -FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION} +FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION}-cuda${CUDA_VERSION} LABEL maintainer="Lightning-AI " From 2f7daac4b80bc13135f7e14dffcdd0bd3d50a654 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Wed, 10 Aug 2022 13:17:29 +0200 Subject: [PATCH 31/59] Use websockets in e2es (#14138) --- src/lightning_app/cli/lightning_cli.py | 6 +- src/lightning_app/testing/testing.py | 72 ++++++++++++------- src/lightning_app/utilities/app_logs.py | 41 +++++++---- tests/tests_app/utilities/test_app_logs.py | 11 +++ tests/tests_app_examples/test_commands.py | 2 +- .../test_custom_work_dependencies.py | 2 +- tests/tests_app_examples/test_drive.py | 4 +- tests/tests_app_examples/test_idle_timeout.py | 2 +- tests/tests_app_examples/test_payload.py | 2 +- tests/tests_app_examples/test_v0_app.py | 2 +- 10 files changed, 97 insertions(+), 47 deletions(-) create mode 100644 tests/tests_app/utilities/test_app_logs.py diff --git a/src/lightning_app/cli/lightning_cli.py b/src/lightning_app/cli/lightning_cli.py index 45c80d4dcc357..babe0aa2b2abc 100644 --- a/src/lightning_app/cli/lightning_cli.py +++ b/src/lightning_app/cli/lightning_cli.py @@ -136,10 +136,10 @@ def logs(app_name: str, components: List[str], follow: bool) -> None: rich_colors = list(ANSI_COLOR_NAMES) colors = {c: rich_colors[i + 1] for i, c in enumerate(components)} - for component_name, log_event in log_reader: + for log_event in log_reader: date = log_event.timestamp.strftime("%m/%d/%Y %H:%M:%S") - color = colors[component_name] - 
rich.print(f"[{color}]{component_name}[/{color}] {date} {log_event.message}") + color = colors[log_event.component_name] + rich.print(f"[{color}]{log_event.component_name}[/{color}] {date} {log_event.message}") @_main.command() diff --git a/src/lightning_app/testing/testing.py b/src/lightning_app/testing/testing.py index 74d57db38c427..884c02a0521c1 100644 --- a/src/lightning_app/testing/testing.py +++ b/src/lightning_app/testing/testing.py @@ -1,26 +1,30 @@ import asyncio import json +import logging import os import shutil import subprocess import sys import tempfile import time +import traceback from contextlib import contextmanager from subprocess import Popen from time import sleep -from typing import Any, Callable, Dict, Generator, List, Type +from typing import Any, Callable, Dict, Generator, List, Optional, Type import requests from lightning_cloud.openapi.rest import ApiException from requests import Session from rich import print +from rich.color import ANSI_COLOR_NAMES from lightning_app import LightningApp, LightningFlow from lightning_app.cli.lightning_cli import run_app from lightning_app.core.constants import LIGHTNING_CLOUD_PROJECT_ID from lightning_app.runners.multiprocess import MultiProcessRuntime from lightning_app.testing.config import Config +from lightning_app.utilities.app_logs import _app_logs_reader from lightning_app.utilities.cloud import _get_project from lightning_app.utilities.enum import CacheCallsKeys from lightning_app.utilities.imports import _is_playwright_available, requires @@ -32,6 +36,9 @@ from playwright.sync_api import HttpCredentials, sync_playwright +_logger = logging.getLogger(__name__) + + class LightningTestApp(LightningApp): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -282,20 +289,6 @@ def run_app_in_cloud(app_folder: str, app_name: str = "app.py", extra_args: [str var scrollingElement = (document.scrollingElement || document.body); scrollingElement.scrollTop = 
scrollingElement.scrollHeight; }, 200); - - if (!window._logs) { - window._logs = []; - } - - if (window.logTerminals) { - Object.entries(window.logTerminals).forEach( - ([key, value]) => { - window.logTerminals[key]._onLightningWritelnHandler = function (data) { - window._logs = window._logs.concat([data]); - } - } - ); - } """ ) @@ -309,8 +302,46 @@ def run_app_in_cloud(app_folder: str, app_name: str = "app.py", extra_args: [str except (playwright._impl._api_types.Error, playwright._impl._api_types.TimeoutError): pass - def fetch_logs() -> str: - return admin_page.evaluate("window._logs;") + client = LightningClient() + project = _get_project(client) + identifiers = [] + rich_colors = list(ANSI_COLOR_NAMES) + + def fetch_logs(component_names: Optional[List[str]] = None) -> Generator: + """This method creates websocket connections in threads and returns the logs to the main thread.""" + app_id = admin_page.url.split("/")[-1] + + if not component_names: + works = client.lightningwork_service_list_lightningwork( + project_id=project.project_id, + app_id=app_id, + ).lightningworks + component_names = ["flow"] + [w.name for w in works] + + def on_error_callback(ws_app, *_): + print(traceback.print_exc()) + ws_app.close() + + colors = {c: rich_colors[i + 1] for i, c in enumerate(component_names)} + gen = _app_logs_reader( + client=client, + project_id=project.project_id, + app_id=app_id, + component_names=component_names, + follow=False, + on_error_callback=on_error_callback, + ) + max_length = max(len(c.replace("root.", "")) for c in component_names) + for log_event in gen: + message = log_event.message + identifier = f"{log_event.timestamp}{log_event.message}" + if identifier not in identifiers: + date = log_event.timestamp.strftime("%m/%d/%Y %H:%M:%S") + identifiers.append(identifier) + color = colors[log_event.component_name] + padding = (max_length - len(log_event.component_name)) * " " + print(f"[{color}]{log_event.component_name}{padding}[/{color}] {date} 
{message}") + yield message # 5. Print your application ID print( @@ -323,11 +354,6 @@ def fetch_logs() -> str: pass finally: print("##################################################") - printed_logs = [] - for log in fetch_logs(): - if log not in printed_logs: - printed_logs.append(log) - print(log.split("[0m")[-1]) button = admin_page.locator('[data-cy="stop"]') try: button.wait_for(timeout=3 * 1000) @@ -337,8 +363,6 @@ def fetch_logs() -> str: context.close() browser.close() - client = LightningClient() - project = _get_project(client) list_lightningapps = client.lightningapp_instance_service_list_lightningapp_instances(project.project_id) for lightningapp in list_lightningapps.lightningapps: diff --git a/src/lightning_app/utilities/app_logs.py b/src/lightning_app/utilities/app_logs.py index 4a7af9b5c5143..536fbaae05093 100644 --- a/src/lightning_app/utilities/app_logs.py +++ b/src/lightning_app/utilities/app_logs.py @@ -5,7 +5,7 @@ from datetime import datetime, timedelta from json import JSONDecodeError from threading import Thread -from typing import Iterator, List, Optional, Tuple +from typing import Callable, Iterator, List, Optional import dateutil.parser from websocket import WebSocketApp @@ -30,10 +30,17 @@ class _LogEventLabels: class _LogEvent: message: str timestamp: datetime + component_name: str labels: _LogEventLabels + def __ge__(self, other: "_LogEvent") -> bool: + return self.timestamp >= other.timestamp -def _push_logevents_to_read_queue_callback(component_name: str, read_queue: queue.PriorityQueue): + def __gt__(self, other: "_LogEvent") -> bool: + return self.timestamp > other.timestamp + + +def _push_log_events_to_read_queue_callback(component_name: str, read_queue: queue.PriorityQueue): """Pushes _LogEvents from websocket to read_queue. Returns callback function used with `on_message_callback` of websocket.WebSocketApp. 
@@ -43,13 +50,17 @@ def callback(ws_app: WebSocketApp, msg: str): # We strongly trust that the contract on API will hold atm :D event_dict = json.loads(msg) labels = _LogEventLabels(**event_dict["labels"]) + if "message" in event_dict: + message = event_dict["message"] + timestamp = dateutil.parser.isoparse(event_dict["timestamp"]) event = _LogEvent( - message=event_dict["message"], - timestamp=dateutil.parser.isoparse(event_dict["timestamp"]), + message=message, + timestamp=timestamp, + component_name=component_name, labels=labels, ) - read_queue.put((event.timestamp, component_name, event)) + read_queue.put(event) return callback @@ -66,8 +77,13 @@ def _error_callback(ws_app: WebSocketApp, error: Exception): def _app_logs_reader( - client: LightningClient, project_id: str, app_id: str, component_names: List[str], follow: bool -) -> Iterator[Tuple[str, _LogEvent]]: + client: LightningClient, + project_id: str, + app_id: str, + component_names: List[str], + follow: bool, + on_error_callback: Optional[Callable] = None, +) -> Iterator[_LogEvent]: read_queue = queue.PriorityQueue() logs_api_client = _LightningLogsSocketAPI(client.api_client) @@ -78,8 +94,8 @@ def _app_logs_reader( project_id=project_id, app_id=app_id, component=component_name, - on_message_callback=_push_logevents_to_read_queue_callback(component_name, read_queue), - on_error_callback=_error_callback, + on_message_callback=_push_log_events_to_read_queue_callback(component_name, read_queue), + on_error_callback=on_error_callback or _error_callback, ) for component_name in component_names ] @@ -92,20 +108,19 @@ def _app_logs_reader( for th in log_threads: th.start() + # Print logs from queue when log event is available user_log_start = "<<< BEGIN USER_RUN_FLOW SECTION >>>" start_timestamp = None # Print logs from queue when log event is available try: while True: - _, component_name, log_event = read_queue.get(timeout=None if follow else 1.0) - log_event: _LogEvent - + log_event = 
read_queue.get(timeout=None if follow else 1.0) if user_log_start in log_event.message: start_timestamp = log_event.timestamp + timedelta(seconds=0.5) if start_timestamp and log_event.timestamp > start_timestamp: - yield component_name, log_event + yield log_event except queue.Empty: # Empty is raised by queue.get if timeout is reached. Follow = False case. diff --git a/tests/tests_app/utilities/test_app_logs.py b/tests/tests_app/utilities/test_app_logs.py new file mode 100644 index 0000000000000..e7384dd72d6e2 --- /dev/null +++ b/tests/tests_app/utilities/test_app_logs.py @@ -0,0 +1,11 @@ +from datetime import datetime +from unittest.mock import MagicMock + +from lightning_app.utilities.app_logs import _LogEvent + + +def test_log_event(): + event_1 = _LogEvent("", datetime.now(), MagicMock(), MagicMock()) + event_2 = _LogEvent("", datetime.now(), MagicMock(), MagicMock()) + assert event_1 < event_2 + assert event_1 <= event_2 diff --git a/tests/tests_app_examples/test_commands.py b/tests/tests_app_examples/test_commands.py index 266f0305c7604..236e587e23101 100644 --- a/tests/tests_app_examples/test_commands.py +++ b/tests/tests_app_examples/test_commands.py @@ -26,7 +26,7 @@ def test_commands_example_cloud() -> None: has_logs = False while not has_logs: - for log in fetch_logs(): + for log in fetch_logs(["flow"]): if "['something', 'else']" in log: has_logs = True sleep(1) diff --git a/tests/tests_app_examples/test_custom_work_dependencies.py b/tests/tests_app_examples/test_custom_work_dependencies.py index d7c9db5ef610a..b8971e0ef2148 100644 --- a/tests/tests_app_examples/test_custom_work_dependencies.py +++ b/tests/tests_app_examples/test_custom_work_dependencies.py @@ -16,7 +16,7 @@ def test_custom_work_dependencies_example_cloud() -> None: ) as (_, _, fetch_logs, _): has_logs = False while not has_logs: - for log in fetch_logs(): + for log in fetch_logs(["flow"]): if "Custom Work Dependency checker End" in log: has_logs = True sleep(1) diff --git 
a/tests/tests_app_examples/test_drive.py b/tests/tests_app_examples/test_drive.py index 14efc3458716e..630e76b550e9e 100644 --- a/tests/tests_app_examples/test_drive.py +++ b/tests/tests_app_examples/test_drive.py @@ -11,14 +11,14 @@ def test_drive_example_cloud() -> None: with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "examples/app_drive")) as ( _, - view_page, + _, fetch_logs, _, ): has_logs = False while not has_logs: - for log in fetch_logs(): + for log in fetch_logs(["flow"]): if "Application End!" in log: has_logs = True sleep(1) diff --git a/tests/tests_app_examples/test_idle_timeout.py b/tests/tests_app_examples/test_idle_timeout.py index a39ae3f693f7a..f06181ce86ed3 100644 --- a/tests/tests_app_examples/test_idle_timeout.py +++ b/tests/tests_app_examples/test_idle_timeout.py @@ -17,7 +17,7 @@ def test_idle_timeout_example_cloud() -> None: ): has_logs = False while not has_logs: - for log in fetch_logs(): + for log in fetch_logs(["flow"]): if "Application End" in log: has_logs = True sleep(1) diff --git a/tests/tests_app_examples/test_payload.py b/tests/tests_app_examples/test_payload.py index 58fc28a4a8d3c..b40b8ca52defd 100644 --- a/tests/tests_app_examples/test_payload.py +++ b/tests/tests_app_examples/test_payload.py @@ -13,7 +13,7 @@ def test_payload_example_cloud() -> None: has_logs = False while not has_logs: - for log in fetch_logs(): + for log in fetch_logs(["flow"]): if "Application End!" 
in log: has_logs = True sleep(1) diff --git a/tests/tests_app_examples/test_v0_app.py b/tests/tests_app_examples/test_v0_app.py index acc9e285c4d79..026c45a4e1ba1 100644 --- a/tests/tests_app_examples/test_v0_app.py +++ b/tests/tests_app_examples/test_v0_app.py @@ -45,7 +45,7 @@ def check_content(button_name, text_content): wait_for(view_page, check_content, "TAB_2", "Hello from component B") has_logs = False while not has_logs: - for log in fetch_logs(): + for log in fetch_logs(["flow"]): if "'a': 'a', 'b': 'b'" in log: has_logs = True sleep(1) From b8b8f033fd55db6c03e28ced1ddc2b49f6c8b770 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Wed, 10 Aug 2022 14:56:41 +0200 Subject: [PATCH 32/59] (app) Run the flow only if the state has updated 1/2 (#14076) --- src/lightning_app/CHANGELOG.md | 2 + src/lightning_app/core/app.py | 31 +++++++++++----- src/lightning_app/utilities/app_helpers.py | 7 ++-- src/lightning_app/utilities/commands/base.py | 1 + src/lightning_app/utilities/scheduler.py | 2 +- tests/tests_app/core/test_lightning_app.py | 39 +++++++++++++++++++- tests/tests_app/core/test_lightning_flow.py | 21 +++++------ tests/tests_app/utilities/test_commands.py | 4 +- 8 files changed, 78 insertions(+), 29 deletions(-) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index ba8cdd796c5bb..f32d07697f376 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -24,6 +24,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added `LIGHTNING_` prefix to Platform AWS credentials ([#13703](https://github.com/Lightning-AI/lightning/pull/13703)) +- Run the flow only if the state has changed from the previous execution ([#14076](https://github.com/Lightning-AI/lightning/pull/14076)) + ### Deprecated ### Fixed diff --git a/src/lightning_app/core/app.py b/src/lightning_app/core/app.py index 584f94285c219..3f9e2521eb21d 100644 --- a/src/lightning_app/core/app.py +++ b/src/lightning_app/core/app.py @@ -15,7 +15,7 @@ from lightning_app.core.queues import BaseQueue, SingleProcessQueue from lightning_app.frontend import Frontend from lightning_app.storage.path import storage_root_dir -from lightning_app.utilities.app_helpers import _delta_to_appstate_delta, _LightningAppRef +from lightning_app.utilities.app_helpers import _delta_to_app_state_delta, _LightningAppRef from lightning_app.utilities.commands.base import _populate_commands_endpoint, _process_command_requests from lightning_app.utilities.component import _convert_paths_after_init from lightning_app.utilities.enum import AppStage, CacheCallsKeys @@ -94,7 +94,7 @@ def __init__( self.processes: t.Dict[str, WorkManager] = {} self.frontends: t.Dict[str, Frontend] = {} self.stage = AppStage.RUNNING - self._has_updated: bool = False + self._has_updated: bool = True self._schedules: t.Dict[str, t.Dict] = {} self.threads: t.List[threading.Thread] = [] @@ -278,7 +278,7 @@ def _collect_deltas_from_ui_and_work_queues(self) -> t.List[Delta]: if component_output: logger.debug(f"Received from {component_output.id} : {component_output.delta.to_dict()}") work = self.get_component_by_name(component_output.id) - new_work_delta = _delta_to_appstate_delta(self.root, work, deepcopy(component_output.delta)) + new_work_delta = _delta_to_app_state_delta(self.root, work, deepcopy(component_output.delta)) deltas.append(new_work_delta) else: should_get_component_output = False @@ -307,9 +307,11 @@ def maybe_apply_changes(self) -> bool: if not deltas: # When no 
deltas are received from the Rest API or work queues, # we need to check if the flow modified the state and populate changes. - if Delta(DeepDiff(self.last_state, self.state, verbose_level=2)).to_dict(): + deep_diff = DeepDiff(self.last_state, self.state, verbose_level=2) + if deep_diff: + # TODO: Resolve changes with ``CacheMissException``. # new_state = self.populate_changes(self.last_state, self.state) - self.set_state(self.state) + self.set_last_state(self.state) self._has_updated = True return False @@ -329,7 +331,6 @@ def maybe_apply_changes(self) -> bool: def run_once(self): """Method used to collect changes and run the root Flow once.""" done = False - self._has_updated = False self._last_run_time = 0.0 if self.backend is not None: @@ -352,17 +353,23 @@ def run_once(self): _process_command_requests(self) + t0 = time() + try: self.check_error_queue() - t0 = time() - self.root.run() - self._last_run_time = time() - t0 + # Execute the flow only if: + # - There are state changes + # - It is the first execution of the flow + if self._has_updated: + self.root.run() except CacheMissException: self._on_cache_miss_exception() except (ExitAppException, KeyboardInterrupt): done = True self.stage = AppStage.STOPPING + self._last_run_time = time() - t0 + self.on_run_once_end() return done @@ -414,6 +421,8 @@ def _run(self) -> bool: if self._has_updated and self.should_publish_changes_to_api and self.api_publish_state_queue: self.api_publish_state_queue.put(self.state_vars) + self._has_updated = False + return True def _update_layout(self) -> None: @@ -430,8 +439,10 @@ def _apply_restarting(self) -> bool: self.stage = AppStage.BLOCKING return False - def _has_work_finished(self, work): + def _has_work_finished(self, work) -> bool: latest_call_hash = work._calls[CacheCallsKeys.LATEST_CALL_HASH] + if latest_call_hash is None: + return False return "ret" in work._calls[latest_call_hash] def _collect_work_finish_status(self) -> dict: diff --git 
a/src/lightning_app/utilities/app_helpers.py b/src/lightning_app/utilities/app_helpers.py index 4144c6de3ba12..faa612bba1998 100644 --- a/src/lightning_app/utilities/app_helpers.py +++ b/src/lightning_app/utilities/app_helpers.py @@ -299,7 +299,7 @@ def _set_child_name(component: "Component", child: "Component", new_name: str) - return child_name -def _delta_to_appstate_delta(root: "LightningFlow", component: "Component", delta: Delta) -> Delta: +def _delta_to_app_state_delta(root: "LightningFlow", component: "Component", delta: Delta) -> Delta: delta_dict = delta.to_dict() for changed in delta_dict.values(): for delta_key in changed.copy().keys(): @@ -322,8 +322,9 @@ def _delta_to_appstate_delta(root: "LightningFlow", component: "Component", delt delta_key_without_root = delta_key[4:] # the first 4 chars are the word 'root', strip it new_key = new_prefix + delta_key_without_root - changed[new_key] = val - del changed[delta_key] + if new_key != delta_key: + changed[new_key] = val + del changed[delta_key] return Delta(delta_dict) diff --git a/src/lightning_app/utilities/commands/base.py b/src/lightning_app/utilities/commands/base.py index 11661e51ca26a..b87b41b05df42 100644 --- a/src/lightning_app/utilities/commands/base.py +++ b/src/lightning_app/utilities/commands/base.py @@ -243,3 +243,4 @@ def _process_command_requests(app): # Validation is done on the CLI side. 
response = method(**command_query["command_arguments"]) app.commands_responses_queue.put({"response": response, "id": command_query["id"]}) + app._has_updated = True diff --git a/src/lightning_app/utilities/scheduler.py b/src/lightning_app/utilities/scheduler.py index 012930f017f20..e45b0879246b9 100644 --- a/src/lightning_app/utilities/scheduler.py +++ b/src/lightning_app/utilities/scheduler.py @@ -15,7 +15,7 @@ class SchedulerThread(threading.Thread): def __init__(self, app) -> None: super().__init__(daemon=True) self._exit_event = threading.Event() - self._sleep_time = 0.5 + self._sleep_time = 1.0 self._app = app def run(self) -> None: diff --git a/tests/tests_app/core/test_lightning_app.py b/tests/tests_app/core/test_lightning_app.py index e6c715f87ef03..3776481965be3 100644 --- a/tests/tests_app/core/test_lightning_app.py +++ b/tests/tests_app/core/test_lightning_app.py @@ -1,3 +1,4 @@ +import logging import os import pickle from time import sleep @@ -27,6 +28,8 @@ from lightning_app.utilities.redis import check_if_redis_running from lightning_app.utilities.warnings import LightningFlowWarning +logger = logging.getLogger() + class B1(LightningFlow): def __init__(self): @@ -439,19 +442,25 @@ def __init__(self): self.counter = 0 def run(self): - self.counter = 1 + if self.counter < 2: + self.counter += 1 def test_maybe_apply_changes_from_flow(): """This test validates the app `_updated` is set to True only if the state was changed in the flow.""" app = LightningApp(SimpleFlow()) - assert not app._has_updated + assert app._has_updated app.maybe_apply_changes() app.root.run() app.maybe_apply_changes() assert app._has_updated app._has_updated = False + app.root.run() + app.maybe_apply_changes() + assert app._has_updated + app._has_updated = False + app.root.run() app.maybe_apply_changes() assert not app._has_updated @@ -920,3 +929,29 @@ def test_state_size_constant_growth(): MultiProcessRuntime(app, start_server=False).dispatch() assert app.root._state_sizes[0] <= 
5904 assert app.root._state_sizes[20] <= 23736 + + +class FlowUpdated(LightningFlow): + def run(self): + logger.info("Hello World") + + +class NonUpdatedLightningTestApp(LightningTestApp): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.counter = 0 + + def on_after_run_once(self): + self.counter += 1 + if not self._has_updated and self.counter > 2: + return True + return super().on_after_run_once() + + +def test_non_updated_flow(caplog): + """This tests validate the app can run 3 times and call the flow only once.""" + with caplog.at_level(logging.INFO): + app = NonUpdatedLightningTestApp(FlowUpdated()) + MultiProcessRuntime(app, start_server=False).dispatch() + assert caplog.messages == ["Hello World"] + assert app.counter == 3 diff --git a/tests/tests_app/core/test_lightning_flow.py b/tests/tests_app/core/test_lightning_flow.py index e8ce1222a3186..4c0eb23ea014c 100644 --- a/tests/tests_app/core/test_lightning_flow.py +++ b/tests/tests_app/core/test_lightning_flow.py @@ -16,7 +16,7 @@ from lightning_app.storage import Path from lightning_app.storage.path import storage_root_dir from lightning_app.testing.helpers import EmptyFlow, EmptyWork -from lightning_app.utilities.app_helpers import _delta_to_appstate_delta, _LightningAppRef +from lightning_app.utilities.app_helpers import _delta_to_app_state_delta, _LightningAppRef from lightning_app.utilities.enum import CacheCallsKeys from lightning_app.utilities.exceptions import ExitAppException @@ -416,7 +416,7 @@ def run(self): flow_a.work.counter = 1 work_state_2 = flow_a.work.state delta = Delta(DeepDiff(work_state, work_state_2, verbose_level=2)) - delta = _delta_to_appstate_delta(flow_a, flow_a.work, delta) + delta = _delta_to_app_state_delta(flow_a, flow_a.work, delta) new_flow_state = LightningApp.populate_changes(flow_state, flow_state + delta) flow_a.set_state(new_flow_state) assert flow_a.work.counter == 1 @@ -592,24 +592,23 @@ def run(self): class 
FlowSchedule(LightningFlow): def __init__(self): super().__init__() - self._last_time = None + self._last_times = [] + self.target = 3 + self.seconds = ",".join([str(v) for v in range(0, 60, self.target)]) def run(self): - if self.schedule("* * * * * 0,5,10,15,20,25,30,35,40,45,50,55"): - if self._last_time is None: - self._last_time = False - elif not self._last_time: - self._last_time = time() + if self.schedule(f"* * * * * {self.seconds}"): + if len(self._last_times) < 3: + self._last_times.append(time()) else: - # TODO (tchaton) Optimize flow execution. - assert 4.0 < abs(time() - self._last_time) < 6.0 + assert abs((time() - self._last_times[-1]) - self.target) < 3 self._exit() def test_scheduling_api(): app = LightningApp(FlowSchedule()) - MultiProcessRuntime(app).dispatch() + MultiProcessRuntime(app, start_server=True).dispatch() def test_lightning_flow(): diff --git a/tests/tests_app/utilities/test_commands.py b/tests/tests_app/utilities/test_commands.py index 1e8e36ed09545..ed7f386395282 100644 --- a/tests/tests_app/utilities/test_commands.py +++ b/tests/tests_app/utilities/test_commands.py @@ -44,7 +44,7 @@ def __init__(self): def run(self): if self.has_sweep and len(self.names) == 1: - sleep(2) + sleep(1) self._exit() def trigger_method(self, name: str): @@ -156,7 +156,7 @@ def test_configure_commands(monkeypatch): monkeypatch.setattr(sys, "argv", ["lightning", "sweep", "--sweep_name", "my_name", "--num_trials", "1"]) app_command() time_left = 15 - while time_left > 0 or process.exitcode is None: + while time_left > 0 and process.exitcode != 0: sleep(0.1) time_left -= 0.1 assert process.exitcode == 0 From cda381a626719d965d85f9034993cae1f4227f29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 10 Aug 2022 15:03:53 +0200 Subject: [PATCH 33/59] Update changelog after 1.7.1 release (#14127) --- src/pytorch_lightning/CHANGELOG.md | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git 
a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index b405665b9df88..baf98d81a7733 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -52,39 +52,29 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed -- Casted only floating point tensors to fp16 with IPUs ([#13983](https://github.com/Lightning-AI/lightning/pull/13983)) +- Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061)) -- Casted tensors to fp16 before moving them to device with `DeepSpeedStrategy` ([#14000](https://github.com/Lightning-AI/lightning/pull/14000)) +- Fixed resuming from a checkpoint when using Stochastic Weight Averaging (SWA) ([#9938](https://github.com/Lightning-AI/lightning/pull/9938)) -- Fixed the `NeptuneLogger` dependency being unrecognized ([#13988](https://github.com/Lightning-AI/lightning/pull/13988)) +- Fixed the device placement when `LightningModule.cuda()` gets called without specifying a device index and the current cuda device was not 0 ([#14128](https://github.com/Lightning-AI/lightning/pull/14128)) -- Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061)) +## [1.7.1] - 2022-08-09 +### Fixed +- Casted only floating point tensors to fp16 with IPUs ([#13983](https://github.com/Lightning-AI/lightning/pull/13983)) +- Casted tensors to fp16 before moving them to device with `DeepSpeedStrategy` ([#14000](https://github.com/Lightning-AI/lightning/pull/14000)) +- Fixed the `NeptuneLogger` dependency being unrecognized ([#13988](https://github.com/Lightning-AI/lightning/pull/13988)) - Fixed an issue where users would be warned about unset `max_epochs` even when `fast_dev_run` was set ([#13262](https://github.com/Lightning-AI/lightning/pull/13262)) - - - Fixed MPS device being unrecognized 
([#13992](https://github.com/Lightning-AI/lightning/pull/13992)) - - - Fixed incorrect `precision="mixed"` being used with `DeepSpeedStrategy` and `IPUStrategy` ([#14041](https://github.com/Lightning-AI/lightning/pull/14041)) - - -- Fixed resuming from a checkpoint when using Stochastic Weight Averaging (SWA) ([#9938](https://github.com/Lightning-AI/lightning/pull/9938)) - - - Fixed dtype inference during gradient norm computation ([#14051](https://github.com/Lightning-AI/lightning/pull/14051)) - - - Fixed a bug that caused `ddp_find_unused_parameters` to be set `False`, whereas the intended default is `True` ([#14095](https://github.com/Lightning-AI/lightning/pull/14095)) -- Fixed the device placement when `LightningModule.cuda()` gets called without specifying a device index and the current cuda device was not 0 ([#14128](https://github.com/Lightning-AI/lightning/pull/14128)) - - ## [1.7.0] - 2022-08-02 ### Added From 58014846ee0fb54b92e4bfb4c0965b72bc0a9641 Mon Sep 17 00:00:00 2001 From: Krishna Kalyan Date: Wed, 10 Aug 2022 14:32:12 +0100 Subject: [PATCH 34/59] Update Grid links to Lightning AI (#14081) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * initial changes for lightning * Update .github/BECOMING_A_CORE_CONTRIBUTOR.md Co-authored-by: Adrian Wälchli Co-authored-by: Adrian Wälchli --- .github/BECOMING_A_CORE_CONTRIBUTOR.md | 2 +- SECURITY.md | 2 +- src/pytorch_lightning/README.md | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/BECOMING_A_CORE_CONTRIBUTOR.md b/.github/BECOMING_A_CORE_CONTRIBUTOR.md index a179161f687a1..fd40e29e1ebf1 100644 --- a/.github/BECOMING_A_CORE_CONTRIBUTOR.md +++ b/.github/BECOMING_A_CORE_CONTRIBUTOR.md @@ -62,4 +62,4 @@ We are on the lookout for new people to join, however, if you feel like you meet ## Employment -You can also become a [Grid.ai](https://www.grid.ai) employee or intern and work on Lightning. 
To get started, you can email `careers@grid.ai` with your resume or check out our [open job postings](https://boards.greenhouse.io/gridai). +You can also become a [Lightning AI](https://lightning.ai/) employee or intern and work on Lightning. To get started, you can email `careers@lightning.ai` with your resume or check out our [open job postings](https://boards.greenhouse.io/lightningai). diff --git a/SECURITY.md b/SECURITY.md index 8f265f26be452..862563f84e2fe 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,2 +1,2 @@ -developer@grid.ai +developer@lightning.ai developer@pytorchlightning.ai diff --git a/src/pytorch_lightning/README.md b/src/pytorch_lightning/README.md index b57aea6fae147..914596c0a9d2f 100644 --- a/src/pytorch_lightning/README.md +++ b/src/pytorch_lightning/README.md @@ -14,8 +14,8 @@ ______________________________________________________________________ DocsExamplesCommunity • - Grid AI • - License + Lightning AI • + License

From 4e87a44002a91c869f43c0929d29fa8600f14f15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 10 Aug 2022 17:15:35 +0200 Subject: [PATCH 35/59] Avoid entry_points deprecation warning (#14052) Co-authored-by: Adam J. Stewart Co-authored-by: Akihiro Nitta --- src/pytorch_lightning/CHANGELOG.md | 6 ++++++ .../trainer/connectors/callback_connector.py | 11 ++++++++--- src/pytorch_lightning/utilities/imports.py | 1 + 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index baf98d81a7733..90285b55c8037 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -61,6 +61,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed the device placement when `LightningModule.cuda()` gets called without specifying a device index and the current cuda device was not 0 ([#14128](https://github.com/Lightning-AI/lightning/pull/14128)) +- Avoid `metadata.entry_points` deprecation warning on Python 3.10 ([#14052](https://github.com/Lightning-AI/lightning/pull/14052)) + + +- Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061)) + + ## [1.7.1] - 2022-08-09 ### Fixed diff --git a/src/pytorch_lightning/trainer/connectors/callback_connector.py b/src/pytorch_lightning/trainer/connectors/callback_connector.py index bb7f912420256..32d67d44ad44c 100644 --- a/src/pytorch_lightning/trainer/connectors/callback_connector.py +++ b/src/pytorch_lightning/trainer/connectors/callback_connector.py @@ -31,7 +31,7 @@ from pytorch_lightning.callbacks.rich_model_summary import RichModelSummary from pytorch_lightning.callbacks.timer import Timer from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _PYTHON_GREATER_EQUAL_3_8_0 +from pytorch_lightning.utilities.imports import 
_PYTHON_GREATER_EQUAL_3_8_0, _PYTHON_GREATER_EQUAL_3_10_0 from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_info _log = logging.getLogger(__name__) @@ -260,14 +260,19 @@ def _configure_external_callbacks() -> List[Callback]: Return: A list of all callbacks collected from external factories. """ + group = "pytorch_lightning.callbacks_factory" + if _PYTHON_GREATER_EQUAL_3_8_0: from importlib.metadata import entry_points - factories = entry_points().get("pytorch_lightning.callbacks_factory", ()) + if _PYTHON_GREATER_EQUAL_3_10_0: + factories = entry_points(group=group) # type: ignore[call-arg] + else: + factories = entry_points().get(group, {}) # type: ignore[assignment] else: from pkg_resources import iter_entry_points - factories = iter_entry_points("pytorch_lightning.callbacks_factory") # type: ignore[assignment] + factories = iter_entry_points(group) # type: ignore[assignment] external_callbacks: List[Callback] = [] for factory in factories: diff --git a/src/pytorch_lightning/utilities/imports.py b/src/pytorch_lightning/utilities/imports.py index 67bf75be3c4d3..ba437ad332dfa 100644 --- a/src/pytorch_lightning/utilities/imports.py +++ b/src/pytorch_lightning/utilities/imports.py @@ -124,6 +124,7 @@ def __repr__(self) -> str: _IS_WINDOWS = platform.system() == "Windows" _IS_INTERACTIVE = hasattr(sys, "ps1") # https://stackoverflow.com/a/64523765 _PYTHON_GREATER_EQUAL_3_8_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 8) +_PYTHON_GREATER_EQUAL_3_10_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 10) _TORCH_GREATER_EQUAL_1_9_1 = _compare_version("torch", operator.ge, "1.9.1") _TORCH_GREATER_EQUAL_1_10 = _compare_version("torch", operator.ge, "1.10.0") _TORCH_LESSER_EQUAL_1_10_2 = _compare_version("torch", operator.le, "1.10.2") From 9b61b1c482cb8be569e664647a577730e55680c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 10 Aug 2022 17:21:05 +0200 Subject: [PATCH 36/59] Remove 
duplicated test classes (#14122) Remove duplicated classes --- .../progress/test_rich_progress_bar.py | 3 +- .../callbacks/test_stochastic_weight_avg.py | 3 +- tests/tests_pytorch/helpers/datasets.py | 39 +------------------ .../strategies/test_deepspeed_strategy.py | 3 +- .../trainer/flags/test_val_check_interval.py | 3 +- .../logging_/test_train_loop_logging.py | 3 +- .../test_estimated_stepping_batches.py | 3 +- .../tests_pytorch/trainer/test_dataloaders.py | 8 +++- tests/tests_pytorch/trainer/test_trainer.py | 8 +++- tests/tests_pytorch/utilities/test_data.py | 3 +- 10 files changed, 20 insertions(+), 56 deletions(-) diff --git a/tests/tests_pytorch/callbacks/progress/test_rich_progress_bar.py b/tests/tests_pytorch/callbacks/progress/test_rich_progress_bar.py index e9374f8ea4be1..f1ccf2a2726a2 100644 --- a/tests/tests_pytorch/callbacks/progress/test_rich_progress_bar.py +++ b/tests/tests_pytorch/callbacks/progress/test_rich_progress_bar.py @@ -21,8 +21,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ProgressBarBase, RichProgressBar from pytorch_lightning.callbacks.progress.rich_progress import RichProgressBarTheme -from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset -from tests_pytorch.helpers.datasets import RandomIterableDataset +from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset, RandomIterableDataset from tests_pytorch.helpers.runif import RunIf diff --git a/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py b/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py index 65a0fea2fb4a5..7f1692e30a3f2 100644 --- a/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py +++ b/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py @@ -26,10 +26,9 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import StochasticWeightAveraging -from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset +from 
pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset, RandomIterableDataset from pytorch_lightning.strategies import DDPSpawnStrategy, Strategy from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests_pytorch.helpers.datasets import RandomIterableDataset from tests_pytorch.helpers.runif import RunIf diff --git a/tests/tests_pytorch/helpers/datasets.py b/tests/tests_pytorch/helpers/datasets.py index 3443020d4528f..c9d185313e85e 100644 --- a/tests/tests_pytorch/helpers/datasets.py +++ b/tests/tests_pytorch/helpers/datasets.py @@ -19,7 +19,7 @@ from typing import Optional, Sequence, Tuple import torch -from torch.utils.data import Dataset, IterableDataset +from torch.utils.data import Dataset class MNIST(Dataset): @@ -212,40 +212,3 @@ def __getitem__(self, idx): def __len__(self): return len(self.y) - - -class RandomDictDataset(Dataset): - def __init__(self, size: int, length: int): - self.len = length - self.data = torch.randn(length, size) - - def __getitem__(self, index): - a = self.data[index] - b = a + 2 - return {"a": a, "b": b} - - def __len__(self): - return self.len - - -class RandomIterableDataset(IterableDataset): - def __init__(self, size: int, count: int): - self.count = count - self.size = size - - def __iter__(self): - for _ in range(self.count): - yield torch.randn(self.size) - - -class RandomIterableDatasetWithLen(IterableDataset): - def __init__(self, size: int, count: int): - self.count = count - self.size = size - - def __iter__(self): - for _ in range(len(self)): - yield torch.randn(self.size) - - def __len__(self): - return self.count diff --git a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py index 272b03a846688..e3c6f95f3ff47 100644 --- a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py +++ b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py @@ -28,13 +28,12 @@ from pytorch_lightning import 
LightningDataModule, LightningModule, Trainer from pytorch_lightning.callbacks import Callback, LearningRateMonitor, ModelCheckpoint -from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset +from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset, RandomIterableDataset from pytorch_lightning.plugins import DeepSpeedPrecisionPlugin from pytorch_lightning.strategies import DeepSpeedStrategy from pytorch_lightning.strategies.deepspeed import _DEEPSPEED_AVAILABLE, LightningDeepSpeedModule from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests_pytorch.helpers.datamodules import ClassifDataModule -from tests_pytorch.helpers.datasets import RandomIterableDataset from tests_pytorch.helpers.runif import RunIf if _DEEPSPEED_AVAILABLE: diff --git a/tests/tests_pytorch/trainer/flags/test_val_check_interval.py b/tests/tests_pytorch/trainer/flags/test_val_check_interval.py index 9414fd1c5096f..e5fd9b5dd2706 100644 --- a/tests/tests_pytorch/trainer/flags/test_val_check_interval.py +++ b/tests/tests_pytorch/trainer/flags/test_val_check_interval.py @@ -16,10 +16,9 @@ import pytest from torch.utils.data import DataLoader -from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset +from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset, RandomIterableDataset from pytorch_lightning.trainer.trainer import Trainer from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests_pytorch.helpers.datasets import RandomIterableDataset @pytest.mark.parametrize("max_epochs", [1, 2, 3]) diff --git a/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py b/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py index d16be306b9365..85ed3d8e3471d 100644 --- a/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py +++ b/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py @@ -28,9 +28,8 @@ from pytorch_lightning import 
callbacks, Trainer from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, TQDMProgressBar from pytorch_lightning.core.module import LightningModule -from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset +from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset, RandomDictDataset from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests_pytorch.helpers.datasets import RandomDictDataset from tests_pytorch.helpers.runif import RunIf diff --git a/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py b/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py index 92a1126294dfc..846a39a748a60 100644 --- a/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py +++ b/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py @@ -22,11 +22,10 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks.gradient_accumulation_scheduler import GradientAccumulationScheduler -from pytorch_lightning.demos.boring_classes import BoringModel +from pytorch_lightning.demos.boring_classes import BoringModel, RandomIterableDataset from pytorch_lightning.strategies.ipu import IPUStrategy from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests_pytorch.helpers.datasets import RandomIterableDataset from tests_pytorch.helpers.runif import RunIf diff --git a/tests/tests_pytorch/trainer/test_dataloaders.py b/tests/tests_pytorch/trainer/test_dataloaders.py index 5bea5a4cbbe1c..34504392dc0c1 100644 --- a/tests/tests_pytorch/trainer/test_dataloaders.py +++ b/tests/tests_pytorch/trainer/test_dataloaders.py @@ -25,12 +25,16 @@ from pytorch_lightning import Callback, seed_everything, Trainer from pytorch_lightning.callbacks import ModelCheckpoint -from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset +from 
pytorch_lightning.demos.boring_classes import ( + BoringModel, + RandomDataset, + RandomIterableDataset, + RandomIterableDatasetWithLen, +) from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities.data import _auto_add_worker_init_fn, has_iterable_dataset, has_len_all_ranks from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests_pytorch.helpers.dataloaders import CustomInfDataloader, CustomNotImplementedErrorDataloader -from tests_pytorch.helpers.datasets import RandomIterableDataset, RandomIterableDatasetWithLen from tests_pytorch.helpers.runif import RunIf diff --git a/tests/tests_pytorch/trainer/test_trainer.py b/tests/tests_pytorch/trainer/test_trainer.py index e4be8929f9c7e..9506acee425d0 100644 --- a/tests/tests_pytorch/trainer/test_trainer.py +++ b/tests/tests_pytorch/trainer/test_trainer.py @@ -41,7 +41,12 @@ from pytorch_lightning.callbacks.fault_tolerance import _FaultToleranceCheckpoint from pytorch_lightning.callbacks.prediction_writer import BasePredictionWriter from pytorch_lightning.core.saving import load_hparams_from_tags_csv, load_hparams_from_yaml, save_hparams_to_tags_csv -from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset +from pytorch_lightning.demos.boring_classes import ( + BoringModel, + RandomDataset, + RandomIterableDataset, + RandomIterableDatasetWithLen, +) from pytorch_lightning.loggers import TensorBoardLogger from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper, UnrepeatedDistributedSampler from pytorch_lightning.strategies import ( @@ -60,7 +65,6 @@ from pytorch_lightning.utilities.imports import _OMEGACONF_AVAILABLE, _TORCH_GREATER_EQUAL_1_12 from pytorch_lightning.utilities.seed import seed_everything from tests_pytorch.helpers.datamodules import ClassifDataModule -from tests_pytorch.helpers.datasets import RandomIterableDataset, RandomIterableDatasetWithLen from tests_pytorch.helpers.runif import RunIf from 
tests_pytorch.helpers.simple_models import ClassificationModel diff --git a/tests/tests_pytorch/utilities/test_data.py b/tests/tests_pytorch/utilities/test_data.py index ffb898efaa815..3700feaba9992 100644 --- a/tests/tests_pytorch/utilities/test_data.py +++ b/tests/tests_pytorch/utilities/test_data.py @@ -6,7 +6,7 @@ from torch.utils.data import BatchSampler, DataLoader, RandomSampler, SequentialSampler from pytorch_lightning import Trainer -from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset +from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset, RandomIterableDataset from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities.data import ( @@ -23,7 +23,6 @@ warning_cache, ) from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests_pytorch.helpers.datasets import RandomIterableDataset from tests_pytorch.helpers.utils import no_warning_call From 2abed91c5386ee9434b4e45e859e91d06bef3080 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 10 Aug 2022 17:25:44 +0200 Subject: [PATCH 37/59] Update CODEOWNERS (#14119) * Update CODEOWNERS * Cleanup and remove old sections * pl focus Co-authored-by: Jirka Borovec --- .github/CODEOWNERS | 40 ++++++++++++++---------------- src/pytorch_lightning/__about__.py | 1 - 2 files changed, 18 insertions(+), 23 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index f83924b9566ce..0b4692731bff9 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -5,7 +5,7 @@ # the repo. Unless a later match takes precedence, # @global-owner1 and @global-owner2 will be requested for # review when someone opens a pull request. 
-* @williamfalcon @borda @tchaton @carmocca @awaelchli @justusschock @kaushikb11 @rohitgr7 +* @williamfalcon @borda @tchaton @awaelchli @kaushikb11 @rohitgr7 # CI/CD and configs /.github/ @borda @carmocca @akihironitta @tchaton @@ -26,13 +26,14 @@ /docs/source-app/expertise_levels @williamfalcon @Felonious-Spellfire @RobertLaurella # Packages +/src/pytorch_lightning @carmocca @justusschock /src/pytorch_lightning/accelerators @williamfalcon @tchaton @SeanNaren @awaelchli @justusschock @kaushikb11 /src/pytorch_lightning/callbacks @williamfalcon @tchaton @carmocca @borda @kaushikb11 /src/pytorch_lightning/core @tchaton @borda @carmocca @justusschock @kaushikb11 /src/pytorch_lightning/distributed @williamfalcon @tchaton @awaelchli @kaushikb11 /src/pytorch_lightning/lite @tchaton @awaelchli @carmocca /src/pytorch_lightning/loggers @tchaton @awaelchli @borda -/src/pytorch_lightning/loggers/wandb.py @borisdayma +/src/pytorch_lightning/loggers/wandb.py @borisdayma @borda /src/pytorch_lightning/loggers/neptune.py @shnela @HubertJaworski @pkasprzyk @pitercl @Raalsky @aniezurawski @kamil-kaczmarek /src/pytorch_lightning/loops @tchaton @awaelchli @justusschock @carmocca /src/pytorch_lightning/overrides @tchaton @borda @@ -46,7 +47,7 @@ /src/pytorch_lightning/utilities @borda @tchaton @carmocca /src/lightning_app @tchaton @manskx -/src/lightning_app/cli/pl-app-template @awaelchli @tchaton @Borda +/src/lightning_app/cli/pl-app-template @tchaton @awaelchli @Borda /src/lightning_app/core @tchaton @awaelchli @manskx /src/lightning_app/core/queues.py @tchaton @hhsecond @manskx /src/lightning_app/runners/cloud.py @tchaton @hhsecond @@ -54,28 +55,23 @@ /src/lightning_app/__about__.py @nohalon @edenlightning @lantiga # Examples -/examples/app_* @tchaton @awaelchli @manskx @hhsecond +/examples/app_* @tchaton @awaelchli @manskx @hhsecond # App tests -/tests/tests_app @tchaton @awaelchli @manskx @hhsecond -/tests/tests_app_examples @tchaton @awaelchli @manskx @hhsecond +/tests/tests_app 
@tchaton @awaelchli @manskx @hhsecond +/tests/tests_app_examples @tchaton @awaelchli @manskx @hhsecond # Specifics -/src/pytorch_lightning/trainer/connectors/logger_connector @tchaton @carmocca -/src/pytorch_lightning/trainer/progress.py @tchaton @awaelchli @carmocca - +/src/pytorch_lightning/trainer/connectors/logger_connector @tchaton @carmocca +/src/pytorch_lightning/trainer/progress.py @tchaton @awaelchli @carmocca # API -/src/pytorch_lightning/callbacks/base.py @williamfalcon @awaelchli @ananthsub @carmocca -/src/pytorch_lightning/core/datamodule.py @williamFalcon @awaelchli @ananthsub @carmocca -/src/pytorch_lightning/trainer/trainer.py @williamfalcon @tchaton @awaelchli -/src/pytorch_lightning/core/hooks.py @williamfalcon @tchaton @awaelchli @ananthsub @carmocca -/src/pytorch_lightning/core/lightning.py @williamfalcon @tchaton @awaelchli - -# Testing -/tests/helpers/boring_model.py @williamfalcon @tchaton @borda +/src/pytorch_lightning/callbacks/callback.py @williamfalcon @awaelchli @ananthsub @carmocca +/src/pytorch_lightning/core/datamodule.py @williamFalcon @awaelchli @ananthsub @carmocca +/src/pytorch_lightning/trainer/trainer.py @williamfalcon @tchaton @awaelchli +/src/pytorch_lightning/core/hooks.py @williamfalcon @tchaton @awaelchli @ananthsub @carmocca +/src/pytorch_lightning/core/module.py @williamfalcon @tchaton @awaelchli -/.github/CODEOWNERS @williamfalcon -/.github/approve_config.yml @williamfalcon -/SECURITY.md @williamfalcon -/README.md @williamfalcon @edenlightning @borda -/setup.py @williamfalcon @borda @carmocca +/.github/CODEOWNERS @williamfalcon +/SECURITY.md @williamfalcon +/README.md @williamfalcon @edenlightning @borda +/setup.py @williamfalcon @borda @carmocca /src/pytorch_lightning/__about__.py @williamfalcon @borda @carmocca diff --git a/src/pytorch_lightning/__about__.py b/src/pytorch_lightning/__about__.py index 6d09c5264e1ab..e2fdbd9ee3016 100644 --- a/src/pytorch_lightning/__about__.py +++ b/src/pytorch_lightning/__about__.py @@ 
-13,7 +13,6 @@ # limitations under the License. import time -# __version__ = "1.7.0" __author__ = "Lightning AI et al." __author_email__ = "pytorch@lightning.ai" __license__ = "Apache-2.0" From 527b28ed974c326f9e86c334b0c5bd477b635f89 Mon Sep 17 00:00:00 2001 From: Krishna Kalyan Date: Wed, 10 Aug 2022 16:26:44 +0100 Subject: [PATCH 38/59] Fix mypy errors attributed to `pytorch_lightning.profilers.simple` (#14103) --- pyproject.toml | 1 - src/pytorch_lightning/profilers/simple.py | 19 +++++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8db782df357d8..b5e806bc69900 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,7 +57,6 @@ module = [ "pytorch_lightning.demos.mnist_datamodule", "pytorch_lightning.profilers.base", "pytorch_lightning.profilers.pytorch", - "pytorch_lightning.profilers.simple", "pytorch_lightning.strategies.sharded", "pytorch_lightning.strategies.sharded_spawn", "pytorch_lightning.trainer.callback_hook", diff --git a/src/pytorch_lightning/profilers/simple.py b/src/pytorch_lightning/profilers/simple.py index 20d76f9b2d378..0fb9497ff17fb 100644 --- a/src/pytorch_lightning/profilers/simple.py +++ b/src/pytorch_lightning/profilers/simple.py @@ -60,7 +60,7 @@ def __init__( """ super().__init__(dirpath=dirpath, filename=filename) self.current_actions: Dict[str, float] = {} - self.recorded_durations = defaultdict(list) + self.recorded_durations: Dict = defaultdict(list) self.extended = extended self.start_time = time.monotonic() @@ -104,20 +104,23 @@ def summary(self) -> str: if len(self.recorded_durations) > 0: max_key = max(len(k) for k in self.recorded_durations.keys()) - def log_row(action, mean, num_calls, total, per): + def log_row_extended(action: str, mean: str, num_calls: str, total: str, per: str) -> str: row = f"{sep}| {action:<{max_key}s}\t| {mean:<15}\t|" row += f" {num_calls:<15}\t| {total:<15}\t| {per:<15}\t|" return row - header_string = log_row("Action", "Mean duration 
(s)", "Num calls", "Total time (s)", "Percentage %") + header_string = log_row_extended( + "Action", "Mean duration (s)", "Num calls", "Total time (s)", "Percentage %" + ) output_string_len = len(header_string.expandtabs()) sep_lines = f"{sep}{'-' * output_string_len}" output_string += sep_lines + header_string + sep_lines - report, total_calls, total_duration = self._make_report_extended() - output_string += log_row("Total", "-", f"{total_calls:}", f"{total_duration:.5}", "100 %") + report_extended: _TABLE_DATA_EXTENDED + report_extended, total_calls, total_duration = self._make_report_extended() + output_string += log_row_extended("Total", "-", f"{total_calls:}", f"{total_duration:.5}", "100 %") output_string += sep_lines - for action, mean_duration, num_calls, total_duration, duration_per in report: - output_string += log_row( + for action, mean_duration, num_calls, total_duration, duration_per in report_extended: + output_string += log_row_extended( action, f"{mean_duration:.5}", f"{num_calls}", @@ -128,7 +131,7 @@ def log_row(action, mean, num_calls, total, per): else: max_key = max(len(k) for k in self.recorded_durations) - def log_row(action, mean, total): + def log_row(action: str, mean: str, total: str) -> str: return f"{sep}| {action:<{max_key}s}\t| {mean:<15}\t| {total:<15}\t|" header_string = log_row("Action", "Mean duration (s)", "Total time (s)") From 6f4edd721f9852d8f4afaa49edd1f80c5fc6dc72 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 10 Aug 2022 09:03:51 -0700 Subject: [PATCH 39/59] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 9c03e3707ec24..2d32094f6595f 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +### ** NEWS: PyTorch Lightning has been renamed Lightning! In addition to building models, you can now build research workflows and production pipelines** +
From f132d44821f9fe7ad83d74edbb13dc6ee7769a3d Mon Sep 17 00:00:00 2001 From: otaj <6065855+otaj@users.noreply.github.com> Date: Wed, 10 Aug 2022 18:09:50 +0200 Subject: [PATCH 40/59] Fix a bug that caused spurious `AttributeError` when multiple `DataLoader` classes are imported (#14117) --- src/pytorch_lightning/CHANGELOG.md | 3 +++ src/pytorch_lightning/utilities/data.py | 10 +++++---- tests/tests_pytorch/utilities/test_data.py | 25 ++++++++++++++++++++++ 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 90285b55c8037..97bb317b02a14 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -52,6 +52,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed +- Fixed a bug that caused spurious `AttributeError` when multiple `DataLoader` classes are imported ([#14117](https://github.com/Lightning-AI/lightning/pull/14117)) + + - Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061)) diff --git a/src/pytorch_lightning/utilities/data.py b/src/pytorch_lightning/utilities/data.py index 00a7cb8486709..b625a046f6122 100644 --- a/src/pytorch_lightning/utilities/data.py +++ b/src/pytorch_lightning/utilities/data.py @@ -501,15 +501,17 @@ def _replace_init_method(base_cls: Type, store_explicit_arg: Optional[str] = Non It patches the ``__init__`` method. 
""" classes = _get_all_subclasses(base_cls) | {base_cls} - wrapped = set() for cls in classes: - if cls.__init__ not in wrapped: + # Check that __init__ belongs to the class + # https://stackoverflow.com/a/5253424 + if "__init__" in cls.__dict__: cls._old_init = cls.__init__ cls.__init__ = _wrap_init_method(cls.__init__, store_explicit_arg) - wrapped.add(cls.__init__) yield for cls in classes: - if hasattr(cls, "_old_init"): + # Check that _old_init belongs to the class + # https://stackoverflow.com/a/5253424 + if "_old_init" in cls.__dict__: cls.__init__ = cls._old_init del cls._old_init diff --git a/tests/tests_pytorch/utilities/test_data.py b/tests/tests_pytorch/utilities/test_data.py index 3700feaba9992..cc70417988616 100644 --- a/tests/tests_pytorch/utilities/test_data.py +++ b/tests/tests_pytorch/utilities/test_data.py @@ -1,3 +1,4 @@ +import random from dataclasses import dataclass import pytest @@ -172,6 +173,30 @@ def __init__(self, randomize, *args, **kwargs): assert isinstance(new_dataloader, GoodImpl) +def test_replace_init_method_multiple_loaders_without_init(): + """In case of a class, that inherits from a class that we are patching, but doesn't define its own `__init__` + method (the one we are wrapping), it can happen, that `hasattr(cls, "_old_init")` is True because of parent + class, but it is impossible to delete, because that method is owned by parent class. Furthermore, the error + occured only sometimes because it depends on the order in which we are iterating over a set of classes we are + patching. + + This test simulates the behavior by generating sufficient number of dummy classes, which do not define `__init__` + and are children of `DataLoader`. We are testing that a) context manager `_replace_init_method` exits cleanly, and + b) the mechanism checking for presence of `_old_init` works as expected. 
+ """ + classes = [DataLoader] + for i in range(100): + classes.append(type(f"DataLoader_{i}", (random.choice(classes),), {})) + + with _replace_init_method(DataLoader, "dataset"): + for cls in classes[1:]: # First one is `DataLoader` + assert "_old_init" not in cls.__dict__ + assert hasattr(cls, "_old_init") + + assert "_old_init" in DataLoader.__dict__ + assert hasattr(DataLoader, "_old_init") + + class DataLoaderSubclass1(DataLoader): def __init__(self, attribute1, *args, **kwargs): self.at1 = attribute1 From 45a10a137cbbc7bd07bf3bf4b7c4b8b8a9439516 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 10 Aug 2022 18:22:44 +0200 Subject: [PATCH 41/59] update chlog after 0.5.5 (#14133) --- src/lightning_app/CHANGELOG.md | 65 ++++++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 3 deletions(-) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index f32d07697f376..ea28c57611311 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -9,27 +9,86 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added - Add support for `Lightning App Commands` through the `configure_commands` hook on the Lightning Flow and the `ClientCommand` ([#13602](https://github.com/Lightning-AI/lightning/pull/13602)) + + - Add support for Lightning AI BYOC cluster management ([#13835](https://github.com/Lightning-AI/lightning/pull/13835)) + + - Add support to run Lightning apps on Lightning AI BYOC clusters ([#13894](https://github.com/Lightning-AI/lightning/pull/13894)) + + - Add support for listing Lightning AI apps ([#13987](https://github.com/Lightning-AI/lightning/pull/13987)) + + - Adds `LightningTrainingComponent`. 
`LightningTrainingComponent` orchestrates multi-node training in the cloud ([#13830](https://github.com/Lightning-AI/lightning/pull/13830)) - Add support for printing application logs using CLI `lightning show logs [components]` ([#13634](https://github.com/Lightning-AI/lightning/pull/13634)) + ### Changed -- Update the Lightning App docs ([#13537](https://github.com/Lightning-AI/lightning/pull/13537)) +- + ### Changed -- Added `LIGHTNING_` prefix to Platform AWS credentials ([#13703](https://github.com/Lightning-AI/lightning/pull/13703)) +- + - Run the flow only if the state has changed from the previous execution ([#14076](https://github.com/Lightning-AI/lightning/pull/14076)) ### Deprecated +- + + ### Fixed -- Resolved a bug where the work statuses will grow quickly and be duplicated ([#13970](https://github.com/Lightning-AI/lightning/pull/13970)) +- + + +## [0.5.5] - 2022-08-09 +### Deprecated + +- Deprecate sheety API ([#14004](https://github.com/Lightning-AI/lightning/pull/14004)) + +### Fixed + +- Resolved a bug where the work statuses will grow quickly and be duplicated ([#13970](https://github.com/Lightning-AI/lightning/pull/13970)) - Resolved a bug about a race condition when sending the work state through the caller_queue ([#14074](https://github.com/Lightning-AI/lightning/pull/14074)) +- Fixed Start Lightning App on Cloud if Repo Begins With Name "Lightning" ([#14025](https://github.com/Lightning-AI/lightning/pull/14025)) + + +## [0.5.4] - 2022-08-01 + +### Changed + +- Wrapped imports for traceability ([#13924](https://github.com/Lightning-AI/lightning/pull/13924)) +- Set version as today ([#13906](https://github.com/Lightning-AI/lightning/pull/13906)) + +### Fixed + +- Included app templates to the lightning and app packages ([#13731](https://github.com/Lightning-AI/lightning/pull/13731)) +- Added UI for install all ([#13732](https://github.com/Lightning-AI/lightning/pull/13732)) +- Fixed build meta pkg flow
([#13926](https://github.com/Lightning-AI/lightning/pull/13926)) + +## [0.5.3] - 2022-07-25 + +### Changed + +- Pruned requirements duplicity ([#13739](https://github.com/Lightning-AI/lightning/pull/13739)) + +### Fixed + +- Use correct python version in lightning component template ([#13790](https://github.com/Lightning-AI/lightning/pull/13790)) + +## [0.5.2] - 2022-07-18 + +### Added + +- Update the Lightning App docs ([#13537](https://github.com/Lightning-AI/lightning/pull/13537)) + +### Changed + +- Added `LIGHTNING_` prefix to Platform AWS credentials ([#13703](https://github.com/Lightning-AI/lightning/pull/13703)) From e226180527b065813bb1ba5e83f4990c3b81d444 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Wed, 10 Aug 2022 19:26:01 +0200 Subject: [PATCH 42/59] (app) Remove ClickRunner (#14147) --- README.md | 2 +- tests/tests_app_examples/test_boring_app.py | 7 +------ 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 2d32094f6595f..f9d5a9a57f5e2 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -### ** NEWS: PyTorch Lightning has been renamed Lightning! In addition to building models, you can now build research workflows and production pipelines** +### \*\* NEWS: PyTorch Lightning has been renamed Lightning! In addition to building models, you can now build research workflows and production pipelines\*\*
diff --git a/tests/tests_app_examples/test_boring_app.py b/tests/tests_app_examples/test_boring_app.py index f8143b1db1a88..0ca1b823b4706 100644 --- a/tests/tests_app_examples/test_boring_app.py +++ b/tests/tests_app_examples/test_boring_app.py @@ -13,7 +13,6 @@ def test_boring_app_example_cloud() -> None: with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "examples/app_boring/"), app_name="app_dynamic.py") as ( _, view_page, - _, name, ): @@ -31,8 +30,4 @@ def check_hello_there(*_, **__): assert result.exit_code == 0 assert result.exception is None - assert len(lines) > 1, result.output - # We know that at some point we need to intstall lightning, so we check for that - assert any( - "Successfully built lightning" in line for line in lines - ), f"Did not find logs with lightning installation: {result.output}" + assert any("http://0.0.0.0:8080" in line for line in lines) From 3966f959aab2682df26f9712c37e468704304792 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 10 Aug 2022 19:38:39 +0200 Subject: [PATCH 43/59] relax `docker` requirement (#14009) --- requirements/app/cloud.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/app/cloud.txt b/requirements/app/cloud.txt index ff18d47b44565..6644a56a2894b 100644 --- a/requirements/app/cloud.txt +++ b/requirements/app/cloud.txt @@ -1,4 +1,4 @@ starsessions redis>=4.0.0, <=4.2.4 -docker==5.0.3 +docker>=5.0.0, <=5.0.3 # setuptools==59.5.0 From f11f1e2bb470a57f4043a41b1cdf194071c4be1e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 10 Aug 2022 19:40:34 +0200 Subject: [PATCH 44/59] Update gcsfs requirement from <2022.6.0,>=2021.5.0 to >=2021.5.0,<2022.8.0 in /requirements (#14079) Update gcsfs requirement in /requirements Updates the requirements on [gcsfs](https://github.com/fsspec/gcsfs) to permit the latest version. 
- [Release notes](https://github.com/fsspec/gcsfs/releases) - [Commits](https://github.com/fsspec/gcsfs/compare/2021.05.0...2022.7.1) --- updated-dependencies: - dependency-name: gcsfs dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements/pytorch/extra.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/pytorch/extra.txt b/requirements/pytorch/extra.txt index c386c5581cc42..20b6c1b8dbc12 100644 --- a/requirements/pytorch/extra.txt +++ b/requirements/pytorch/extra.txt @@ -7,5 +7,5 @@ torchtext>=0.10.*, <0.14.0 omegaconf>=2.0.5, <2.3.0 hydra-core>=1.0.5, <1.3.0 jsonargparse[signatures]>=4.12.0, <=4.12.0 -gcsfs>=2021.5.0, <2022.6.0 +gcsfs>=2021.5.0, <2022.8.0 rich>=10.14.0, !=10.15.0.a, <13.0.0 From 7e7736778bfc1f3864d878458b9de87de7ded52c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 10 Aug 2022 14:27:35 -0400 Subject: [PATCH 45/59] Update onnxruntime requirement from <=1.12.0 to <1.13.0 in /requirements (#14083) Updates the requirements on [onnxruntime](https://github.com/microsoft/onnxruntime) to permit the latest version. - [Release notes](https://github.com/microsoft/onnxruntime/releases) - [Changelog](https://github.com/microsoft/onnxruntime/blob/master/docs/ReleaseManagement.md) - [Commits](https://github.com/microsoft/onnxruntime/compare/v0.1.4...v1.12.1) --- updated-dependencies: - dependency-name: onnxruntime dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements/pytorch/test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/pytorch/test.txt b/requirements/pytorch/test.txt index c155400a3d35f..f8bd5793a0af6 100644 --- a/requirements/pytorch/test.txt +++ b/requirements/pytorch/test.txt @@ -10,7 +10,7 @@ mypy==0.971 # needed in tests cloudpickle>=1.3, <=2.1.0 scikit-learn>0.22.1, <=1.1.1 -onnxruntime<=1.12.0 +onnxruntime<1.13.0 psutil<=5.9.1 # for `DeviceStatsMonitor` pandas>1.0, <=1.4.3 # needed in benchmarks fastapi<=0.79.0 From 784b60412c1dec73c5f7c90ced343d2bbd394c25 Mon Sep 17 00:00:00 2001 From: panos-is <102533125+panos-is@users.noreply.github.com> Date: Wed, 10 Aug 2022 23:07:23 +0300 Subject: [PATCH 46/59] (app) Add s3 drive type (1/2) (#14002) * Add S3 protocol and optimization field to the drive object * Add a list of drives to the work specification * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add only protocol for s3 drives, no optimization arguments, and add tests * added trailing slash criteria * allow slash in s3 drives * fix * fixed test issues Co-authored-by: Panos Lantavos-Stratigakis Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Rick Izzo Co-authored-by: Jirka Borovec Co-authored-by: Rick Izzo --- src/lightning_app/storage/drive.py | 33 ++++++++++- tests/tests_app/storage/test_drive.py | 81 +++++++++++++++++++-------- 2 files changed, 90 insertions(+), 24 deletions(-) diff --git a/src/lightning_app/storage/drive.py b/src/lightning_app/storage/drive.py index 3bcdf72780653..b69d2581851b8 100644 --- a/src/lightning_app/storage/drive.py +++ b/src/lightning_app/storage/drive.py @@ -13,7 +13,7 @@ class Drive: __IDENTIFIER__ = "__drive__" - __PROTOCOLS__ = ["lit://"] + __PROTOCOLS__ = ["lit://", "s3://"] def __init__( self, @@ -35,15 
+35,28 @@ def __init__( root_folder: This is the folder from where the Drive perceives the data (e.g this acts as a mount dir). """ self.id = None + self.protocol = None for protocol in self.__PROTOCOLS__: if id.startswith(protocol): self.protocol = protocol self.id = id.replace(protocol, "") + break + else: # N.B. for-else loop + raise ValueError( + f"Unknown protocol for the drive 'id' argument '{id}`. The 'id' string " + f"must start with one of the following prefixes {self.__PROTOCOLS__}" + ) + + if self.protocol == "s3://" and not self.id.endswith("/"): + raise ValueError( + "S3 drives must end in a trailing slash (`/`) to indicate a folder is being mounted. " + f"Recieved: '{id}'. Mounting a single file is not currently supported." + ) if not self.id: raise Exception(f"The Drive id needs to start with one of the following protocols: {self.__PROTOCOLS__}") - if "/" in self.id: + if self.protocol != "s3://" and "/" in self.id: raise Exception(f"The id should be unique to identify your drive. Found `{self.id}`.") self.root_folder = pathlib.Path(root_folder).resolve() if root_folder else os.getcwd() @@ -75,6 +88,10 @@ def put(self, path: str) -> None: raise Exception("The component name needs to be known to put a path to the Drive.") if _is_flow_context(): raise Exception("The flow isn't allowed to put files into a Drive.") + if self.protocol == "s3://": + raise PermissionError( + "S3 based drives cannot currently add files via this API. Did you mean to use `lit://` drives?" + ) self._validate_path(path) @@ -98,6 +115,10 @@ def list(self, path: Optional[str] = ".", component_name: Optional[str] = None) """ if _is_flow_context(): raise Exception("The flow isn't allowed to list files from a Drive.") + if self.protocol == "s3://": + raise PermissionError( + "S3 based drives cannot currently list files via this API. Did you mean to use `lit://` drives?" 
+ ) if component_name: paths = [ @@ -142,6 +163,10 @@ def get( """ if _is_flow_context(): raise Exception("The flow isn't allowed to get files from a Drive.") + if self.protocol == "s3://": + raise PermissionError( + "S3 based drives cannot currently get files via this API. Did you mean to use `lit://` drives?" + ) if component_name: shared_path = self._to_shared_path( @@ -189,6 +214,10 @@ def delete(self, path: str) -> None: """ if not self.component_name: raise Exception("The component name needs to be known to delete a path to the Drive.") + if self.protocol == "s3://": + raise PermissionError( + "S3 based drives cannot currently delete files via this API. Did you mean to use `lit://` drives?" + ) shared_path = self._to_shared_path( path, diff --git a/tests/tests_app/storage/test_drive.py b/tests/tests_app/storage/test_drive.py index 3d9db44c10e13..0d452571d9f43 100644 --- a/tests/tests_app/storage/test_drive.py +++ b/tests/tests_app/storage/test_drive.py @@ -11,7 +11,7 @@ from lightning_app.utilities.component import _set_flow_context -class SyncWorkA(LightningWork): +class SyncWorkLITDriveA(LightningWork): def __init__(self, tmpdir): super().__init__() self.tmpdir = tmpdir @@ -25,19 +25,19 @@ def run(self, drive: Drive): os.remove(f"{self.tmpdir}/a.txt") -class SyncWorkB(LightningWork): +class SyncWorkLITDriveB(LightningWork): def run(self, drive: Drive): assert not os.path.exists("a.txt") drive.get("a.txt") assert os.path.exists("a.txt") -class SyncFlow(LightningFlow): +class SyncFlowLITDrives(LightningFlow): def __init__(self, tmpdir): super().__init__() self.log_dir = Drive("lit://log_dir") - self.work_a = SyncWorkA(str(tmpdir)) - self.work_b = SyncWorkB() + self.work_a = SyncWorkLITDriveA(str(tmpdir)) + self.work_b = SyncWorkLITDriveB() def run(self): self.work_a.run(self.log_dir) @@ -45,15 +45,15 @@ def run(self): self._exit() -def test_synchronization_drive(tmpdir): +def test_synchronization_lit_drive(tmpdir): if os.path.exists("a.txt"): 
os.remove("a.txt") - app = LightningApp(SyncFlow(tmpdir)) + app = LightningApp(SyncFlowLITDrives(tmpdir)) MultiProcessRuntime(app, start_server=False).dispatch() os.remove("a.txt") -class Work(LightningWork): +class LITDriveWork(LightningWork): def __init__(self): super().__init__(parallel=True) self.drive = None @@ -75,7 +75,7 @@ def run(self, *args, **kwargs): self.counter += 1 -class Work2(LightningWork): +class LITDriveWork2(LightningWork): def __init__(self): super().__init__(parallel=True) @@ -86,11 +86,11 @@ def run(self, drive: Drive, **kwargs): assert drive.list(".", component_name=self.name) == [] -class Flow(LightningFlow): +class LITDriveFlow(LightningFlow): def __init__(self): super().__init__() - self.work = Work() - self.work2 = Work2() + self.work = LITDriveWork() + self.work2 = LITDriveWork2() def run(self): self.work.run("0") @@ -102,15 +102,15 @@ def run(self): self._exit() -def test_drive_transferring_files(): - app = LightningApp(Flow()) +def test_lit_drive_transferring_files(): + app = LightningApp(LITDriveFlow()) MultiProcessRuntime(app, start_server=False).dispatch() os.remove("a.txt") -def test_drive(): - with pytest.raises(Exception, match="The Drive id needs to start with one of the following protocols"): - Drive("this_drive_id") +def test_lit_drive(): + with pytest.raises(Exception, match="Unknown protocol for the drive 'id' argument"): + Drive("invalid_drive_id") with pytest.raises( Exception, match="The id should be unique to identify your drive. Found `this_drive_id/something_else`." @@ -213,9 +213,46 @@ def test_drive(): os.remove("a.txt") -def test_maybe_create_drive(): +def test_s3_drives(): + drive = Drive("s3://foo/", allow_duplicates=True) + drive.component_name = "root.work" - drive = Drive("lit://drive_3", allow_duplicates=False) + with pytest.raises( + Exception, match="S3 based drives cannot currently add files via this API. Did you mean to use `lit://` drives?" 
+ ): + drive.put("a.txt") + with pytest.raises( + Exception, + match="S3 based drives cannot currently list files via this API. Did you mean to use `lit://` drives?", + ): + drive.list("a.txt") + with pytest.raises( + Exception, match="S3 based drives cannot currently get files via this API. Did you mean to use `lit://` drives?" + ): + drive.get("a.txt") + with pytest.raises( + Exception, + match="S3 based drives cannot currently delete files via this API. Did you mean to use `lit://` drives?", + ): + drive.delete("a.txt") + + _set_flow_context() + with pytest.raises(Exception, match="The flow isn't allowed to put files into a Drive."): + drive.put("a.txt") + with pytest.raises(Exception, match="The flow isn't allowed to list files from a Drive."): + drive.list("a.txt") + with pytest.raises(Exception, match="The flow isn't allowed to get files from a Drive."): + drive.get("a.txt") + + +def test_create_s3_drive_without_trailing_slash_fails(): + with pytest.raises(ValueError, match="S3 drives must end in a trailing slash"): + Drive("s3://foo") + + +@pytest.mark.parametrize("drive_id", ["lit://drive", "s3://drive/"]) +def test_maybe_create_drive(drive_id): + drive = Drive(drive_id, allow_duplicates=False) drive.component_name = "root.work1" new_drive = _maybe_create_drive(drive.component_name, drive.to_dict()) assert new_drive.protocol == drive.protocol @@ -223,9 +260,9 @@ def test_maybe_create_drive(): assert new_drive.component_name == drive.component_name -def test_drive_deepcopy(): - - drive = Drive("lit://drive", allow_duplicates=True) +@pytest.mark.parametrize("drive_id", ["lit://drive", "s3://drive/"]) +def test_drive_deepcopy(drive_id): + drive = Drive(drive_id, allow_duplicates=True) drive.component_name = "root.work1" new_drive = deepcopy(drive) assert new_drive.id == drive.id From 5396b1899fa2ed3de1a369a1551fa155a80c4321 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Wed, 10 Aug 2022 22:34:23 +0200 Subject: [PATCH 47/59] Resolve e2es V3 (#14153) update 
--- tests/tests_app_examples/test_boring_app.py | 4 ++++ tests/tests_app_examples/test_drive.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/tests_app_examples/test_boring_app.py b/tests/tests_app_examples/test_boring_app.py index 0ca1b823b4706..afb958571d16b 100644 --- a/tests/tests_app_examples/test_boring_app.py +++ b/tests/tests_app_examples/test_boring_app.py @@ -13,6 +13,7 @@ def test_boring_app_example_cloud() -> None: with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "examples/app_boring/"), app_name="app_dynamic.py") as ( _, view_page, + fetch_logs, name, ): @@ -24,6 +25,9 @@ def check_hello_there(*_, **__): wait_for(view_page, check_hello_there) + for _ in fetch_logs(): + pass + runner = CliRunner() result = runner.invoke(logs, [name]) lines = result.output.splitlines() diff --git a/tests/tests_app_examples/test_drive.py b/tests/tests_app_examples/test_drive.py index 630e76b550e9e..dde68d1a85113 100644 --- a/tests/tests_app_examples/test_drive.py +++ b/tests/tests_app_examples/test_drive.py @@ -18,7 +18,7 @@ def test_drive_example_cloud() -> None: has_logs = False while not has_logs: - for log in fetch_logs(["flow"]): + for log in fetch_logs(): if "Application End!" 
in log: has_logs = True sleep(1) From 4008f9cd414db2b0319b62ab4cb5d2193c6e97ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 10 Aug 2022 23:15:12 +0200 Subject: [PATCH 48/59] Convert subprocess test to standalone test (#14101) --- tests/tests_pytorch/run_standalone_tasks.sh | 10 ++- tests/tests_pytorch/serve/__init__.py | 0 tests/tests_pytorch/strategies/ddp_model.py | 58 ---------------- .../strategies/scripts/__init__.py | 0 .../strategies/scripts/cli_script.py | 24 +++++++ tests/tests_pytorch/strategies/test_ddp.py | 67 +++++++------------ tests/tests_pytorch/utilities/distributed.py | 45 ------------- 7 files changed, 55 insertions(+), 149 deletions(-) create mode 100644 tests/tests_pytorch/serve/__init__.py delete mode 100644 tests/tests_pytorch/strategies/ddp_model.py create mode 100644 tests/tests_pytorch/strategies/scripts/__init__.py create mode 100644 tests/tests_pytorch/strategies/scripts/cli_script.py delete mode 100644 tests/tests_pytorch/utilities/distributed.py diff --git a/tests/tests_pytorch/run_standalone_tasks.sh b/tests/tests_pytorch/run_standalone_tasks.sh index 960bd867ceaa4..698ed7863ab96 100644 --- a/tests/tests_pytorch/run_standalone_tasks.sh +++ b/tests/tests_pytorch/run_standalone_tasks.sh @@ -34,6 +34,10 @@ fi # test that a user can manually launch individual processes echo "Running manual ddp launch test" export PYTHONPATH="${PYTHONPATH}:$(pwd)" -args="--trainer.accelerator gpu --trainer.devices 2 --trainer.strategy ddp --trainer.max_epochs=1 --trainer.limit_train_batches=1 --trainer.limit_val_batches=1 --trainer.limit_test_batches=1" -MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=1 python ../../examples/convert_from_pt_to_pl/image_classifier_5_lightning_datamodule.py ${args} & -MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=0 python ../../examples/convert_from_pt_to_pl/image_classifier_5_lightning_datamodule.py ${args} +args="fit --trainer.accelerator gpu --trainer.devices 2 --trainer.strategy 
ddp --trainer.max_epochs=1 --trainer.limit_train_batches=1 --trainer.limit_val_batches=1 --trainer.limit_test_batches=1" +MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=1 python strategies/scripts/cli_script.py ${args} & +MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=0 python strategies/scripts/cli_script.py ${args} + +# test that ddp can launched as a module (-m option) +echo "Running ddp example as module" +python -m strategies.scripts.cli_script ${args} diff --git a/tests/tests_pytorch/serve/__init__.py b/tests/tests_pytorch/serve/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/tests_pytorch/strategies/ddp_model.py b/tests/tests_pytorch/strategies/ddp_model.py deleted file mode 100644 index 76d1f3f2f6866..0000000000000 --- a/tests/tests_pytorch/strategies/ddp_model.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""Runs either `.fit()` or `.test()` on a single node across multiple gpus.""" -import os -from argparse import ArgumentParser - -import torch - -from pytorch_lightning import seed_everything, Trainer -from tests_pytorch.helpers.datamodules import ClassifDataModule -from tests_pytorch.helpers.simple_models import ClassificationModel - - -def main(): - seed_everything(4321) - - parser = ArgumentParser(add_help=False) - parser = Trainer.add_argparse_args(parser) - parser.add_argument("--trainer_method", default="fit") - parser.add_argument("--tmpdir") - parser.add_argument("--workdir") - parser.set_defaults(accelerator="gpu", devices=2) - parser.set_defaults(strategy="ddp") - args = parser.parse_args() - - dm = ClassifDataModule() - model = ClassificationModel() - trainer = Trainer.from_argparse_args(args) - - if args.trainer_method == "fit": - trainer.fit(model, datamodule=dm) - result = None - elif args.trainer_method == "test": - result = trainer.test(model, datamodule=dm) - elif args.trainer_method == "fit_test": - trainer.fit(model, datamodule=dm) - result = trainer.test(model, datamodule=dm) - else: - raise ValueError(f"Unsupported: {args.trainer_method}") - - result_ext = {"status": "complete", "method": args.trainer_method, "result": result} - file_path = os.path.join(args.tmpdir, "ddp.result") - torch.save(result_ext, file_path) - - -if __name__ == "__main__": - main() diff --git a/tests/tests_pytorch/strategies/scripts/__init__.py b/tests/tests_pytorch/strategies/scripts/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/tests_pytorch/strategies/scripts/cli_script.py b/tests/tests_pytorch/strategies/scripts/cli_script.py new file mode 100644 index 0000000000000..17f0d29392eb9 --- /dev/null +++ b/tests/tests_pytorch/strategies/scripts/cli_script.py @@ -0,0 +1,24 @@ +# Copyright The PyTorch Lightning team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""A trivial script that wraps a LightningCLI around the BoringModel and BoringDataModule.""" +from pytorch_lightning.cli import LightningCLI +from pytorch_lightning.demos.boring_classes import BoringDataModule, BoringModel + +if __name__ == "__main__": + LightningCLI( + BoringModel, + BoringDataModule, + seed_everything_default=42, + save_config_overwrite=True, + ) diff --git a/tests/tests_pytorch/strategies/test_ddp.py b/tests/tests_pytorch/strategies/test_ddp.py index 1a2a0475e7ed6..9b196f3e2a97f 100644 --- a/tests/tests_pytorch/strategies/test_ddp.py +++ b/tests/tests_pytorch/strategies/test_ddp.py @@ -21,60 +21,41 @@ from torch.nn.parallel.distributed import DistributedDataParallel import pytorch_lightning as pl -from pytorch_lightning import Trainer +from pytorch_lightning import seed_everything, Trainer from pytorch_lightning.callbacks import Callback from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.strategies import DDPStrategy +from tests_pytorch.helpers.datamodules import ClassifDataModule from tests_pytorch.helpers.runif import RunIf -from tests_pytorch.strategies import ddp_model -from tests_pytorch.utilities.distributed import call_training_script +from tests_pytorch.helpers.simple_models import ClassificationModel -CLI_ARGS = "--max_epochs 1 --accelerator gpu --devices 2 --strategy ddp" +@RunIf(min_cuda_gpus=2, standalone=True) +def 
test_multi_gpu_model_ddp_fit_only(tmpdir): + dm = ClassifDataModule() + model = ClassificationModel() + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, accelerator="gpu", devices=2, strategy="ddp") + trainer.fit(model, datamodule=dm) -@RunIf(min_cuda_gpus=2) -@pytest.mark.parametrize("as_module", [True, False]) -def test_multi_gpu_model_ddp_fit_only(tmpdir, as_module): - # call the script - call_training_script(ddp_model, CLI_ARGS, "fit", tmpdir, timeout=120, as_module=as_module) - # load the results of the script - result_path = os.path.join(tmpdir, "ddp.result") - result = torch.load(result_path) +@RunIf(min_cuda_gpus=2, standalone=True) +def test_multi_gpu_model_ddp_test_only(tmpdir): + dm = ClassifDataModule() + model = ClassificationModel() + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, accelerator="gpu", devices=2, strategy="ddp") + trainer.test(model, datamodule=dm) - # verify the file wrote the expected outputs - assert result["status"] == "complete" +@RunIf(min_cuda_gpus=2, standalone=True) +def test_multi_gpu_model_ddp_fit_test(tmpdir): + seed_everything(4321) + dm = ClassifDataModule() + model = ClassificationModel() + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, accelerator="gpu", devices=2, strategy="ddp") + trainer.fit(model, datamodule=dm) + result = trainer.test(model, datamodule=dm) -@RunIf(min_cuda_gpus=2) -@pytest.mark.parametrize("as_module", [True, False]) -def test_multi_gpu_model_ddp_test_only(tmpdir, as_module): - # call the script - call_training_script(ddp_model, CLI_ARGS, "test", tmpdir, as_module=as_module) - - # load the results of the script - result_path = os.path.join(tmpdir, "ddp.result") - result = torch.load(result_path) - - # verify the file wrote the expected outputs - assert result["status"] == "complete" - - -@RunIf(min_cuda_gpus=2) -@pytest.mark.parametrize("as_module", [True, False]) -def test_multi_gpu_model_ddp_fit_test(tmpdir, as_module): - # call the script - 
call_training_script(ddp_model, CLI_ARGS, "fit_test", tmpdir, timeout=20, as_module=as_module) - - # load the results of the script - result_path = os.path.join(tmpdir, "ddp.result") - result = torch.load(result_path) - - # verify the file wrote the expected outputs - assert result["status"] == "complete" - - model_outs = result["result"] - for out in model_outs: + for out in result: assert out["test_acc"] > 0.7 diff --git a/tests/tests_pytorch/utilities/distributed.py b/tests/tests_pytorch/utilities/distributed.py deleted file mode 100644 index 38a50edcc7177..0000000000000 --- a/tests/tests_pytorch/utilities/distributed.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import os -import subprocess -import sys -from pathlib import Path -from subprocess import TimeoutExpired - -import pytorch_lightning - - -def call_training_script(module_file, cli_args, method, tmpdir, timeout=60, as_module=False): - file = Path(module_file.__file__).absolute() - cli_args = cli_args.split(" ") if cli_args else [] - cli_args += ["--tmpdir", str(tmpdir)] - cli_args += ["--trainer_method", method] - file_args = ["-m", module_file.__spec__.name] if as_module else [str(file)] - command = [sys.executable] + file_args + cli_args - - # need to set the PYTHONPATH in case pytorch_lightning was not installed into the environment - env = os.environ.copy() - env["PYTHONPATH"] = env.get("PYTHONPATH", "") + f"{pytorch_lightning.__file__}:" - - # for running in ddp mode, we need to launch it's own process or pytest will get stuck - p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) - try: - std, err = p.communicate(timeout=timeout) - err = str(err.decode("utf-8")) - if "Exception" in err: - raise Exception(err) - except TimeoutExpired: - p.kill() - std, err = p.communicate() - return std, err From 2a10a36b9211fbecdfc79dc0bdae9b972ec8f91d Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 10 Aug 2022 18:30:01 -0400 Subject: [PATCH 49/59] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f9d5a9a57f5e2..6f075f5fd42b6 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@
-**Build high-performance PyTorch models and deploy them with Lightning Apps (scalable end-to-end ML systems).** +**Build high-performance (PyTorch) models, research workflows, ML production pipelines.** ______________________________________________________________________ From a7cebf24169dbe80c5e718946cb5de931082f814 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 11 Aug 2022 01:32:32 +0200 Subject: [PATCH 50/59] Fix entry point test for Python 3.10 (#14154) --- .../trainer/connectors/test_callback_connector.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/tests_pytorch/trainer/connectors/test_callback_connector.py b/tests/tests_pytorch/trainer/connectors/test_callback_connector.py index d6d5018aa1dd0..02e846425a2a0 100644 --- a/tests/tests_pytorch/trainer/connectors/test_callback_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_callback_connector.py @@ -30,7 +30,7 @@ ) from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.trainer.connectors.callback_connector import CallbackConnector -from pytorch_lightning.utilities.imports import _PYTHON_GREATER_EQUAL_3_8_0 +from pytorch_lightning.utilities.imports import _PYTHON_GREATER_EQUAL_3_8_0, _PYTHON_GREATER_EQUAL_3_10_0 def test_checkpoint_callbacks_are_last(tmpdir): @@ -265,7 +265,10 @@ def _make_entry_point_query_mock(callback_factory): entry_point = Mock() entry_point.name = "mocked" entry_point.load.return_value = callback_factory - if _PYTHON_GREATER_EQUAL_3_8_0: + if _PYTHON_GREATER_EQUAL_3_10_0: + query_mock.return_value = [entry_point] + import_path = "importlib.metadata.entry_points" + elif _PYTHON_GREATER_EQUAL_3_8_0: query_mock().get.return_value = [entry_point] import_path = "importlib.metadata.entry_points" else: From 3dc08b1ef565774853467a7e56842becfa381dd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 11 Aug 2022 09:33:19 +0200 Subject: [PATCH 51/59] Fix flaky test 
caused by weak reference (#14157) --- tests/tests_pytorch/trainer/connectors/test_data_connector.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/tests_pytorch/trainer/connectors/test_data_connector.py b/tests/tests_pytorch/trainer/connectors/test_data_connector.py index 52ef4c4db6d8d..2650e46b7fa60 100644 --- a/tests/tests_pytorch/trainer/connectors/test_data_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_data_connector.py @@ -445,7 +445,8 @@ def test_dataloader_source_direct_access(): def test_dataloader_source_request_from_module(): """Test requesting a dataloader from a module works.""" module = BoringModel() - module.trainer = Trainer() + trainer = Trainer() + module.trainer = trainer module.foo = Mock(return_value=module.train_dataloader()) source = _DataLoaderSource(module, "foo") From 6eed72b621921856a846e39e4dd6bc9fd764348b Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 11 Aug 2022 12:35:00 +0200 Subject: [PATCH 52/59] (app) Introduce configure_api and Post, Get, Delete, Put HttpMethods (#13945) --- .github/workflows/ci-app-cloud-e2e-test.yml | 2 +- .../.lightning | 0 .../app.py | 18 +- .../command.py | 0 src/lightning_app/CHANGELOG.md | 1 + src/lightning_app/api/__init__.py | 3 + src/lightning_app/api/http_methods.py | 107 +++++++++++ src/lightning_app/api/request_types.py | 36 ++++ src/lightning_app/cli/lightning_cli.py | 50 ++--- src/lightning_app/core/api.py | 126 +++++-------- src/lightning_app/core/app.py | 32 ++-- src/lightning_app/core/flow.py | 33 ++++ src/lightning_app/core/queues.py | 22 +-- src/lightning_app/runners/backends/backend.py | 5 +- src/lightning_app/runners/multiprocess.py | 19 +- src/lightning_app/utilities/cli_helpers.py | 38 +++- src/lightning_app/utilities/commands/base.py | 175 ++++++++---------- src/lightning_app/utilities/enum.py | 6 + src/lightning_app/utilities/network.py | 3 +- tests/tests_app/core/test_lightning_api.py | 107 ++++++++--- 
tests/tests_app/utilities/test_app_logs.py | 2 + tests/tests_app/utilities/test_commands.py | 33 ++-- tests/tests_app_examples/test_commands.py | 32 ---- .../test_commands_and_api.py | 42 +++++ 24 files changed, 568 insertions(+), 324 deletions(-) rename examples/{app_commands => app_commands_and_api}/.lightning (100%) rename examples/{app_commands => app_commands_and_api}/app.py (56%) rename examples/{app_commands => app_commands_and_api}/command.py (100%) create mode 100644 src/lightning_app/api/__init__.py create mode 100644 src/lightning_app/api/http_methods.py create mode 100644 src/lightning_app/api/request_types.py delete mode 100644 tests/tests_app_examples/test_commands.py create mode 100644 tests/tests_app_examples/test_commands_and_api.py diff --git a/.github/workflows/ci-app-cloud-e2e-test.yml b/.github/workflows/ci-app-cloud-e2e-test.yml index 3ad455650a117..9a5a10a95cd33 100644 --- a/.github/workflows/ci-app-cloud-e2e-test.yml +++ b/.github/workflows/ci-app-cloud-e2e-test.yml @@ -54,7 +54,7 @@ jobs: - custom_work_dependencies - drive - payload - - commands + - commands_and_api timeout-minutes: 35 steps: - uses: actions/checkout@v2 diff --git a/examples/app_commands/.lightning b/examples/app_commands_and_api/.lightning similarity index 100% rename from examples/app_commands/.lightning rename to examples/app_commands_and_api/.lightning diff --git a/examples/app_commands/app.py b/examples/app_commands_and_api/app.py similarity index 56% rename from examples/app_commands/app.py rename to examples/app_commands_and_api/app.py index 99eb15c75c709..0d15bc531bb38 100644 --- a/examples/app_commands/app.py +++ b/examples/app_commands_and_api/app.py @@ -1,15 +1,16 @@ from command import CustomCommand, CustomConfig from lightning import LightningFlow +from lightning_app.api import Post from lightning_app.core.app import LightningApp class ChildFlow(LightningFlow): - def trigger_method(self, name: str): + def nested_command(self, name: str): print(f"Hello {name}") 
def configure_commands(self): - return [{"nested_trigger_command": self.trigger_method}] + return [{"nested_command": self.nested_command}] class FlowCommands(LightningFlow): @@ -19,21 +20,24 @@ def __init__(self): self.child_flow = ChildFlow() def run(self): - if len(self.names): + if self.names: print(self.names) - def trigger_without_client_command(self, name: str): + def command_without_client(self, name: str): self.names.append(name) - def trigger_with_client_command(self, config: CustomConfig): + def command_with_client(self, config: CustomConfig): self.names.append(config.name) def configure_commands(self): commands = [ - {"trigger_without_client_command": self.trigger_without_client_command}, - {"trigger_with_client_command": CustomCommand(self.trigger_with_client_command)}, + {"command_without_client": self.command_without_client}, + {"command_with_client": CustomCommand(self.command_with_client)}, ] return commands + self.child_flow.configure_commands() + def configure_api(self): + return [Post("/user/command_without_client", self.command_without_client)] + app = LightningApp(FlowCommands()) diff --git a/examples/app_commands/command.py b/examples/app_commands_and_api/command.py similarity index 100% rename from examples/app_commands/command.py rename to examples/app_commands_and_api/command.py diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index ea28c57611311..7158d1ff7a2da 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -25,6 +25,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
+- Add support for `Lightning API` through the `configure_api` hook on the Lightning Flow and the `Post`, `Get`, `Delete`, `Put` HttpMethods ([#13945](https://github.com/Lightning-AI/lightning/pull/13945)) ### Changed - diff --git a/src/lightning_app/api/__init__.py b/src/lightning_app/api/__init__.py new file mode 100644 index 0000000000000..25ec5c4708761 --- /dev/null +++ b/src/lightning_app/api/__init__.py @@ -0,0 +1,3 @@ +from lightning_app.api.http_methods import Delete, Get, Post, Put + +__all__ = ["Delete", "Get", "Post", "Put"] diff --git a/src/lightning_app/api/http_methods.py b/src/lightning_app/api/http_methods.py new file mode 100644 index 0000000000000..02b6ec87f17d2 --- /dev/null +++ b/src/lightning_app/api/http_methods.py @@ -0,0 +1,107 @@ +import asyncio +import inspect +import time +from copy import deepcopy +from functools import wraps +from multiprocessing import Queue +from typing import Any, Callable, Dict, List, Optional +from uuid import uuid4 + +from fastapi import FastAPI + +from lightning_app.api.request_types import APIRequest, CommandRequest + + +def _signature_proxy_function(): + pass + + +class HttpMethod: + def __init__(self, route: str, method: Callable, method_name: Optional[str] = None, timeout: int = 30, **kwargs): + """This class is used to inject user defined methods within the App Rest API. + + Arguments: + route: The path used to route the requests + method: The associated flow method + timeout: The time in seconds taken before raising a timeout exception. + """ + self.route = route + self.component_name = method.__self__.name + self.method_name = method_name or method.__name__ + self.method_annotations = method.__annotations__ + # TODO: Validate the signature contains only pydantic models. 
+ self.method_signature = inspect.signature(method) + self.timeout = timeout + self.kwargs = kwargs + + def add_route(self, app: FastAPI, request_queue: Queue, responses_store: Dict[str, Any]) -> None: + # 1: Create a proxy function with the signature of the wrapped method. + fn = deepcopy(_signature_proxy_function) + fn.__annotations__ = self.method_annotations + fn.__name__ = self.method_name + setattr(fn, "__signature__", self.method_signature) + + # 2: Get the route associated with the http method. + route = getattr(app, self.__class__.__name__.lower()) + + request_cls = CommandRequest if self.route.startswith("/command/") else APIRequest + + # 3: Define the request handler. + @wraps(_signature_proxy_function) + async def _handle_request(*args, **kwargs): + async def fn(*args, **kwargs): + request_id = str(uuid4()).split("-")[0] + request_queue.put( + request_cls( + name=self.component_name, + method_name=self.method_name, + args=args, + kwargs=kwargs, + id=request_id, + ) + ) + + t0 = time.time() + while request_id not in responses_store: + await asyncio.sleep(0.1) + if (time.time() - t0) > self.timeout: + raise Exception("The response was never received.") + + return responses_store.pop(request_id) + + return await asyncio.create_task(fn(*args, **kwargs)) + + # 4: Register the user provided route to the Rest API. 
+ route(self.route, **self.kwargs)(_handle_request) + + +class Post(HttpMethod): + pass + + +class Get(HttpMethod): + + pass + + +class Put(HttpMethod): + + pass + + +class Delete(HttpMethod): + pass + + +def _add_tags_to_api(apis: List[HttpMethod], tags: List[str]) -> None: + for api in apis: + if not api.kwargs.get("tag"): + api.kwargs["tags"] = tags + + +def _validate_api(apis: List[HttpMethod]) -> None: + for api in apis: + if not isinstance(api, HttpMethod): + raise Exception(f"The provided api should be either [{Delete}, {Get}, {Post}, {Put}]") + if api.route.startswith("/command"): + raise Exception("The route `/command` is reserved for commands. Please, use something else.") diff --git a/src/lightning_app/api/request_types.py b/src/lightning_app/api/request_types.py new file mode 100644 index 0000000000000..53a6df25820a3 --- /dev/null +++ b/src/lightning_app/api/request_types.py @@ -0,0 +1,36 @@ +from dataclasses import asdict, dataclass +from typing import Any + +from deepdiff import Delta + + +@dataclass +class BaseRequest: + def to_dict(self): + return asdict(self) + + +@dataclass +class DeltaRequest(BaseRequest): + delta: Delta + + def to_dict(self): + return self.delta.to_dict() + + +@dataclass +class CommandRequest(BaseRequest): + id: str + name: str + method_name: str + args: Any + kwargs: Any + + +@dataclass +class APIRequest(BaseRequest): + id: str + name: str + method_name: str + args: Any + kwargs: Any diff --git a/src/lightning_app/cli/lightning_cli.py b/src/lightning_app/cli/lightning_cli.py index babe0aa2b2abc..6a6e41df57026 100644 --- a/src/lightning_app/cli/lightning_cli.py +++ b/src/lightning_app/cli/lightning_cli.py @@ -4,7 +4,6 @@ from argparse import ArgumentParser from pathlib import Path from typing import List, Tuple, Union -from uuid import uuid4 import click import requests @@ -26,10 +25,10 @@ _retrieve_application_url_and_available_commands, ) from lightning_app.utilities.cloud import _get_project +from lightning_app.utilities.enum 
import OpenAPITags from lightning_app.utilities.install_components import register_all_external_components from lightning_app.utilities.login import Auth from lightning_app.utilities.network import LightningClient -from lightning_app.utilities.state import headers_for logger = logging.getLogger(__name__) @@ -263,41 +262,42 @@ def app_command(): hparams, argv = parser.parse_known_args() # 1: Collect the url and comments from the running application - url, commands = _retrieve_application_url_and_available_commands(hparams.app_id) - if url is None or commands is None: + url, api_commands = _retrieve_application_url_and_available_commands(hparams.app_id) + if url is None or api_commands is None: raise Exception("We couldn't find any matching running app.") - if not commands: + if not api_commands: raise Exception("This application doesn't expose any commands yet.") command = argv[0] - command_names = [c["command"] for c in commands] - if command not in command_names: - raise Exception(f"The provided command {command} isn't available in {command_names}") + if command not in api_commands: + raise Exception(f"The provided command {command} isn't available in {list(api_commands)}") # 2: Send the command from the user - command_metadata = [c for c in commands if c["command"] == command][0] - params = command_metadata["params"] + metadata = api_commands[command] # 3: Execute the command - if not command_metadata["is_client_command"]: - # TODO: Improve what is supported there. 
- kwargs = {k.split("=")[0].replace("--", ""): k.split("=")[1] for k in argv[1:]} - for param in params: - if param not in kwargs: - raise Exception(f"The argument --{param}=X hasn't been provided.") - json = { - "command_name": command, - "command_arguments": kwargs, - "affiliation": command_metadata["affiliation"], - "id": str(uuid4()), - } - resp = requests.post(url + "/api/v1/commands", json=json, headers=headers_for({})) + if metadata["tag"] == OpenAPITags.APP_COMMAND: + # TODO: Improve what is current supported + kwargs = [v.replace("--", "") for v in argv[1:]] + + for p in kwargs: + if p.split("=")[0] not in metadata["parameters"]: + raise Exception(f"Some arguments need to be provided. The keys are {list(metadata['parameters'])}.") + # TODO: Encode the parameters and validate their type. + query_parameters = "&".join(kwargs) + resp = requests.post(url + f"/command/{command}?{query_parameters}") assert resp.status_code == 200, resp.json() else: - client_command, models = _download_command(command_metadata, hparams.app_id, debug_mode=debug_mode) - client_command._setup(metadata=command_metadata, models=models, app_url=url) + client_command = _download_command( + command, + metadata["cls_path"], + metadata["cls_name"], + hparams.app_id, + debug_mode=debug_mode, + ) + client_command._setup(command_name=command, app_url=url) sys.argv = argv client_command.run() diff --git a/src/lightning_app/core/api.py b/src/lightning_app/core/api.py index f19ada5340d57..8b625713e0c2c 100644 --- a/src/lightning_app/core/api.py +++ b/src/lightning_app/core/api.py @@ -3,7 +3,6 @@ import os import queue import sys -import time import traceback from copy import deepcopy from multiprocessing import Queue @@ -21,9 +20,12 @@ from pydantic import BaseModel from websockets.exceptions import ConnectionClosed +from lightning_app.api.http_methods import HttpMethod +from lightning_app.api.request_types import DeltaRequest from lightning_app.core.constants import FRONTEND_DIR from 
lightning_app.core.queues import RedisQueue from lightning_app.utilities.app_helpers import InMemoryStateStore, StateStore +from lightning_app.utilities.enum import OpenAPITags from lightning_app.utilities.imports import _is_redis_available, _is_starsessions_available if _is_starsessions_available(): @@ -42,9 +44,6 @@ class SessionMiddleware: frontend_static_dir = os.path.join(FRONTEND_DIR, "static") api_app_delta_queue: Queue = None -api_commands_requests_queue: Queue = None -api_commands_metadata_queue: Queue = None -api_commands_responses_queue: Queue = None template = {"ui": {}, "app": {}} templates = Jinja2Templates(directory=FRONTEND_DIR) @@ -56,8 +55,8 @@ class SessionMiddleware: lock = Lock() app_spec: Optional[List] = None -app_commands_metadata: Optional[Dict] = None -commands_response_store = {} +# In the future, this would be abstracted to support horizontal scaling. +responses_store = {} logger = logging.getLogger(__name__) @@ -67,11 +66,10 @@ class SessionMiddleware: class UIRefresher(Thread): - def __init__(self, api_publish_state_queue, api_commands_metadata_queue, api_commands_responses_queue) -> None: + def __init__(self, api_publish_state_queue, api_response_queue) -> None: super().__init__(daemon=True) self.api_publish_state_queue = api_publish_state_queue - self.api_commands_metadata_queue = api_commands_metadata_queue - self.api_commands_responses_queue = api_commands_responses_queue + self.api_response_queue = api_response_queue self._exit_event = Event() def run(self): @@ -93,18 +91,11 @@ def run_once(self): pass try: - metadata = self.api_commands_metadata_queue.get(timeout=0) + response = self.api_response_queue.get(timeout=0) with lock: - global app_commands_metadata - app_commands_metadata = metadata - except queue.Empty: - pass - - try: - response = self.api_commands_responses_queue.get(timeout=0) - with lock: - global commands_response_store - commands_response_store[response["id"]] = response["response"] + # TODO: Abstract the 
responses store to support horizontal scaling. + global responses_store + responses_store[response["id"]] = response["response"] except queue.Empty: pass @@ -117,6 +108,23 @@ class StateUpdate(BaseModel): state: dict = {} +openapi_tags = [ + { + "name": OpenAPITags.APP_CLIENT_COMMAND, + "description": "The App Endpoints to be triggered exclusively from the CLI", + }, + { + "name": OpenAPITags.APP_COMMAND, + "description": "The App Endpoints that can be triggered equally from the CLI or from a Http Request", + }, + { + "name": OpenAPITags.APP_API, + "description": "The App Endpoints that can be triggered exclusively from a Http Request", + }, +] + +app = FastAPI(openapi_tags=openapi_tags) + fastapi_service = FastAPI() fastapi_service.add_middleware( @@ -176,50 +184,13 @@ async def get_spec( return app_spec or [] -@fastapi_service.post("/api/v1/commands", response_class=JSONResponse) -async def run_remote_command( - request: Request, -) -> None: - data = await request.json() - command_name = data.get("command_name", None) - if not command_name: - raise Exception("The provided command name is empty.") - command_arguments = data.get("command_arguments", None) - if not command_arguments: - raise Exception("The provided command metadata is empty.") - affiliation = data.get("affiliation", None) - if not affiliation: - raise Exception("The provided affiliation is empty.") - - async def fn(data): - request_id = data["id"] - api_commands_requests_queue.put(data) - - t0 = time.time() - while request_id not in commands_response_store: - await asyncio.sleep(0.1) - if (time.time() - t0) > 15: - raise Exception("The response was never received.") - - return commands_response_store[request_id] - - return await asyncio.create_task(fn(data)) - - -@fastapi_service.get("/api/v1/commands", response_class=JSONResponse) -async def get_commands() -> Optional[Dict]: - global app_commands_metadata - with lock: - return app_commands_metadata - - @fastapi_service.post("/api/v1/delta") async 
def post_delta( request: Request, x_lightning_type: Optional[str] = Header(None), x_lightning_session_uuid: Optional[str] = Header(None), x_lightning_session_id: Optional[str] = Header(None), -) -> Mapping: +) -> None: """This endpoint is used to make an update to the app state using delta diff, mainly used by streamlit to update the state.""" @@ -229,9 +200,7 @@ async def post_delta( raise Exception("Missing X-Lightning-Session-ID header") body: Dict = await request.json() - delta = body["delta"] - update_delta = Delta(delta) - api_app_delta_queue.put(update_delta) + api_app_delta_queue.put(DeltaRequest(delta=Delta(body["delta"]))) @fastapi_service.post("/api/v1/state") @@ -240,7 +209,7 @@ async def post_state( x_lightning_type: Optional[str] = Header(None), x_lightning_session_uuid: Optional[str] = Header(None), x_lightning_session_id: Optional[str] = Header(None), -) -> Mapping: +) -> None: if x_lightning_session_uuid is None: raise Exception("Missing X-Lightning-Session-UUID header") if x_lightning_session_id is None: @@ -263,8 +232,7 @@ async def post_state( state = body["state"] last_state = global_app_state_store.get_served_state(x_lightning_session_uuid) deep_diff = DeepDiff(last_state, state, verbose_level=2) - update_delta = Delta(deep_diff) - api_app_delta_queue.put(update_delta) + api_app_delta_queue.put(DeltaRequest(delta=Delta(deep_diff))) @fastapi_service.get("/healthz", status_code=200) @@ -307,8 +275,6 @@ async def websocket_endpoint(websocket: WebSocket): await websocket.close() -# Catch-all for nonexistent API routes (since we define a catch-all for client-side routing) -@fastapi_service.get("/api{full_path:path}", response_class=JSONResponse) async def api_catch_all(request: Request, full_path: str): raise HTTPException(status_code=404, detail="Not found") @@ -317,14 +283,18 @@ async def api_catch_all(request: Request, full_path: str): fastapi_service.mount("/static", StaticFiles(directory=frontend_static_dir, check_dir=False), name="static") -# 
Catch-all for frontend routes, must be defined after all other routes -@fastapi_service.get("/{full_path:path}", response_class=HTMLResponse) async def frontend_route(request: Request, full_path: str): if "pytest" in sys.modules: return "" return templates.TemplateResponse("index.html", {"request": request}) +def register_global_routes(): + # Catch-all for nonexistent API routes (since we define a catch-all for client-side routing) + fastapi_service.get("/api{full_path:path}", response_class=JSONResponse)(api_catch_all) + fastapi_service.get("/{full_path:path}", response_class=HTMLResponse)(frontend_route) + + class LightningUvicornServer(uvicorn.Server): has_started_queue = None @@ -346,34 +316,28 @@ async def check_is_started(self, queue): def start_server( api_publish_state_queue, api_delta_queue, - commands_requests_queue, - commands_responses_queue, - commands_metadata_queue, + api_response_queue, has_started_queue: Optional[Queue] = None, host="127.0.0.1", port=8000, uvicorn_run: bool = True, spec: Optional[List] = None, + apis: Optional[List[HttpMethod]] = None, app_state_store: Optional[StateStore] = None, ): global api_app_delta_queue global global_app_state_store - global api_commands_requests_queue - global api_commands_responses_queue global app_spec app_spec = spec api_app_delta_queue = api_delta_queue - api_commands_requests_queue = commands_requests_queue - api_commands_responses_queue = commands_responses_queue - api_commands_metadata_queue = commands_metadata_queue if app_state_store is not None: global_app_state_store = app_state_store global_app_state_store.add(TEST_SESSION_UUID) - refresher = UIRefresher(api_publish_state_queue, api_commands_metadata_queue, commands_responses_queue) + refresher = UIRefresher(api_publish_state_queue, api_response_queue) refresher.setDaemon(True) refresher.start() @@ -384,6 +348,14 @@ def start_server( LightningUvicornServer.has_started_queue = has_started_queue # uvicorn is doing some uglyness by replacing 
uvicorn.main by click command. sys.modules["uvicorn.main"].Server = LightningUvicornServer + + # Register the user API. + if apis: + for api in apis: + api.add_route(fastapi_service, api_app_delta_queue, responses_store) + + register_global_routes() + uvicorn.run(app=fastapi_service, host=host, port=port, log_level="error") return refresher diff --git a/src/lightning_app/core/app.py b/src/lightning_app/core/app.py index 3f9e2521eb21d..65242a1ae0a2a 100644 --- a/src/lightning_app/core/app.py +++ b/src/lightning_app/core/app.py @@ -11,12 +11,13 @@ from deepdiff import DeepDiff, Delta import lightning_app +from lightning_app.api.request_types import APIRequest, CommandRequest, DeltaRequest from lightning_app.core.constants import FLOW_DURATION_SAMPLES, FLOW_DURATION_THRESHOLD, STATE_ACCUMULATE_WAIT from lightning_app.core.queues import BaseQueue, SingleProcessQueue from lightning_app.frontend import Frontend from lightning_app.storage.path import storage_root_dir from lightning_app.utilities.app_helpers import _delta_to_app_state_delta, _LightningAppRef -from lightning_app.utilities.commands.base import _populate_commands_endpoint, _process_command_requests +from lightning_app.utilities.commands.base import _process_requests from lightning_app.utilities.component import _convert_paths_after_init from lightning_app.utilities.enum import AppStage, CacheCallsKeys from lightning_app.utilities.exceptions import CacheMissException, ExitAppException @@ -73,9 +74,7 @@ def __init__( # queues definition. 
self.delta_queue: t.Optional[BaseQueue] = None self.readiness_queue: t.Optional[BaseQueue] = None - self.commands_requests_queue: t.Optional[BaseQueue] = None - self.commands_responses_queue: t.Optional[BaseQueue] = None - self.commands_metadata_queue: t.Optional[BaseQueue] = None + self.api_response_queue: t.Optional[BaseQueue] = None self.api_publish_state_queue: t.Optional[BaseQueue] = None self.api_delta_queue: t.Optional[BaseQueue] = None self.error_queue: t.Optional[BaseQueue] = None @@ -253,7 +252,7 @@ def named_works(self) -> t.List[t.Tuple[str, "lightning_app.LightningWork"]]: """Returns all the works defined within this application with their names.""" return self.root.named_works(recurse=True) - def _collect_deltas_from_ui_and_work_queues(self) -> t.List[Delta]: + def _collect_deltas_from_ui_and_work_queues(self) -> t.List[t.Union[Delta, APIRequest, CommandRequest]]: # The aggregation would try to get as many deltas as possible # from both the `api_delta_queue` and `delta_queue` # during the `state_accumulate_wait` time. @@ -267,8 +266,12 @@ def _collect_deltas_from_ui_and_work_queues(self) -> t.List[Delta]: while (time() - t0) < self.state_accumulate_wait: if self.api_delta_queue and should_get_delta_from_api: - delta_from_api: Delta = self.get_state_changed_from_queue(self.api_delta_queue) # TODO: rename + delta_from_api: t.Union[DeltaRequest, APIRequest, CommandRequest] = self.get_state_changed_from_queue( + self.api_delta_queue + ) # TODO: rename if delta_from_api: + if isinstance(delta_from_api, DeltaRequest): + delta_from_api = delta_from_api.delta deltas.append(delta_from_api) else: should_get_delta_from_api = False @@ -317,8 +320,19 @@ def maybe_apply_changes(self) -> bool: logger.debug(f"Received {[d.to_dict() for d in deltas]}") - state = self.state + # 1: Process the API / Command Requests first as they might affect the state. 
+ state_deltas = [] for delta in deltas: + if isinstance(delta, (APIRequest, CommandRequest)): + _process_requests(self, delta) + else: + state_deltas.append(delta) + + # 2: Collect the state + state = self.state + + # 3: Apply the state delta + for delta in state_deltas: try: state += delta except Exception as e: @@ -351,8 +365,6 @@ def run_once(self): elif self.stage == AppStage.RESTARTING: return self._apply_restarting() - _process_command_requests(self) - t0 = time() try: @@ -411,8 +423,6 @@ def _run(self) -> bool: self._reset_run_time_monitor() - _populate_commands_endpoint(self) - while not done: done = self.run_once() diff --git a/src/lightning_app/core/flow.py b/src/lightning_app/core/flow.py index f6b6e34e81538..41c46cd868307 100644 --- a/src/lightning_app/core/flow.py +++ b/src/lightning_app/core/flow.py @@ -634,3 +634,36 @@ def my_remote_method(self, name): lightning my_command_name --args name=my_own_name """ raise NotImplementedError + + def configure_api(self): + """Configure the API routes of the LightningFlow. + + Returns a list of HttpMethod such as Post or Get. + + .. code-block:: python + + from lightning_app import LightningFlow + from lightning_app.api import Post + + from pydantic import BaseModel + + + class HandlerModel(BaseModel): + name: str + + + class Flow(L.LightningFlow): + def __init__(self): + super().__init__() + self.names = [] + + def handler(self, config: HandlerModel) -> None: + self.names.append(config.name) + + def configure_api(self): + return [Post("/v1/api/request", self.handler)] + + Once the app is running, you can access the Swagger UI of the app + under the ``/docs`` route. 
+ """ + raise NotImplementedError diff --git a/src/lightning_app/core/queues.py b/src/lightning_app/core/queues.py index efac8230047e0..2b7295d7f327f 100644 --- a/src/lightning_app/core/queues.py +++ b/src/lightning_app/core/queues.py @@ -36,9 +36,7 @@ ORCHESTRATOR_COPY_REQUEST_CONSTANT = "ORCHESTRATOR_COPY_REQUEST" ORCHESTRATOR_COPY_RESPONSE_CONSTANT = "ORCHESTRATOR_COPY_RESPONSE" WORK_QUEUE_CONSTANT = "WORK_QUEUE" -COMMANDS_REQUESTS_QUEUE_CONSTANT = "COMMANDS_REQUESTS_QUEUE" -COMMANDS_RESPONSES_QUEUE_CONSTANT = "COMMANDS_RESPONSES_QUEUE" -COMMANDS_METADATA_QUEUE_CONSTANT = "COMMANDS_METADATA_QUEUE" +API_RESPONSE_QUEUE_CONSTANT = "API_RESPONSE_QUEUE" class QueuingSystem(Enum): @@ -54,18 +52,8 @@ def _get_queue(self, queue_name: str) -> "BaseQueue": else: return SingleProcessQueue(queue_name, default_timeout=STATE_UPDATE_TIMEOUT) - def get_commands_requests_queue(self, queue_id: Optional[str] = None) -> "BaseQueue": - queue_name = f"{queue_id}_{COMMANDS_REQUESTS_QUEUE_CONSTANT}" if queue_id else COMMANDS_REQUESTS_QUEUE_CONSTANT - return self._get_queue(queue_name) - - def get_commands_responses_queue(self, queue_id: Optional[str] = None) -> "BaseQueue": - queue_name = ( - f"{queue_id}_{COMMANDS_RESPONSES_QUEUE_CONSTANT}" if queue_id else COMMANDS_RESPONSES_QUEUE_CONSTANT - ) - return self._get_queue(queue_name) - - def get_commands_metadata_queue(self, queue_id: Optional[str] = None) -> "BaseQueue": - queue_name = f"{queue_id}_{COMMANDS_METADATA_QUEUE_CONSTANT}" if queue_id else COMMANDS_METADATA_QUEUE_CONSTANT + def get_api_response_queue(self, queue_id: Optional[str] = None) -> "BaseQueue": + queue_name = f"{queue_id}_{API_RESPONSE_QUEUE_CONSTANT}" if queue_id else API_RESPONSE_QUEUE_CONSTANT return self._get_queue(queue_name) def get_readiness_queue(self, queue_id: Optional[str] = None) -> "BaseQueue": @@ -98,10 +86,6 @@ def get_api_delta_queue(self, queue_id: Optional[str] = None) -> "BaseQueue": queue_name = f"{queue_id}_{API_DELTA_QUEUE_CONSTANT}" if queue_id 
else API_DELTA_QUEUE_CONSTANT return self._get_queue(queue_name) - def get_api_refresh_queue(self, queue_id: Optional[str] = None) -> "BaseQueue": - queue_name = f"{queue_id}_{API_REFRESH_QUEUE_CONSTANT}" if queue_id else API_REFRESH_QUEUE_CONSTANT - return self._get_queue(queue_name) - def get_orchestrator_request_queue(self, work_name: str, queue_id: Optional[str] = None) -> "BaseQueue": queue_name = ( f"{queue_id}_{ORCHESTRATOR_REQUEST_CONSTANT}_{work_name}" diff --git a/src/lightning_app/runners/backends/backend.py b/src/lightning_app/runners/backends/backend.py index 87bb103823fd2..a944cd4aa9093 100644 --- a/src/lightning_app/runners/backends/backend.py +++ b/src/lightning_app/runners/backends/backend.py @@ -82,11 +82,8 @@ def _prepare_queues(self, app): kw = dict(queue_id=self.queue_id) app.delta_queue = self.queues.get_delta_queue(**kw) app.readiness_queue = self.queues.get_readiness_queue(**kw) - app.commands_requests_queue = self.queues.get_commands_requests_queue(**kw) - app.commands_responses_queue = self.queues.get_commands_responses_queue(**kw) - app.commands_metadata_queue = self.queues.get_commands_metadata_queue(**kw) + app.api_response_queue = self.queues.get_api_response_queue(**kw) app.error_queue = self.queues.get_error_queue(**kw) - app.delta_queue = self.queues.get_delta_queue(**kw) app.api_publish_state_queue = self.queues.get_api_state_publish_queue(**kw) app.api_delta_queue = self.queues.get_api_delta_queue(**kw) app.request_queues = {} diff --git a/src/lightning_app/runners/multiprocess.py b/src/lightning_app/runners/multiprocess.py index 92ec900d89c65..16e373b0a37a2 100644 --- a/src/lightning_app/runners/multiprocess.py +++ b/src/lightning_app/runners/multiprocess.py @@ -3,10 +3,13 @@ from dataclasses import dataclass from typing import Any, Callable, Optional, Union +from lightning_app.api.http_methods import _add_tags_to_api, _validate_api from lightning_app.core.api import start_server from lightning_app.runners.backends import Backend 
from lightning_app.runners.runtime import Runtime from lightning_app.storage.orchestrator import StorageOrchestrator +from lightning_app.utilities.app_helpers import is_overridden +from lightning_app.utilities.commands.base import _commands_to_api, _prepare_commands from lightning_app.utilities.component import _set_flow_context, _set_frontend_context from lightning_app.utilities.load_app import extract_metadata_from_app from lightning_app.utilities.network import find_free_network_port @@ -60,15 +63,25 @@ def dispatch(self, *args: Any, on_before_run: Optional[Callable] = None, **kwarg if self.start_server: self.app.should_publish_changes_to_api = True has_started_queue = self.backend.queues.get_has_server_started_queue() + + apis = [] + if is_overridden("configure_api", self.app.root): + apis = self.app.root.configure_api() + _validate_api(apis) + _add_tags_to_api(apis, ["app_api"]) + + if is_overridden("configure_commands", self.app.root): + commands = _prepare_commands(self.app) + apis += _commands_to_api(commands) + kwargs = dict( + apis=apis, host=self.host, port=self.port, + api_response_queue=self.app.api_response_queue, api_publish_state_queue=self.app.api_publish_state_queue, api_delta_queue=self.app.api_delta_queue, has_started_queue=has_started_queue, - commands_requests_queue=self.app.commands_requests_queue, - commands_responses_queue=self.app.commands_responses_queue, - commands_metadata_queue=self.app.commands_metadata_queue, spec=extract_metadata_from_app(self.app), ) server_proc = multiprocessing.Process(target=start_server, kwargs=kwargs) diff --git a/src/lightning_app/utilities/cli_helpers.py b/src/lightning_app/utilities/cli_helpers.py index fcce96ec64407..6000114c3d4d6 100644 --- a/src/lightning_app/utilities/cli_helpers.py +++ b/src/lightning_app/utilities/cli_helpers.py @@ -49,16 +49,42 @@ def _is_url(id: Optional[str]) -> bool: return False +def _get_metadata_from_openapi(paths: Dict, path: str): + parameters = 
paths[path]["post"].get("parameters", {}) + tag = paths[path]["post"].get("tags", [None])[0] + cls_path = paths[path]["post"].get("cls_path", None) + cls_name = paths[path]["post"].get("cls_name", None) + + metadata = {"tag": tag, "parameters": {}} + + if cls_path: + metadata["cls_path"] = cls_path + + if cls_name: + metadata["cls_name"] = cls_name + + if not parameters: + return metadata + + metadata["parameters"].update({d["name"]: d["schema"]["type"] for d in parameters}) + return metadata + + +def _extract_command_from_openapi(openapi_resp: Dict) -> Dict[str, Dict[str, str]]: + command_paths = [p for p in openapi_resp["paths"] if p.startswith("/command/")] + return {p.replace("/command/", ""): _get_metadata_from_openapi(openapi_resp["paths"], p) for p in command_paths} + + def _retrieve_application_url_and_available_commands(app_id_or_name_or_url: Optional[str]): """This function is used to retrieve the current url associated with an id.""" if _is_url(app_id_or_name_or_url): url = app_id_or_name_or_url assert url - resp = requests.get(url + "/api/v1/commands") + resp = requests.get(url + "/openapi.json") if resp.status_code != 200: raise Exception(f"The server didn't process the request properly. Found {resp.json()}") - return url, resp.json() + return url, _extract_command_from_openapi(resp.json()) # 2: If no identifier has been provided, evaluate the local application failed_locally = False @@ -66,10 +92,10 @@ def _retrieve_application_url_and_available_commands(app_id_or_name_or_url: Opti if app_id_or_name_or_url is None: try: url = f"http://localhost:{APP_SERVER_PORT}" - resp = requests.get(f"{url}/api/v1/commands") + resp = requests.get(f"{url}/openapi.json") if resp.status_code != 200: raise Exception(f"The server didn't process the request properly. 
Found {resp.json()}") - return url, resp.json() + return url, _extract_command_from_openapi(resp.json()) except requests.exceptions.ConnectionError: failed_locally = True @@ -88,8 +114,8 @@ def _retrieve_application_url_and_available_commands(app_id_or_name_or_url: Opti if lightningapp.id == app_id_or_name_or_url or lightningapp.name == app_id_or_name_or_url: if lightningapp.status.url == "": raise Exception("The application is starting. Try in a few moments.") - resp = requests.get(lightningapp.status.url + "/api/v1/commands") + resp = requests.get(lightningapp.status.url + "/openapi.json") if resp.status_code != 200: raise Exception(f"The server didn't process the request properly. Found {resp.json()}") - return lightningapp.status.url, resp.json() + return lightningapp.status.url, _extract_command_from_openapi(resp.json()) return None, None diff --git a/src/lightning_app/utilities/commands/base.py b/src/lightning_app/utilities/commands/base.py index b87b41b05df42..c74926f542744 100644 --- a/src/lightning_app/utilities/commands/base.py +++ b/src/lightning_app/utilities/commands/base.py @@ -1,6 +1,5 @@ import errno import inspect -import logging import os import os.path as osp import shutil @@ -8,19 +7,18 @@ from getpass import getuser from importlib.util import module_from_spec, spec_from_file_location from tempfile import gettempdir -from typing import Any, Callable, Dict, List, Optional, Tuple -from uuid import uuid4 +from typing import Any, Callable, Dict, List, Optional, Union import requests from pydantic import BaseModel +from lightning_app.api.http_methods import Post +from lightning_app.api.request_types import APIRequest, CommandRequest from lightning_app.utilities.app_helpers import is_overridden from lightning_app.utilities.cloud import _get_project from lightning_app.utilities.network import LightningClient from lightning_app.utilities.state import AppState -_logger = logging.getLogger(__name__) - def makedirs(path: str): r"""Recursive directory 
creation function.""" @@ -31,31 +29,18 @@ def makedirs(path: str): raise e -class _ClientCommandConfig(BaseModel): - command: str - affiliation: str - params: Dict[str, str] - is_client_command: bool - cls_path: str - cls_name: str - owner: str - requirements: Optional[List[str]] - - class ClientCommand: def __init__(self, method: Callable, requirements: Optional[List[str]] = None) -> None: self.method = method flow = getattr(method, "__self__", None) self.owner = flow.name if flow else None self.requirements = requirements - self.metadata = None self.models: Optional[Dict[str, BaseModel]] = None self.app_url = None self._state = None - def _setup(self, metadata: Dict[str, Any], models: Dict[str, BaseModel], app_url: str) -> None: - self.metadata = metadata - self.models = models + def _setup(self, command_name: str, app_url: str) -> None: + self.command_name = command_name self.app_url = app_url @property @@ -72,67 +57,50 @@ def state(self): def run(self, **cli_kwargs) -> None: """Overrides with the logic to execute on the client side.""" - def invoke_handler(self, **kwargs: Any) -> Dict[str, Any]: - from lightning.app.utilities.state import headers_for - - assert kwargs.keys() == self.models.keys() - for k, v in kwargs.items(): - assert isinstance(v, self.models[k]) - json = { - "command_name": self.metadata["command"], - "command_arguments": {k: v.json() for k, v in kwargs.items()}, - "affiliation": self.metadata["affiliation"], - "id": str(uuid4()), - } - resp = requests.post(self.app_url + "/api/v1/commands", json=json, headers=headers_for({})) + def invoke_handler(self, config: BaseModel) -> Dict[str, Any]: + resp = requests.post(self.app_url + f"/command/{self.command_name}", data=config.json()) assert resp.status_code == 200, resp.json() return resp.json() def _to_dict(self): return {"owner": self.owner, "requirements": self.requirements} - def __call__(self, **kwargs: Any) -> Any: - assert self.models - input = {} - for k, v in kwargs.items(): - input[k] = 
self.models[k].parse_raw(v) - return self.method(**input) + def __call__(self, **kwargs): + return self.method(**kwargs) def _download_command( - command_metadata: Dict[str, Any], - app_id: Optional[str], + command_name: str, + cls_path: str, + cls_name: str, + app_id: Optional[str] = None, debug_mode: bool = False, -) -> Tuple[ClientCommand, Dict[str, BaseModel]]: +) -> ClientCommand: # TODO: This is a skateboard implementation and the final version will rely on versioned # immutable commands for security concerns - config = _ClientCommandConfig(**command_metadata) tmpdir = osp.join(gettempdir(), f"{getuser()}_commands") makedirs(tmpdir) - target_file = osp.join(tmpdir, f"{config.command}.py") + target_file = osp.join(tmpdir, f"{command_name}.py") if app_id: client = LightningClient() project_id = _get_project(client).project_id response = client.lightningapp_instance_service_list_lightningapp_instance_artifacts(project_id, app_id) for artifact in response.artifacts: - if f"commands/{config.command}.py" == artifact.filename: + if f"commands/{command_name}.py" == artifact.filename: r = requests.get(artifact.url, allow_redirects=True) with open(target_file, "wb") as f: f.write(r.content) else: if not debug_mode: - shutil.copy(config.cls_path, target_file) + shutil.copy(cls_path, target_file) - cls_name = config.cls_name - spec = spec_from_file_location(config.cls_name, config.cls_path if debug_mode else target_file) + spec = spec_from_file_location(cls_name, cls_path if debug_mode else target_file) mod = module_from_spec(spec) sys.modules[cls_name] = mod spec.loader.exec_module(mod) - command = getattr(mod, cls_name)(method=None, requirements=config.requirements) - models = {k: getattr(mod, v) for k, v in config.params.items()} - if debug_mode: - shutil.rmtree(tmpdir) - return command, models + command = getattr(mod, cls_name)(method=None, requirements=[]) + shutil.rmtree(tmpdir) + return command def _to_annotation(anno: str) -> str: @@ -142,7 +110,7 @@ def 
_to_annotation(anno: str) -> str: return anno -def _command_to_method_and_metadata(command: ClientCommand) -> Tuple[Callable, Dict[str, Any]]: +def _validate_client_command(command: ClientCommand): """Extract method and its metadata from a ClientCommand.""" params = inspect.signature(command.method).parameters command_metadata = { @@ -170,8 +138,6 @@ def _command_to_method_and_metadata(command: ClientCommand) -> Tuple[Callable, D raise Exception( f"The provided annotation for the argument {k} shouldn't an instance of pydantic BaseModel." ) - command.models[k] = config - return method, command_metadata def _upload_command(command_name: str, command: ClientCommand) -> Optional[str]: @@ -192,55 +158,68 @@ def _upload_command(command_name: str, command: ClientCommand) -> Optional[str]: return filepath -def _populate_commands_endpoint(app): +def _prepare_commands(app) -> List: if not is_overridden("configure_commands", app.root): - return + return [] - # 1: Populate commands metadata + # 1: Upload the command to s3. commands = app.root.configure_commands() - commands_metadata = [] - command_names = set() for command_mapping in commands: for command_name, command in command_mapping.items(): - is_client_command = isinstance(command, ClientCommand) - extras = {} - if is_client_command: + if isinstance(command, ClientCommand): _upload_command(command_name, command) - command, extras = _command_to_method_and_metadata(command) - if command_name in command_names: - raise Exception(f"The component name {command_name} has already been used. They need to be unique.") - command_names.add(command_name) - params = inspect.signature(command).parameters - commands_metadata.append( - { - "command": command_name, - "affiliation": command.__self__.name, - "params": list(params.keys()), - "is_client_command": is_client_command, - **extras, - } - ) - # 1.2: Pass the collected commands through the queue to the Rest API. 
- app.commands_metadata_queue.put(commands_metadata) + # 2: Cache the commands on the app. app.commands = commands + return commands -def _process_command_requests(app): - if not is_overridden("configure_commands", app.root): - return - - # 1: Populate commands metadata - commands = app.commands - - # 2: Collect requests metadata - command_query = app.get_state_changed_from_queue(app.commands_requests_queue) - if command_query: - for command in commands: - for command_name, method in command.items(): - if command_query["command_name"] == command_name: - # 2.1: Evaluate the method associated to a specific command. - # Validation is done on the CLI side. - response = method(**command_query["command_arguments"]) - app.commands_responses_queue.put({"response": response, "id": command_query["id"]}) - app._has_updated = True +def _process_api_request(app, request: APIRequest) -> None: + flow = app.get_component_by_name(request.name) + method = getattr(flow, request.method_name) + response = method(*request.args, **request.kwargs) + app.api_response_queue.put({"response": response, "id": request.id}) + + +def _process_command_requests(app, request: CommandRequest) -> None: + for command in app.commands: + for command_name, method in command.items(): + if request.method_name == command_name: + # 2.1: Evaluate the method associated to a specific command. + # Validation is done on the CLI side. 
+ response = method(*request.args, **request.kwargs) + app.api_response_queue.put({"response": response, "id": request.id}) + + +def _process_requests(app, request: Union[APIRequest, CommandRequest]) -> None: + """Convert user commands to API endpoint.""" + if isinstance(request, APIRequest): + _process_api_request(app, request) + else: + _process_command_requests(app, request) + + +def _collect_open_api_extras(command) -> Dict: + if not isinstance(command, ClientCommand): + return {} + return { + "cls_path": inspect.getfile(command.__class__), + "cls_name": command.__class__.__name__, + } + + +def _commands_to_api(commands: List[Dict[str, Union[Callable, ClientCommand]]]) -> List: + """Convert user commands to API endpoint.""" + api = [] + for command in commands: + for k, v in command.items(): + api.append( + Post( + f"/command/{k}", + v.method if isinstance(v, ClientCommand) else v, + method_name=k, + tags=["app_client_command"] if isinstance(v, ClientCommand) else ["app_command"], + openapi_extra=_collect_open_api_extras(v), + ) + ) + return api diff --git a/src/lightning_app/utilities/enum.py b/src/lightning_app/utilities/enum.py index dbf20413aa9d9..2b88d93169930 100644 --- a/src/lightning_app/utilities/enum.py +++ b/src/lightning_app/utilities/enum.py @@ -72,3 +72,9 @@ def make_status(stage: str, message: Optional[str] = None, reason: Optional[str] class CacheCallsKeys: LATEST_CALL_HASH = "latest_call_hash" + + +class OpenAPITags: + APP_CLIENT_COMMAND = "app_client_command" + APP_COMMAND = "app_command" + APP_API = "app_api" diff --git a/src/lightning_app/utilities/network.py b/src/lightning_app/utilities/network.py index 7fd03750a515d..050734723acc1 100644 --- a/src/lightning_app/utilities/network.py +++ b/src/lightning_app/utilities/network.py @@ -48,11 +48,12 @@ def _configure_session() -> Session: return http -def _check_service_url_is_ready(url: str, timeout: float = 100) -> bool: +def _check_service_url_is_ready(url: str, timeout: float = 5) -> bool: 
try: response = requests.get(url, timeout=timeout) return response.status_code in (200, 404) except (ConnectionError, ConnectTimeout, ReadTimeout): + logger.debug(f"The url {url} is not ready.") return False diff --git a/tests/tests_app/core/test_lightning_api.py b/tests/tests_app/core/test_lightning_api.py index edd2896d1951d..1b2bf2fb52fd9 100644 --- a/tests/tests_app/core/test_lightning_api.py +++ b/tests/tests_app/core/test_lightning_api.py @@ -2,15 +2,27 @@ import multiprocessing as mp import os from copy import deepcopy +from multiprocessing import Process +from time import sleep from unittest import mock import pytest +import requests from deepdiff import DeepDiff, Delta from httpx import AsyncClient +from pydantic import BaseModel from lightning_app import LightningApp, LightningFlow, LightningWork +from lightning_app.api.http_methods import Post from lightning_app.core import api -from lightning_app.core.api import fastapi_service, global_app_state_store, start_server, UIRefresher +from lightning_app.core.api import ( + fastapi_service, + global_app_state_store, + register_global_routes, + start_server, + UIRefresher, +) +from lightning_app.core.constants import APP_SERVER_PORT from lightning_app.runners import MultiProcessRuntime, SingleProcessRuntime from lightning_app.storage.drive import Drive from lightning_app.testing.helpers import MockQueue @@ -20,6 +32,8 @@ from lightning_app.utilities.redis import check_if_redis_running from lightning_app.utilities.state import AppState, headers_for +register_global_routes() + class WorkA(LightningWork): def __init__(self): @@ -161,12 +175,11 @@ def test_update_publish_state_and_maybe_refresh_ui(): app = AppStageTestingApp(FlowA(), debug=True) publish_state_queue = MockQueue("publish_state_queue") - commands_metadata_queue = MockQueue("commands_metadata_queue") - commands_responses_queue = MockQueue("commands_metadata_queue") + api_response_queue = MockQueue("api_response_queue") 
publish_state_queue.put(app.state_with_changes) - thread = UIRefresher(publish_state_queue, commands_metadata_queue, commands_responses_queue) + thread = UIRefresher(publish_state_queue, api_response_queue) thread.run_once() assert global_app_state_store.get_app_state("1234") == app.state_with_changes @@ -192,18 +205,14 @@ def get(self, timeout: int = 0): publish_state_queue = InfiniteQueue("publish_state_queue") change_state_queue = MockQueue("change_state_queue") has_started_queue = MockQueue("has_started_queue") - commands_requests_queue = MockQueue("commands_requests_queue") - commands_responses_queue = MockQueue("commands_responses_queue") - commands_metadata_queue = MockQueue("commands_metadata_queue") + api_response_queue = MockQueue("api_response_queue") state = app.state_with_changes publish_state_queue.put(state) spec = extract_metadata_from_app(app) ui_refresher = start_server( publish_state_queue, change_state_queue, - commands_requests_queue, - commands_responses_queue, - commands_metadata_queue, + api_response_queue, has_started_queue=has_started_queue, uvicorn_run=False, spec=spec, @@ -343,16 +352,12 @@ def test_start_server_started(): api_publish_state_queue = mp.Queue() api_delta_queue = mp.Queue() has_started_queue = mp.Queue() - commands_requests_queue = mp.Queue() - commands_responses_queue = mp.Queue() - commands_metadata_queue = mp.Queue() + api_response_queue = mp.Queue() kwargs = dict( api_publish_state_queue=api_publish_state_queue, api_delta_queue=api_delta_queue, has_started_queue=has_started_queue, - commands_requests_queue=commands_requests_queue, - commands_responses_queue=commands_responses_queue, - commands_metadata_queue=commands_metadata_queue, + api_response_queue=api_response_queue, port=1111, ) @@ -372,18 +377,14 @@ def test_start_server_info_message(ui_refresher, uvicorn_run, caplog, monkeypatc api_publish_state_queue = MockQueue() api_delta_queue = MockQueue() has_started_queue = MockQueue() - commands_requests_queue = 
MockQueue() - commands_responses_queue = MockQueue() - commands_metadata_queue = MockQueue() + api_response_queue = MockQueue() kwargs = dict( host=host, port=1111, api_publish_state_queue=api_publish_state_queue, api_delta_queue=api_delta_queue, has_started_queue=has_started_queue, - commands_requests_queue=commands_requests_queue, - commands_responses_queue=commands_responses_queue, - commands_metadata_queue=commands_metadata_queue, + api_response_queue=api_response_queue, ) monkeypatch.setattr(api, "logger", logging.getLogger()) @@ -395,3 +396,65 @@ def test_start_server_info_message(ui_refresher, uvicorn_run, caplog, monkeypatc ui_refresher.assert_called_once() uvicorn_run.assert_called_once_with(host="0.0.0.1", port=1111, log_level="error", app=mock.ANY) + + +class InputRequestModel(BaseModel): + name: str + + +class OutputRequestModel(BaseModel): + name: str + counter: int + + +class FlowAPI(LightningFlow): + def __init__(self): + super().__init__() + self.counter = 0 + + def run(self): + if self.counter == 2: + sleep(0.5) + self._exit() + + def request(self, config: InputRequestModel) -> OutputRequestModel: + self.counter += 1 + return OutputRequestModel(name=config.name, counter=self.counter) + + def configure_api(self): + return [Post("/api/v1/request", self.request)] + + +def target(): + app = LightningApp(FlowAPI()) + MultiProcessRuntime(app).dispatch() + + +def test_configure_api(): + + process = Process(target=target) + process.start() + time_left = 15 + while time_left > 0: + try: + requests.get(f"http://localhost:{APP_SERVER_PORT}/healthz") + break + except requests.exceptions.ConnectionError: + sleep(0.1) + time_left -= 0.1 + + response = requests.post( + f"http://localhost:{APP_SERVER_PORT}/api/v1/request", data=InputRequestModel(name="hello").json() + ) + assert response.json() == {"name": "hello", "counter": 1} + response = requests.post( + f"http://localhost:{APP_SERVER_PORT}/api/v1/request", data=InputRequestModel(name="hello").json() + ) + 
assert response.json() == {"name": "hello", "counter": 2} + time_left = 15 + while time_left > 0: + if process.exitcode == 0: + break + sleep(0.1) + time_left -= 0.1 + assert process.exitcode == 0 diff --git a/tests/tests_app/utilities/test_app_logs.py b/tests/tests_app/utilities/test_app_logs.py index e7384dd72d6e2..7a0fe087e7c29 100644 --- a/tests/tests_app/utilities/test_app_logs.py +++ b/tests/tests_app/utilities/test_app_logs.py @@ -1,4 +1,5 @@ from datetime import datetime +from time import sleep from unittest.mock import MagicMock from lightning_app.utilities.app_logs import _LogEvent @@ -6,6 +7,7 @@ def test_log_event(): event_1 = _LogEvent("", datetime.now(), MagicMock(), MagicMock()) + sleep(0.1) event_2 = _LogEvent("", datetime.now(), MagicMock(), MagicMock()) assert event_1 < event_2 assert event_1 <= event_2 diff --git a/tests/tests_app/utilities/test_commands.py b/tests/tests_app/utilities/test_commands.py index ed7f386395282..1be35a3a2e290 100644 --- a/tests/tests_app/utilities/test_commands.py +++ b/tests/tests_app/utilities/test_commands.py @@ -14,7 +14,7 @@ from lightning_app.core.constants import APP_SERVER_PORT from lightning_app.runners import MultiProcessRuntime from lightning_app.testing.helpers import RunIf -from lightning_app.utilities.commands.base import _command_to_method_and_metadata, _download_command, ClientCommand +from lightning_app.utilities.commands.base import _download_command, _validate_client_command, ClientCommand from lightning_app.utilities.state import AppState @@ -25,7 +25,6 @@ class SweepConfig(BaseModel): class SweepCommand(ClientCommand): def run(self) -> None: - print(sys.argv) parser = argparse.ArgumentParser() parser.add_argument("--sweep_name", type=str) parser.add_argument("--num_trials", type=int) @@ -91,15 +90,15 @@ def run_failure_2(name: CustomModel): @RunIf(skip_windows=True) -def test_command_to_method_and_metadata(): +def test_validate_client_command(): with pytest.raises(Exception, match="The provided 
annotation for the argument name"): - _command_to_method_and_metadata(ClientCommand(run_failure_0)) + _validate_client_command(ClientCommand(run_failure_0)) with pytest.raises(Exception, match="annotate your method"): - _command_to_method_and_metadata(ClientCommand(run_failure_1)) + _validate_client_command(ClientCommand(run_failure_1)) with pytest.raises(Exception, match="lightning_app/utilities/commands/base.py"): - _command_to_method_and_metadata(ClientCommand(run_failure_2)) + _validate_client_command(ClientCommand(run_failure_2)) def test_client_commands(monkeypatch): @@ -115,17 +114,13 @@ def test_client_commands(monkeypatch): url = "http//" kwargs = {"something": "1", "something_else": "1"} command = DummyCommand(run) - _, command_metadata = _command_to_method_and_metadata(command) - command_metadata.update( - { - "command": "dummy", - "affiliation": "root", - "is_client_command": True, - "owner": "root", - } + _validate_client_command(command) + client_command = _download_command( + command_name="something", + cls_path=__file__, + cls_name="DummyCommand", ) - client_command, models = _download_command(command_metadata, None) - client_command._setup(metadata=command_metadata, models=models, app_url=url) + client_command._setup("something", app_url=url) client_command.run(**kwargs) @@ -153,10 +148,12 @@ def test_configure_commands(monkeypatch): state = AppState() state._request_state() assert state.names == ["something"] - monkeypatch.setattr(sys, "argv", ["lightning", "sweep", "--sweep_name", "my_name", "--num_trials", "1"]) + monkeypatch.setattr(sys, "argv", ["lightning", "sweep", "--sweep_name=my_name", "--num_trials=1"]) app_command() time_left = 15 - while time_left > 0 and process.exitcode != 0: + while time_left > 0: + if process.exitcode == 0: + break sleep(0.1) time_left -= 0.1 assert process.exitcode == 0 diff --git a/tests/tests_app_examples/test_commands.py b/tests/tests_app_examples/test_commands.py deleted file mode 100644 index 
236e587e23101..0000000000000 --- a/tests/tests_app_examples/test_commands.py +++ /dev/null @@ -1,32 +0,0 @@ -import os -from subprocess import Popen -from time import sleep -from unittest import mock - -import pytest -from tests_app import _PROJECT_ROOT - -from lightning_app.testing.testing import run_app_in_cloud - - -@mock.patch.dict(os.environ, {"SKIP_LIGHTING_UTILITY_WHEELS_BUILD": "0"}) -@pytest.mark.cloud -def test_commands_example_cloud() -> None: - with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "examples/app_commands")) as ( - admin_page, - _, - fetch_logs, - _, - ): - app_id = admin_page.url.split("/")[-1] - cmd = f"lightning trigger_with_client_command --name=something --app_id {app_id}" - Popen(cmd, shell=True).wait() - cmd = f"lightning trigger_without_client_command --name=else --app_id {app_id}" - Popen(cmd, shell=True).wait() - - has_logs = False - while not has_logs: - for log in fetch_logs(["flow"]): - if "['something', 'else']" in log: - has_logs = True - sleep(1) diff --git a/tests/tests_app_examples/test_commands_and_api.py b/tests/tests_app_examples/test_commands_and_api.py new file mode 100644 index 0000000000000..8d84cf4847ebd --- /dev/null +++ b/tests/tests_app_examples/test_commands_and_api.py @@ -0,0 +1,42 @@ +import os +from subprocess import Popen +from time import sleep + +import pytest +import requests +from tests_app import _PROJECT_ROOT + +from lightning_app.testing.testing import run_app_in_cloud + + +@pytest.mark.cloud +def test_commands_and_api_example_cloud() -> None: + with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "examples/app_commands_and_api")) as ( + admin_page, + view_page, + fetch_logs, + _, + ): + # 1: Collect the app_id + app_id = admin_page.url.split("/")[-1] + + # 2: Send the first command with the client + cmd = f"lightning command_with_client --name=this --app_id {app_id}" + Popen(cmd, shell=True).wait() + + # 3: Send the second command without a client + cmd = f"lightning command_without_client --name=is 
--app_id {app_id}" + Popen(cmd, shell=True).wait() + + # 4: Send a request to the Rest API directly. + base_url = view_page.url.replace("/view", "").replace("/child_flow", "") + resp = requests.post(base_url + "/user/command_without_client?name=awesome") + assert resp.status_code == 200, resp.json() + + # 5: Validate the logs. + has_logs = False + while not has_logs: + for log in fetch_logs(): + if "['this', 'is', 'awesome']" in log: + has_logs = True + sleep(1) From 7c8c996f6acbcd3f497b6274bc96e7767e5695b1 Mon Sep 17 00:00:00 2001 From: Rick Izzo Date: Thu, 11 Aug 2022 09:22:59 -0400 Subject: [PATCH 53/59] Feature GRID-9731: Update Lightning Cloud.py Backend to Accept Drive Specs (2/2) (#14106) initial work adding drives to create work API from framework cloud dispatcher --- requirements/app/base.txt | 2 +- src/lightning_app/runners/cloud.py | 43 ++++ src/lightning_app/storage/drive.py | 2 +- tests/tests_app/runners/test_cloud.py | 356 ++++++++++++++++++++++++++ 4 files changed, 401 insertions(+), 2 deletions(-) diff --git a/requirements/app/base.txt b/requirements/app/base.txt index 02eeb04bfa218..fcde2f18a300a 100644 --- a/requirements/app/base.txt +++ b/requirements/app/base.txt @@ -1,4 +1,4 @@ -lightning-cloud==0.5.0 +lightning-cloud==0.5.3 packaging deepdiff>=5.7.0, <=5.8.1 starsessions diff --git a/src/lightning_app/runners/cloud.py b/src/lightning_app/runners/cloud.py index 957b60b5d2ab5..2cd98ebe4cf68 100644 --- a/src/lightning_app/runners/cloud.py +++ b/src/lightning_app/runners/cloud.py @@ -18,15 +18,22 @@ Gridv1ImageSpec, V1BuildSpec, V1DependencyFileInfo, + V1Drive, + V1DriveSpec, + V1DriveStatus, + V1DriveType, V1EnvVar, V1Flowserver, V1LightningappInstanceSpec, V1LightningappInstanceState, + V1LightningworkDrives, V1LightningworkSpec, + V1Metadata, V1NetworkConfig, V1PackageManager, V1ProjectClusterBinding, V1PythonDependencyInfo, + V1SourceType, V1UserRequestedComputeConfig, V1Work, ) @@ -36,6 +43,7 @@ from lightning_app.runners.backends.cloud 
import CloudBackend from lightning_app.runners.runtime import Runtime from lightning_app.source_code import LocalSourceCodeDir +from lightning_app.storage import Drive from lightning_app.utilities.cloud import _get_project from lightning_app.utilities.dependency_caching import get_hash from lightning_app.utilities.packaging.app_config import AppConfig, find_config_file @@ -107,10 +115,45 @@ def dispatch( preemptible=work.cloud_compute.preemptible, shm_size=work.cloud_compute.shm_size, ) + + drive_specs: List[V1LightningworkDrives] = [] + for drive_attr_name, drive in [ + (k, getattr(work, k)) for k in work._state if isinstance(getattr(work, k), Drive) + ]: + if drive.protocol == "lit://": + drive_type = V1DriveType.NO_MOUNT_S3 + source_type = V1SourceType.S3 + elif drive.protocol == "s3://": + drive_type = V1DriveType.INDEXED_S3 + source_type = V1SourceType.S3 + else: + raise RuntimeError( + f"unknown drive protocol `{drive.protocol}`. Please verify this " + f"drive type has been configured for use in the cloud dispatcher." 
+ ) + + drive_specs.append( + V1LightningworkDrives( + drive=V1Drive( + metadata=V1Metadata( + name=f"{work.name}.{drive_attr_name}", + ), + spec=V1DriveSpec( + drive_type=drive_type, + source_type=source_type, + source=f"{drive.protocol}{drive.id}", + ), + status=V1DriveStatus(), + ), + mount_location=str(drive.root_folder), + ), + ) + random_name = "".join(random.choice(string.ascii_lowercase) for _ in range(5)) spec = V1LightningworkSpec( build_spec=build_spec, cluster_id=cluster_id, + drives=drive_specs, user_requested_compute_config=user_compute_config, network_config=[V1NetworkConfig(name=random_name, port=work.port)], ) diff --git a/src/lightning_app/storage/drive.py b/src/lightning_app/storage/drive.py index b69d2581851b8..f72ad38b6e130 100644 --- a/src/lightning_app/storage/drive.py +++ b/src/lightning_app/storage/drive.py @@ -59,7 +59,7 @@ def __init__( if self.protocol != "s3://" and "/" in self.id: raise Exception(f"The id should be unique to identify your drive. Found `{self.id}`.") - self.root_folder = pathlib.Path(root_folder).resolve() if root_folder else os.getcwd() + self.root_folder = pathlib.Path(root_folder).resolve() if root_folder else pathlib.Path(os.getcwd()) if not os.path.isdir(self.root_folder): raise Exception(f"The provided root_folder isn't a directory: {root_folder}") self.component_name = component_name diff --git a/tests/tests_app/runners/test_cloud.py b/tests/tests_app/runners/test_cloud.py index 4b1cf08e8554d..640eb9c114c2d 100644 --- a/tests/tests_app/runners/test_cloud.py +++ b/tests/tests_app/runners/test_cloud.py @@ -1,4 +1,5 @@ import logging +from copy import copy from pathlib import Path from unittest import mock from unittest.mock import MagicMock @@ -9,21 +10,29 @@ Gridv1ImageSpec, V1BuildSpec, V1DependencyFileInfo, + V1Drive, + V1DriveSpec, + V1DriveStatus, + V1DriveType, V1LightningappInstanceState, + V1LightningworkDrives, V1LightningworkSpec, V1ListLightningappInstancesResponse, V1ListMembershipsResponse, 
V1Membership, + V1Metadata, V1NetworkConfig, V1PackageManager, V1ProjectClusterBinding, V1PythonDependencyInfo, + V1SourceType, V1UserRequestedComputeConfig, V1Work, ) from lightning_app import LightningApp, LightningWork from lightning_app.runners import backends, cloud +from lightning_app.storage import Drive from lightning_app.utilities.cloud import _get_project from lightning_app.utilities.dependency_caching import get_hash @@ -33,6 +42,25 @@ def run(self): print("my run") +class WorkWithSingleDrive(LightningWork): + def __init__(self): + super().__init__() + self.drive = None + + def run(self): + pass + + +class WorkWithTwoDrives(LightningWork): + def __init__(self): + super().__init__() + self.lit_drive = None + self.s3_drive = None + + def run(self): + pass + + class TestAppCreationClient: """Testing the calls made using GridRestClient to create the app.""" @@ -250,6 +278,134 @@ def test_call_with_work_app(self, lightningapps, monkeypatch, tmpdir): ), image="random_base_public_image", ), + drives=[], + user_requested_compute_config=V1UserRequestedComputeConfig( + name="default", count=1, disk_size=0, preemptible=False, shm_size=0 + ), + network_config=[V1NetworkConfig(name=mock.ANY, host=None, port=8080)], + ), + ) + ], + ) + mock_client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with( + "test-project-id", mock.ANY, expected_body + ) + + # running dispatch with disabled dependency cache + mock_client.reset_mock() + monkeypatch.setattr(cloud, "DISABLE_DEPENDENCY_CACHE", True) + expected_body.dependency_cache_key = None + cloud_runtime.dispatch() + mock_client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with( + "test-project-id", mock.ANY, expected_body + ) + else: + mock_client.lightningapp_v2_service_create_lightningapp_release_instance.assert_called_once_with( + "test-project-id", mock.ANY, mock.ANY, mock.ANY + ) + + @mock.patch("lightning_app.runners.backends.cloud.LightningClient", mock.MagicMock()) + 
@pytest.mark.parametrize("lightningapps", [[], [MagicMock()]]) + def test_call_with_work_app_and_attached_drives(self, lightningapps, monkeypatch, tmpdir): + source_code_root_dir = Path(tmpdir / "src").absolute() + source_code_root_dir.mkdir() + Path(source_code_root_dir / ".lightning").write_text("name: myapp") + requirements_file = Path(source_code_root_dir / "requirements.txt") + Path(requirements_file).touch() + + mock_client = mock.MagicMock() + if lightningapps: + lightningapps[0].status.phase = V1LightningappInstanceState.STOPPED + mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = ( + V1ListLightningappInstancesResponse(lightningapps=lightningapps) + ) + lightning_app_instance = MagicMock() + mock_client.lightningapp_v2_service_create_lightningapp_release = MagicMock(return_value=lightning_app_instance) + mock_client.lightningapp_v2_service_create_lightningapp_release_instance = MagicMock( + return_value=lightning_app_instance + ) + existing_instance = MagicMock() + existing_instance.status.phase = V1LightningappInstanceState.STOPPED + mock_client.lightningapp_service_get_lightningapp = MagicMock(return_value=existing_instance) + cloud_backend = mock.MagicMock() + cloud_backend.client = mock_client + monkeypatch.setattr(backends, "CloudBackend", mock.MagicMock(return_value=cloud_backend)) + monkeypatch.setattr(cloud, "LocalSourceCodeDir", mock.MagicMock()) + monkeypatch.setattr(cloud, "_prepare_lightning_wheels_and_requirements", mock.MagicMock()) + app = mock.MagicMock() + flow = mock.MagicMock() + + mocked_drive = MagicMock(spec=Drive) + setattr(mocked_drive, "id", "foobar") + setattr(mocked_drive, "protocol", "lit://") + setattr(mocked_drive, "component_name", "test-work") + setattr(mocked_drive, "allow_duplicates", False) + setattr(mocked_drive, "root_folder", tmpdir) + # deepcopy on a MagicMock instance will return an empty magicmock instance. 
To + # overcome this we set the __deepcopy__ method `return_value` to equal what + # should be the results of the deepcopy operation (an instance of the original class) + mocked_drive.__deepcopy__.return_value = copy(mocked_drive) + + work = WorkWithSingleDrive() + monkeypatch.setattr(work, "drive", mocked_drive) + monkeypatch.setattr(work, "_state", {"_port", "drive"}) + monkeypatch.setattr(work, "_name", "test-work") + monkeypatch.setattr(work._cloud_build_config, "build_commands", lambda: ["echo 'start'"]) + monkeypatch.setattr(work._cloud_build_config, "requirements", ["torch==1.0.0", "numpy==1.0.0"]) + monkeypatch.setattr(work._cloud_build_config, "image", "random_base_public_image") + monkeypatch.setattr(work._cloud_compute, "disk_size", 0) + monkeypatch.setattr(work._cloud_compute, "preemptible", False) + monkeypatch.setattr(work, "_port", 8080) + + flow.works = lambda recurse: [work] + app.flows = [flow] + cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file=(source_code_root_dir / "entrypoint.py")) + monkeypatch.setattr( + "lightning_app.runners.cloud._get_project", + lambda x: V1Membership(name="test-project", project_id="test-project-id"), + ) + cloud_runtime.dispatch() + + if lightningapps: + expected_body = Body8( + description=None, + local_source=True, + app_entrypoint_file="entrypoint.py", + enable_app_server=True, + flow_servers=[], + dependency_cache_key=get_hash(requirements_file), + image_spec=Gridv1ImageSpec( + dependency_file_info=V1DependencyFileInfo( + package_manager=V1PackageManager.PIP, path="requirements.txt" + ) + ), + works=[ + V1Work( + name="test-work", + spec=V1LightningworkSpec( + build_spec=V1BuildSpec( + commands=["echo 'start'"], + python_dependencies=V1PythonDependencyInfo( + package_manager=V1PackageManager.PIP, packages="torch==1.0.0\nnumpy==1.0.0" + ), + image="random_base_public_image", + ), + drives=[ + V1LightningworkDrives( + drive=V1Drive( + metadata=V1Metadata( + name="test-work.drive", + ), + spec=V1DriveSpec( 
+ drive_type=V1DriveType.NO_MOUNT_S3, + source_type=V1SourceType.S3, + source="lit://foobar", + ), + status=V1DriveStatus(), + ), + mount_location=str(tmpdir), + ), + ], user_requested_compute_config=V1UserRequestedComputeConfig( name="default", count=1, disk_size=0, preemptible=False, shm_size=0 ), @@ -275,6 +431,206 @@ def test_call_with_work_app(self, lightningapps, monkeypatch, tmpdir): "test-project-id", mock.ANY, mock.ANY, mock.ANY ) + @mock.patch("lightning_app.runners.backends.cloud.LightningClient", mock.MagicMock()) + @pytest.mark.parametrize("lightningapps", [[], [MagicMock()]]) + def test_call_with_work_app_and_multiple_attached_drives(self, lightningapps, monkeypatch, tmpdir): + source_code_root_dir = Path(tmpdir / "src").absolute() + source_code_root_dir.mkdir() + Path(source_code_root_dir / ".lightning").write_text("name: myapp") + requirements_file = Path(source_code_root_dir / "requirements.txt") + Path(requirements_file).touch() + + mock_client = mock.MagicMock() + if lightningapps: + lightningapps[0].status.phase = V1LightningappInstanceState.STOPPED + mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = ( + V1ListLightningappInstancesResponse(lightningapps=lightningapps) + ) + lightning_app_instance = MagicMock() + mock_client.lightningapp_v2_service_create_lightningapp_release = MagicMock(return_value=lightning_app_instance) + mock_client.lightningapp_v2_service_create_lightningapp_release_instance = MagicMock( + return_value=lightning_app_instance + ) + existing_instance = MagicMock() + existing_instance.status.phase = V1LightningappInstanceState.STOPPED + mock_client.lightningapp_service_get_lightningapp = MagicMock(return_value=existing_instance) + cloud_backend = mock.MagicMock() + cloud_backend.client = mock_client + monkeypatch.setattr(backends, "CloudBackend", mock.MagicMock(return_value=cloud_backend)) + monkeypatch.setattr(cloud, "LocalSourceCodeDir", mock.MagicMock()) + monkeypatch.setattr(cloud, 
"_prepare_lightning_wheels_and_requirements", mock.MagicMock()) + app = mock.MagicMock() + flow = mock.MagicMock() + + mocked_lit_drive = MagicMock(spec=Drive) + setattr(mocked_lit_drive, "id", "foobar") + setattr(mocked_lit_drive, "protocol", "lit://") + setattr(mocked_lit_drive, "component_name", "test-work") + setattr(mocked_lit_drive, "allow_duplicates", False) + setattr(mocked_lit_drive, "root_folder", tmpdir) + # deepcopy on a MagicMock instance will return an empty magicmock instance. To + # overcome this we set the __deepcopy__ method `return_value` to equal what + # should be the results of the deepcopy operation (an instance of the original class) + mocked_lit_drive.__deepcopy__.return_value = copy(mocked_lit_drive) + + mocked_s3_drive = MagicMock(spec=Drive) + setattr(mocked_s3_drive, "id", "some-bucket/path/") + setattr(mocked_s3_drive, "protocol", "s3://") + setattr(mocked_s3_drive, "component_name", "test-work") + setattr(mocked_s3_drive, "allow_duplicates", False) + setattr(mocked_s3_drive, "root_folder", "/hello/") + # deepcopy on a MagicMock instance will return an empty magicmock instance. 
To + # overcome this we set the __deepcopy__ method `return_value` to equal what + # should be the results of the deepcopy operation (an instance of the original class) + mocked_s3_drive.__deepcopy__.return_value = copy(mocked_s3_drive) + + work = WorkWithTwoDrives() + monkeypatch.setattr(work, "lit_drive", mocked_lit_drive) + monkeypatch.setattr(work, "s3_drive", mocked_s3_drive) + monkeypatch.setattr(work, "_state", {"_port", "_name", "lit_drive", "s3_drive"}) + monkeypatch.setattr(work, "_name", "test-work") + monkeypatch.setattr(work._cloud_build_config, "build_commands", lambda: ["echo 'start'"]) + monkeypatch.setattr(work._cloud_build_config, "requirements", ["torch==1.0.0", "numpy==1.0.0"]) + monkeypatch.setattr(work._cloud_build_config, "image", "random_base_public_image") + monkeypatch.setattr(work._cloud_compute, "disk_size", 0) + monkeypatch.setattr(work._cloud_compute, "preemptible", False) + monkeypatch.setattr(work, "_port", 8080) + + flow.works = lambda recurse: [work] + app.flows = [flow] + cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file=(source_code_root_dir / "entrypoint.py")) + monkeypatch.setattr( + "lightning_app.runners.cloud._get_project", + lambda x: V1Membership(name="test-project", project_id="test-project-id"), + ) + cloud_runtime.dispatch() + + if lightningapps: + s3_drive_spec = V1LightningworkDrives( + drive=V1Drive( + metadata=V1Metadata( + name="test-work.s3_drive", + ), + spec=V1DriveSpec( + drive_type=V1DriveType.INDEXED_S3, + source_type=V1SourceType.S3, + source="s3://some-bucket/path/", + ), + status=V1DriveStatus(), + ), + mount_location="/hello/", + ) + lit_drive_spec = V1LightningworkDrives( + drive=V1Drive( + metadata=V1Metadata( + name="test-work.lit_drive", + ), + spec=V1DriveSpec( + drive_type=V1DriveType.NO_MOUNT_S3, + source_type=V1SourceType.S3, + source="lit://foobar", + ), + status=V1DriveStatus(), + ), + mount_location=str(tmpdir), + ) + + # order of drives in the spec is non-deterministic, so there are 
two options + # depending for the expected body value on which drive is ordered in the list first. + + expected_body_option_1 = Body8( + description=None, + local_source=True, + app_entrypoint_file="entrypoint.py", + enable_app_server=True, + flow_servers=[], + dependency_cache_key=get_hash(requirements_file), + image_spec=Gridv1ImageSpec( + dependency_file_info=V1DependencyFileInfo( + package_manager=V1PackageManager.PIP, path="requirements.txt" + ) + ), + works=[ + V1Work( + name="test-work", + spec=V1LightningworkSpec( + build_spec=V1BuildSpec( + commands=["echo 'start'"], + python_dependencies=V1PythonDependencyInfo( + package_manager=V1PackageManager.PIP, packages="torch==1.0.0\nnumpy==1.0.0" + ), + image="random_base_public_image", + ), + drives=[lit_drive_spec, s3_drive_spec], + user_requested_compute_config=V1UserRequestedComputeConfig( + name="default", count=1, disk_size=0, preemptible=False, shm_size=0 + ), + network_config=[V1NetworkConfig(name=mock.ANY, host=None, port=8080)], + ), + ) + ], + ) + + expected_body_option_2 = Body8( + description=None, + local_source=True, + app_entrypoint_file="entrypoint.py", + enable_app_server=True, + flow_servers=[], + dependency_cache_key=get_hash(requirements_file), + image_spec=Gridv1ImageSpec( + dependency_file_info=V1DependencyFileInfo( + package_manager=V1PackageManager.PIP, path="requirements.txt" + ) + ), + works=[ + V1Work( + name="test-work", + spec=V1LightningworkSpec( + build_spec=V1BuildSpec( + commands=["echo 'start'"], + python_dependencies=V1PythonDependencyInfo( + package_manager=V1PackageManager.PIP, packages="torch==1.0.0\nnumpy==1.0.0" + ), + image="random_base_public_image", + ), + drives=[s3_drive_spec, lit_drive_spec], + user_requested_compute_config=V1UserRequestedComputeConfig( + name="default", count=1, disk_size=0, preemptible=False, shm_size=0 + ), + network_config=[V1NetworkConfig(name=mock.ANY, host=None, port=8080)], + ), + ) + ], + ) + + # try both options for the expected body to 
avoid false + # positive test failures depending on system randomness + + expected_body = expected_body_option_1 + try: + mock_client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with( + "test-project-id", mock.ANY, expected_body + ) + except Exception: + expected_body = expected_body_option_2 + mock_client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with( + "test-project-id", mock.ANY, expected_body + ) + + # running dispatch with disabled dependency cache + mock_client.reset_mock() + monkeypatch.setattr(cloud, "DISABLE_DEPENDENCY_CACHE", True) + expected_body.dependency_cache_key = None + cloud_runtime.dispatch() + mock_client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with( + "test-project-id", mock.ANY, expected_body + ) + else: + mock_client.lightningapp_v2_service_create_lightningapp_release_instance.assert_called_once_with( + "test-project-id", mock.ANY, mock.ANY, mock.ANY + ) + @mock.patch("lightning_app.core.queues.QueuingSystem", MagicMock()) @mock.patch("lightning_app.runners.backends.cloud.LightningClient", MagicMock()) From 98ded4524f373d906aac475b6a7599b6f1661c39 Mon Sep 17 00:00:00 2001 From: Raphael Randschau Date: Thu, 11 Aug 2022 08:19:21 -0700 Subject: [PATCH 54/59] [CLI] change cluster creation cost savings mode default (#14132) * [CLI] change cluster creation cost savings mode default instead of having customers opt-into cost savings mode, we'll ask them to opt-out of cost savings mode. 
--- src/lightning_app/CHANGELOG.md | 2 +- src/lightning_app/cli/lightning_cli_create.py | 14 +++++++------- tests/tests_app/cli/test_cli.py | 17 ++++++++++------- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 7158d1ff7a2da..2aa5c7cdd837c 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -28,7 +28,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Add support for `Lightning API` through the `configure_api` hook on the Lightning Flow and the `Post`, `Get`, `Delete`, `Put` HttpMethods ([#13945](https://github.com/Lightning-AI/lightning/pull/13945)) ### Changed -- +- Default values and parameter names for Lightning AI BYOC cluster management ([#14132](https://github.com/Lightning-AI/lightning/pull/14132)) ### Changed diff --git a/src/lightning_app/cli/lightning_cli_create.py b/src/lightning_app/cli/lightning_cli_create.py index d400db4b6f337..c9cea2a5676f9 100644 --- a/src/lightning_app/cli/lightning_cli_create.py +++ b/src/lightning_app/cli/lightning_cli_create.py @@ -33,14 +33,14 @@ def create(): help="Instance types that you want to support, for computer jobs within the cluster.", ) @click.option( - "--cost-savings", - "cost_savings", + "--enable-performance", + "enable_performance", type=bool, required=False, default=False, is_flag=True, - help=""""Use this flag to ensure that the cluster is created with a profile that is optimized for cost savings. - This makes runs cheaper but start-up times may increase.""", + help=""""Use this flag to ensure that the cluster is created with a profile that is optimized for performance. 
+ This makes runs more expensive but start-up times decrease.""", ) @click.option( "--edit-before-creation", @@ -65,12 +65,12 @@ def create_cluster( provider: str, instance_types: str, edit_before_creation: bool, - cost_savings: bool, + enable_performance: bool, wait: bool, **kwargs, ): """Create a Lightning AI BYOC compute cluster with your cloud provider credentials.""" - if provider != "aws": + if provider.lower() != "aws": click.echo("Only AWS is supported for now. But support for more providers is coming soon.") return cluster_manager = AWSClusterManager() @@ -81,6 +81,6 @@ def create_cluster( external_id=external_id, instance_types=instance_types.split(",") if instance_types is not None else None, edit_before_creation=edit_before_creation, - cost_savings=cost_savings, + cost_savings=not enable_performance, wait=wait, ) diff --git a/tests/tests_app/cli/test_cli.py b/tests/tests_app/cli/test_cli.py index 8cc5dd50f836e..48e1a26bb6f2b 100644 --- a/tests/tests_app/cli/test_cli.py +++ b/tests/tests_app/cli/test_cli.py @@ -71,14 +71,17 @@ def test_main_lightning_cli_help(): @mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) @mock.patch("lightning_app.cli.cmd_clusters.AWSClusterManager.create") @pytest.mark.parametrize( - "instance_types,expected_instance_types", + "extra_arguments,expected_instance_types,expected_cost_savings_mode", [ - (["--instance-types", "t3.xlarge"], ["t3.xlarge"]), - (["--instance-types", "t3.xlarge,t3.2xlarge"], ["t3.xlarge", "t3.2xlarge"]), - ([], None), + (["--instance-types", "t3.xlarge"], ["t3.xlarge"], True), + (["--instance-types", "t3.xlarge,t3.2xlarge"], ["t3.xlarge", "t3.2xlarge"], True), + ([], None, True), + (["--enable-performance"], None, False), ], ) -def test_create_cluster(create_command: mock.MagicMock, instance_types, expected_instance_types): +def test_create_cluster( + create_command: mock.MagicMock, extra_arguments, expected_instance_types, expected_cost_savings_mode +): runner = CliRunner() 
runner.invoke( create_cluster, @@ -91,7 +94,7 @@ def test_create_cluster(create_command: mock.MagicMock, instance_types, expected "--role-arn", "arn:aws:iam::1234567890:role/lai-byoc", ] - + instance_types, + + extra_arguments, ) create_command.assert_called_once_with( @@ -101,7 +104,7 @@ def test_create_cluster(create_command: mock.MagicMock, instance_types, expected external_id="dummy", instance_types=expected_instance_types, edit_before_creation=False, - cost_savings=False, + cost_savings=expected_cost_savings_mode, wait=False, ) From 3b18da3eafa8ece27cde46ad978f765a1390d72d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 11 Aug 2022 17:49:46 +0200 Subject: [PATCH 55/59] Fix saving hyperparameters in a composition where parent is not a LM or LDM (#14151) Co-authored-by: Rohit Gupta --- src/pytorch_lightning/CHANGELOG.md | 4 ++++ src/pytorch_lightning/utilities/parsing.py | 17 ++++++++++++----- tests/tests_pytorch/models/test_hparams.py | 19 +++++++++++++++++++ 3 files changed, 35 insertions(+), 5 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 97bb317b02a14..04db3d1908bb2 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -70,6 +70,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061)) +- Fixed saving hyperparameters in a composition where the parent class is not a `LightningModule` or `LightningDataModule` ([#14151](https://github.com/Lightning-AI/lightning/pull/14151)) + + + ## [1.7.1] - 2022-08-09 ### Fixed diff --git a/src/pytorch_lightning/utilities/parsing.py b/src/pytorch_lightning/utilities/parsing.py index 81877f1dffba7..073423ab60773 100644 --- a/src/pytorch_lightning/utilities/parsing.py +++ b/src/pytorch_lightning/utilities/parsing.py @@ -162,7 +162,10 @@ def get_init_args(frame: types.FrameType) -> Dict[str, Any]: def collect_init_args( - frame: types.FrameType, path_args: List[Dict[str, Any]], inside: bool = False + frame: types.FrameType, + path_args: List[Dict[str, Any]], + inside: bool = False, + classes: Tuple[Type, ...] = (), ) -> List[Dict[str, Any]]: """Recursively collects the arguments passed to the child constructors in the inheritance tree. 
@@ -170,6 +173,7 @@ def collect_init_args( frame: the current stack frame path_args: a list of dictionaries containing the constructor args in all parent classes inside: track if we are inside inheritance path, avoid terminating too soon + classes: the classes in which to inspect the frames Return: A list of dictionaries where each dictionary contains the arguments passed to the @@ -181,13 +185,13 @@ def collect_init_args( if not isinstance(frame.f_back, types.FrameType): return path_args - if "__class__" in local_vars: + if "__class__" in local_vars and (not classes or issubclass(local_vars["__class__"], classes)): local_args = get_init_args(frame) # recursive update path_args.append(local_args) - return collect_init_args(frame.f_back, path_args, inside=True) + return collect_init_args(frame.f_back, path_args, inside=True, classes=classes) if not inside: - return collect_init_args(frame.f_back, path_args, inside) + return collect_init_args(frame.f_back, path_args, inside, classes=classes) return path_args @@ -225,7 +229,10 @@ def save_hyperparameters( init_args = {f.name: getattr(obj, f.name) for f in fields(obj)} else: init_args = {} - for local_args in collect_init_args(frame, []): + + from pytorch_lightning.core.mixins import HyperparametersMixin + + for local_args in collect_init_args(frame, [], classes=(HyperparametersMixin,)): init_args.update(local_args) if ignore is None: diff --git a/tests/tests_pytorch/models/test_hparams.py b/tests/tests_pytorch/models/test_hparams.py index c130381c7832d..84311d6f780fb 100644 --- a/tests/tests_pytorch/models/test_hparams.py +++ b/tests/tests_pytorch/models/test_hparams.py @@ -29,6 +29,7 @@ from pytorch_lightning import LightningModule, Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.core.datamodule import LightningDataModule +from pytorch_lightning.core.mixins import HyperparametersMixin from pytorch_lightning.core.saving import load_hparams_from_yaml, save_hparams_to_yaml from 
pytorch_lightning.demos.boring_classes import BoringDataModule, BoringModel, RandomDataset from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE, AttributeDict, is_picklable @@ -399,6 +400,24 @@ def _raw_checkpoint_path(trainer) -> str: return raw_checkpoint_path +@pytest.mark.parametrize("base_class", (HyperparametersMixin, LightningModule, LightningDataModule)) +def test_save_hyperparameters_under_composition(base_class): + """Test that in a composition where the parent is not a Lightning-like module, the parent's arguments don't get + collected.""" + + class ChildInComposition(base_class): + def __init__(self, same_arg): + super().__init__() + self.save_hyperparameters() + + class NotPLSubclass: # intentionally not subclassing LightningModule/LightningDataModule + def __init__(self, same_arg="parent_default", other_arg="other"): + self.child = ChildInComposition(same_arg="cocofruit") + + parent = NotPLSubclass() + assert parent.child.hparams == dict(same_arg="cocofruit") + + class LocalVariableModelSuperLast(BoringModel): """This model has the super().__init__() call at the end.""" From 56533368afe14407867dc999a65b799d0f4bd89b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 11 Aug 2022 18:17:56 +0200 Subject: [PATCH 56/59] Remove DeepSpeed version restriction from Lite (#13967) --- .azure/gpu-tests.yml | 2 +- requirements/pytorch/strategies.txt | 2 +- src/pytorch_lightning/CHANGELOG.md | 3 +++ src/pytorch_lightning/lite/lite.py | 15 --------------- tests/tests_pytorch/lite/test_lite.py | 13 +------------ 5 files changed, 6 insertions(+), 29 deletions(-) diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index 68ba6974a3527..8ae670d265ced 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -75,7 +75,7 @@ jobs: CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))") pip install "bagua-cuda$CUDA_VERSION_MM>=0.9.0" pip install -e .[strategies] - pip install 
deepspeed>0.6.4 # TODO: remove when docker images are upgraded + pip install -U deepspeed # TODO: remove when docker images are upgraded pip install --requirement requirements/pytorch/devel.txt pip list env: diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt index 4e916fbc6c61f..c5fc92a67a837 100644 --- a/requirements/pytorch/strategies.txt +++ b/requirements/pytorch/strategies.txt @@ -2,7 +2,7 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment fairscale>=0.4.5, <=0.4.6 -deepspeed>=0.6.0, <0.7.0 +deepspeed>=0.6.0, <=0.7.0 # no need to install with [pytorch] as pytorch is already installed horovod>=0.21.2, !=0.24.0, <0.25.1 hivemind>=1.0.1, <=1.0.1; sys_platform == 'linux' diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 04db3d1908bb2..6d67d2d58643a 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -22,6 +22,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Raised a `MisconfigurationException` if batch transfer hooks are overriden with `IPUAccelerator` ([13961](https://github.com/Lightning-AI/lightning/pull/13961)) +- Updated compatibility for LightningLite to run with the latest DeepSpeed 0.7.0 ([13967](https://github.com/Lightning-AI/lightning/pull/13967)) + + ### Deprecated - Deprecated `LightningDeepSpeedModule` ([#14000](https://github.com/Lightning-AI/lightning/pull/14000)) diff --git a/src/pytorch_lightning/lite/lite.py b/src/pytorch_lightning/lite/lite.py index 5125bf4486a9d..981eed30635f6 100644 --- a/src/pytorch_lightning/lite/lite.py +++ b/src/pytorch_lightning/lite/lite.py @@ -40,7 +40,6 @@ has_iterable_dataset, ) from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _RequirementAvailable from pytorch_lightning.utilities.seed import seed_everything @@ -106,8 +105,6 @@ def __init__( self._precision_plugin = self._strategy.precision_plugin self._models_setup: int = 0 - self._check_deepspeed_support() - # wrap the run method so we can inject setup logic or spawn processes for the user setattr(self, "run", partial(self._run_impl, self.run)) @@ -459,18 +456,6 @@ def _check_strategy_support(self, strategy: Optional[Union[str, Strategy]]) -> N f" Choose one of {supported} or pass in a `Strategy` instance." ) - def _check_deepspeed_support(self) -> None: - if ( - isinstance(self._strategy, DeepSpeedStrategy) - and self._strategy.zero_stage_3 - and _RequirementAvailable("deepspeed>=0.6.5") - ): - # https://github.com/microsoft/DeepSpeed/issues/2139 - raise RuntimeError( - "DeepSpeed ZeRO-3 is not supported with this version of Lightning Lite and `deepspeed>=0.6.5`." - " Please downgrade deepspeed to 0.6.4 or check if a newer version of Lightning is available." 
- ) - @staticmethod def _supported_device_types() -> Sequence[_AcceleratorType]: return ( diff --git a/tests/tests_pytorch/lite/test_lite.py b/tests/tests_pytorch/lite/test_lite.py index 2215ab3129780..86a0a5a82195a 100644 --- a/tests/tests_pytorch/lite/test_lite.py +++ b/tests/tests_pytorch/lite/test_lite.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import contextlib import os from copy import deepcopy from unittest import mock @@ -30,7 +29,6 @@ from pytorch_lightning.strategies import DeepSpeedStrategy, Strategy from pytorch_lightning.utilities import _StrategyType from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _RequirementAvailable from pytorch_lightning.utilities.seed import pl_worker_init_function from tests_pytorch.helpers.runif import RunIf @@ -480,13 +478,4 @@ def run(self): assert self.broadcast(True) assert self.is_global_zero == (self.local_rank == 0) - if _RequirementAvailable("deepspeed>=0.6.5"): - # https://github.com/microsoft/DeepSpeed/issues/2139 - raise_if_deepspeed_incompatible = pytest.raises( - RuntimeError, match="DeepSpeed ZeRO-3 is not supported with this version of Lightning Lite" - ) - else: - raise_if_deepspeed_incompatible = contextlib.suppress() - - with raise_if_deepspeed_incompatible: - Lite(strategy=DeepSpeedStrategy(stage=3, logging_batch_size_per_gpu=1), devices=2, accelerator="gpu").run() + Lite(strategy=DeepSpeedStrategy(stage=3, logging_batch_size_per_gpu=1), devices=2, accelerator="gpu").run() From d0f82abe35c271247d58da35442719e01a54604c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 11 Aug 2022 18:55:01 +0200 Subject: [PATCH 57/59] Configure the check-group app (#14165) Co-authored-by: Jirka --- .github/checkgroup.yml | 165 +++++++++++++++++++++++++++++++++++++++++ 1 
file changed, 165 insertions(+) create mode 100644 .github/checkgroup.yml diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml new file mode 100644 index 0000000000000..8f1d3c6fb5e86 --- /dev/null +++ b/.github/checkgroup.yml @@ -0,0 +1,165 @@ +custom_service_name: "Lightning CI required checker" +subprojects: + - id: "CI: CircleCI" + paths: + - ".circleci/**" + checks: + - "test-on-tpus" + + - id: "CI: Azure" + paths: + - ".azure/**" + checks: + - "pytorch-lightning (GPUs)" + - "pytorch-lightning (GPUs) (testing PyTorch - stable)" + - "pytorch-lightning (HPUs)" + - "pytorch-lightning (IPUs)" + + - id: "pytorch_lightning" + paths: + # all examples don't need to be added because they aren't used in CI, but these are + - "examples/run_ddp_examples.sh" + - "examples/convert_from_pt_to_pl/**" + - "examples/run_pl_examples.sh" + - "examples/pl_basics/backbone_image_classifier.py" + - "examples/pl_basics/autoencoder.py" + - "examples/pl_loops/mnist_lite.py" + - "examples/pl_fault_tolerant/automatic.py" + - "examples/test_pl_examples.py" + - "examples/pl_integrations/dali_image_classifier.py" + - "requirements/pytorch/**" + - "src/pytorch_lightning/**" + - "tests/tests_pytorch/**" + - "setup.cfg" # includes pytest config + - ".github/workflows/ci-pytorch*.yml" + - ".github/workflows/docs-*.yml" + checks: + - "conda (3.8, 1.10)" + - "conda (3.8, 1.9)" + - "conda (3.9, 1.11)" + - "conda (3.9, 1.12)" + - "cpu (macOS-11, 3.10, latest, stable)" + - "cpu (macOS-11, 3.7, latest, stable)" + - "cpu (macOS-11, 3.7, oldest, stable)" + - "cpu (ubuntu-20.04, 3.10, latest, stable)" + - "cpu (ubuntu-20.04, 3.7, latest, stable)" + - "cpu (ubuntu-20.04, 3.7, oldest, stable)" + - "cpu (windows-2022, 3.10, latest, stable)" + - "cpu (windows-2022, 3.7, latest, stable)" + - "cpu (windows-2022, 3.7, oldest, stable)" + - "doctest (pytorch)" + - "make-docs (pytorch)" + - "mypy" + - "PR Gatekeeper (pytorch)" + - "pytorch-lightning (GPUs)" + - "pytorch-lightning (GPUs) (testing PyTorch - 
stable)" + - "pytorch-lightning (HPUs)" + - "pytorch-lightning (IPUs)" + - "slow (macOS-11, 3.7, 1.11)" + - "slow (ubuntu-20.04, 3.7, 1.11)" + - "slow (windows-2022, 3.7, 1.11)" + - "test-on-tpus" + + - id: "pytorch_lightning: Docs" + paths: + - "docs/source-pytorch/**" + - ".github/workflows/docs-*.yml" + - "requirements/pytorch/**" + checks: + - "doctest (pytorch)" + - "make-docs (pytorch)" + + - id: "pytorch_lightning: Docker" + paths: + - "dockers/**" + checks: + - "build-conda (3.8, 1.10)" + - "build-conda (3.8, 1.9)" + - "build-conda (3.9, 1.11)" + - "build-conda (3.9, 1.12)" + - "build-cuda (3.8, 1.9, 11.1.1)" + - "build-cuda (3.9, 1.10, 11.3.1)" + - "build-cuda (3.9, 1.11, 11.3.1)" + - "build-cuda (3.9, 1.12, 11.3.1)" + - "build-cuda (3.9, 1.9, 11.1.1)" + - "build-hpu (1.5.0, 1.11.0)" + - "build-ipu (3.9, 1.9)" + - "build-NGC" + - "build-pl (3.9, 1.10, 11.3.1)" + - "build-pl (3.9, 1.11, 11.3.1)" + - "build-pl (3.9, 1.12, 11.3.1)" + - "build-pl (3.9, 1.9, 11.1.1)" + - "build-xla (3.7, 1.12)" + + - id: "pytorch_lightning: mypy" + paths: + - ".github/workflows/code-checks.yml" + - "pyproject.toml" # includes mypy config + checks: + - "mypy" + + - id: "lightning_app" + paths: + - ".github/workflows/ci-app*.yml" + - "examples/app_**" + - "requirements/app/**" + - "src/lightning_app/**" + - "tests/tests_app/**" + - "tests/tests_app_examples/**" + - "tests/tests_clusters/**" + # the examples are used in the app CI + - "examples/app_*" + checks: + - "Cloud Test (boring_app)" + - "Cloud Test (collect_failures)" + - "Cloud Test (commands_and_api)" + - "Cloud Test (custom_work_dependencies)" + - "Cloud Test (drive)" + - "Cloud Test (idle_timeout)" + - "Cloud Test (payload)" + - "Cloud Test (template_jupyterlab)" + - "Cloud Test (template_react_ui)" + - "Cloud Test (template_streamlit_ui)" + - "Cloud Test (v0_app)" + - "doctest (app)" + - "make-docs (app)" + - "pytest (macOS-11, 3.8, latest)" + - "pytest (macOS-11, 3.8, oldest)" + - "pytest (ubuntu-20.04, 3.8, latest)" 
+ - "pytest (ubuntu-20.04, 3.8, oldest)" + - "pytest (windows-2022, 3.8, latest)" + - "pytest (windows-2022, 3.8, oldest)" + + - id: "lightning_app: Docs" + paths: + - "docs/source-app/**" + - ".github/workflows/docs-*.yml" + - "requirements/app/**" + checks: + - "doctest (app)" + - "make-docs (app)" + + - id: "install" + paths: + - ".actions/setup_tools.py" + - ".github/workflows/ci-pkg-install.yml" + - "setup.py" + - "src/lightning/**" + # all __about__, __version__, __setup__ + - "src/*/__*.py" + checks: + - "install-meta-pypi (macOS-11, 3.8)" + - "install-meta-pypi (ubuntu-20.04, 3.8)" + - "install-meta-pypi (windows-2022, 3.8)" + - "install-meta-src (macOS-11, 3.8)" + - "install-meta-src (macOS-11, lightning, 3.8)" + - "install-meta-src (ubuntu-20.04, 3.8)" + - "install-meta-src (ubuntu-20.04, lightning, 3.8)" + - "install-meta-src (windows-2022, 3.8)" + - "install-meta-src (windows-2022, lightning, 3.8)" + - "install-standalone (macOS-11, app, 3.8)" + - "install-standalone (macOS-11, pytorch, 3.8)" + - "install-standalone (ubuntu-20.04, app, 3.8)" + - "install-standalone (ubuntu-20.04, pytorch, 3.8)" + - "install-standalone (windows-2022, app, 3.8)" + - "install-standalone (windows-2022, pytorch, 3.8)" From 31ecf9bfac32e226eb670e743c79dbceb4f88345 Mon Sep 17 00:00:00 2001 From: Raphael Randschau Date: Thu, 11 Aug 2022 11:34:24 -0700 Subject: [PATCH 58/59] [CLI] adjust command description (#14130) * adjust CLI copy Co-authored-by: RobertLaurella <99420295+RobertLaurella@users.noreply.github.com> --- src/lightning_app/cli/lightning_cli.py | 10 +++++----- src/lightning_app/cli/lightning_cli_create.py | 2 +- src/lightning_app/cli/lightning_cli_delete.py | 2 +- src/lightning_app/cli/lightning_cli_list.py | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/lightning_app/cli/lightning_cli.py b/src/lightning_app/cli/lightning_cli.py index 6a6e41df57026..81d2a773b4619 100644 --- a/src/lightning_app/cli/lightning_cli.py +++ 
b/src/lightning_app/cli/lightning_cli.py @@ -143,7 +143,7 @@ def logs(app_name: str, components: List[str], follow: bool) -> None: @_main.command() def login(): - """Log in to your Lightning.ai account.""" + """Log in to your lightning.ai account.""" auth = Auth() auth.clear() @@ -156,7 +156,7 @@ def login(): @_main.command() def logout(): - """Log out of your Lightning.ai account.""" + """Log out of your lightning.ai account.""" Auth().clear() @@ -215,7 +215,7 @@ def on_before_run(*args): @_main.group() def run(): - """Run your application.""" + """Run a Lightning application locally or on the cloud.""" @run.command("app") @@ -321,7 +321,7 @@ def stop(): @_main.group() def install(): - """Install Lightning apps and components.""" + """Install a Lightning App and/or component.""" @install.command("app") @@ -379,7 +379,7 @@ def install_component(name, yes, version): @_main.group() def init(): - """Init a Lightning app and component.""" + """Init a Lightning App and/or component.""" @init.command("app") diff --git a/src/lightning_app/cli/lightning_cli_create.py b/src/lightning_app/cli/lightning_cli_create.py index c9cea2a5676f9..7e9a6b9d2143b 100644 --- a/src/lightning_app/cli/lightning_cli_create.py +++ b/src/lightning_app/cli/lightning_cli_create.py @@ -5,7 +5,7 @@ @click.group("create") def create(): - """Create Lightning AI BYOC managed resources.""" + """Create Lightning AI self-managed resources (clusters, etc…)""" pass diff --git a/src/lightning_app/cli/lightning_cli_delete.py b/src/lightning_app/cli/lightning_cli_delete.py index c304b130bdf5d..366f4aa01e995 100644 --- a/src/lightning_app/cli/lightning_cli_delete.py +++ b/src/lightning_app/cli/lightning_cli_delete.py @@ -5,7 +5,7 @@ @click.group("delete") def delete(): - """Delete Lightning AI BYOC managed resources.""" + """Delete Lightning AI self-managed resources (clusters, etc…)""" pass diff --git a/src/lightning_app/cli/lightning_cli_list.py b/src/lightning_app/cli/lightning_cli_list.py index 
d0d1d34a6dd4d..7d38b5b57760f 100644 --- a/src/lightning_app/cli/lightning_cli_list.py +++ b/src/lightning_app/cli/lightning_cli_list.py @@ -6,7 +6,7 @@ @click.group(name="list") def get_list(): - """List your Lightning AI BYOC managed resources.""" + """List Lightning AI self-managed resources (clusters, etc…)""" pass From e53c4e8e6c14c92968df9bed8861e578bfe731aa Mon Sep 17 00:00:00 2001 From: Krishna Kalyan Date: Thu, 11 Aug 2022 22:10:05 +0100 Subject: [PATCH 59/59] Fix mypy errors attributed to `pytorch_lightning.strategies.sharded_spawn` (#14102) Co-authored-by: rohitgr7 Co-authored-by: Jirka Borovec Co-authored-by: awaelchli --- pyproject.toml | 1 - src/pytorch_lightning/overrides/base.py | 1 + src/pytorch_lightning/strategies/sharded_spawn.py | 14 +++++++++----- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b5e806bc69900..9f7cc28d0b002 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,7 +58,6 @@ module = [ "pytorch_lightning.profilers.base", "pytorch_lightning.profilers.pytorch", "pytorch_lightning.strategies.sharded", - "pytorch_lightning.strategies.sharded_spawn", "pytorch_lightning.trainer.callback_hook", "pytorch_lightning.trainer.connectors.data_connector", "pytorch_lightning.trainer.supporters", diff --git a/src/pytorch_lightning/overrides/base.py b/src/pytorch_lightning/overrides/base.py index 26c2837bda7e3..3e9fda2f966f5 100644 --- a/src/pytorch_lightning/overrides/base.py +++ b/src/pytorch_lightning/overrides/base.py @@ -75,6 +75,7 @@ def forward(self, *inputs: Any, **kwargs: Any) -> Any: trainer = pl_module._trainer if trainer is not None: + assert isinstance(self.module, (pl.LightningModule, _LightningPrecisionModuleWrapperBase)) if trainer.training: output = self.module.training_step(*inputs, **kwargs) # In manual_optimization, we need to prevent DDP reducer as diff --git a/src/pytorch_lightning/strategies/sharded_spawn.py b/src/pytorch_lightning/strategies/sharded_spawn.py index
4550e397ded80..882302e101cb6 100644 --- a/src/pytorch_lightning/strategies/sharded_spawn.py +++ b/src/pytorch_lightning/strategies/sharded_spawn.py @@ -12,13 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. from contextlib import contextmanager -from typing import Dict, Generator, List, Optional, Tuple +from typing import Any, Dict, Generator, List, Optional, Tuple from torch import Tensor from torch.nn import Module from torch.optim import Optimizer import pytorch_lightning as pl +from pytorch_lightning.overrides.base import _LightningPrecisionModuleWrapperBase from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy from pytorch_lightning.trainer.states import TrainerFn @@ -42,7 +43,9 @@ class DDPSpawnShardedStrategy(DDPSpawnStrategy): def configure_ddp(self) -> None: # set up optimizers after the wrapped module has been moved to the device + assert self.lightning_module is not None self.setup_optimizers(self.lightning_module.trainer) + assert isinstance(self.model, (pl.LightningModule, _LightningPrecisionModuleWrapperBase)) self.model, self.optimizers = self._setup_model_and_optimizers( model=LightningShardedDataParallel(self.model), optimizers=self.optimizers ) @@ -69,12 +72,13 @@ def _reinit_optimizers_with_oss(self, optimizers: List[Optimizer]) -> List["OSS" return optimizers def _wrap_optimizers(self, optimizers: List[Optimizer]) -> List["OSS"]: - if self.model is not None and self.model.trainer.state.fn != TrainerFn.FITTING: + assert self.lightning_module + if self.model is not None and self.lightning_module.trainer.state.fn != TrainerFn.FITTING: return optimizers return self._reinit_optimizers_with_oss(optimizers) - def optimizer_state(self, optimizer: "OSS") -> Optional[dict]: + def optimizer_state(self, optimizer: "OSS") -> Dict[str, Any]: if isinstance(optimizer, OSS): optimizer.consolidate_state_dict() return 
self._optim_state_dict(optimizer) @@ -93,7 +97,7 @@ def block_backward_sync(self) -> Generator: yield None @rank_zero_only - def _optim_state_dict(self, optimizer): + def _optim_state_dict(self, optimizer: Optimizer) -> Dict[str, Any]: """ Retrieves state dict only on rank 0, which contains the entire optimizer state after calling :meth:`consolidate_state_dict`. @@ -112,7 +116,7 @@ def lightning_module(self) -> Optional["pl.LightningModule"]: def pre_backward(self, closure_loss: Tensor) -> None: pass - def post_training_step(self): + def post_training_step(self) -> None: pass @classmethod