From 5c05719f27b160a7f6db9345542182986d4feb27 Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Mon, 8 Aug 2022 08:15:54 +0200
Subject: [PATCH 01/59] Freeze requirements for CI (#14007)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* freeze requirements
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* typo
* typo
* ui
* mypy
* todo
* mypy
Co-authored-by: Carlos Mocholí
* mypy
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Carlos Mocholí
Co-authored-by: Akihiro Nitta
---
.github/workflows/code-checks.yml | 1 +
requirements/app/base.txt | 7 +++----
requirements/app/cloud.txt | 3 +--
requirements/app/docs.txt | 17 ++++++++---------
requirements/app/test.txt | 15 +++++----------
requirements/app/ui.txt | 2 +-
requirements/pytorch/docs.txt | 16 ++++++++--------
requirements/pytorch/test.txt | 27 +++++++++++++--------------
8 files changed, 40 insertions(+), 48 deletions(-)
diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml
index ed9cd46adbe44..7b5f3f26602e8 100644
--- a/.github/workflows/code-checks.yml
+++ b/.github/workflows/code-checks.yml
@@ -34,6 +34,7 @@ jobs:
run: |
pip install torch==1.11 --find-links https://download.pytorch.org/whl/cpu/torch_stable.html
python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt
+ # todo: adjust requirements for both code-bases
pip install -r requirements/pytorch/devel.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html
pip list
diff --git a/requirements/app/base.txt b/requirements/app/base.txt
index 0a0b9cdb4719d..02eeb04bfa218 100644
--- a/requirements/app/base.txt
+++ b/requirements/app/base.txt
@@ -1,9 +1,8 @@
-py
lightning-cloud==0.5.0
packaging
-deepdiff>=5.7.0
+deepdiff>=5.7.0, <=5.8.1
starsessions
-fsspec>=2022.01.0
-s3fs>=2022.1.0
+fsspec>=2022.01.0, <=2022.7.1
+s3fs>=2022.1.0, <=2022.7.1
croniter # for now, until we find something more robust.
traitlets<5.2.0 # Traitlets 5.2.X fails: https://github.com/ipython/traitlets/issues/741
diff --git a/requirements/app/cloud.txt b/requirements/app/cloud.txt
index 5f8bf0c48692f..ff18d47b44565 100644
--- a/requirements/app/cloud.txt
+++ b/requirements/app/cloud.txt
@@ -1,5 +1,4 @@
starsessions
redis>=4.0.0, <=4.2.4
docker==5.0.3
-setuptools==59.5.0
-s3fs==2022.1.0
+# setuptools==59.5.0
diff --git a/requirements/app/docs.txt b/requirements/app/docs.txt
index b35cc585b40c7..bf22aef2c2d92 100644
--- a/requirements/app/docs.txt
+++ b/requirements/app/docs.txt
@@ -1,18 +1,17 @@
sphinx>=4.0,<5.0
-myst-parser>=0.15
-nbsphinx>=0.8.5
+myst-parser>=0.15,<0.17
+nbsphinx>=0.8.5, <=0.8.9
ipython[notebook]
ipython_genutils
-pandoc>=1.0
-docutils>=0.16
-sphinxcontrib-fulltoc>=1.0
+pandoc>=1.0, <=2.2
+docutils>=0.16, <0.19
+sphinxcontrib-fulltoc>=1.0, <=1.2.0
sphinxcontrib-mockautodoc
https://storage.googleapis.com/grid-packages/lightning-ai-sphinx-theme/build-31.3.zip
sphinx-autodoc-typehints>=1.0,<1.15 # v1.15 failing on master (#11405)
-sphinx-paramlinks>=0.5.1
-sphinx-togglebutton>=0.2
-sphinx-copybutton>=0.3
+sphinx-paramlinks>=0.5.1, <=0.5.4
+sphinx-togglebutton>=0.2, <=0.3.2
+sphinx-copybutton>=0.3, <=0.5.0
sphinx-autobuild
-typing-extensions # already in `requirements.txt` but the docs CI job does not install it
jinja2>=3.0.0,<3.1.0
diff --git a/requirements/app/test.txt b/requirements/app/test.txt
index 9d2ed0af910ca..ab5ef8f1e85ac 100644
--- a/requirements/app/test.txt
+++ b/requirements/app/test.txt
@@ -1,15 +1,10 @@
-coverage>=5.0
-codecov>=2.1
-pytest>=5.0
-pytest-timeout
-pytest-cov
+coverage>=6.4, <=6.4.2
+codecov>=2.1, <=2.1.12
+pytest>=7.0, <=7.1.2
+pytest-timeout <=2.1.0
+pytest-cov <=3.0.0
playwright==1.22.0
# pytest-flake8
-flake8>=3.0
-check-manifest
-twine>=3.2
-isort>=5.0
-mypy>=0.720
httpx
trio
pympler
diff --git a/requirements/app/ui.txt b/requirements/app/ui.txt
index 28df7f9c2ffe0..f0e4b2cdef471 100644
--- a/requirements/app/ui.txt
+++ b/requirements/app/ui.txt
@@ -1 +1 @@
-streamlit>=1.3.1
+streamlit>=1.3.1, <=1.11.1
diff --git a/requirements/pytorch/docs.txt b/requirements/pytorch/docs.txt
index e6fbbe322b6bf..50e7c2049f6f6 100644
--- a/requirements/pytorch/docs.txt
+++ b/requirements/pytorch/docs.txt
@@ -1,16 +1,16 @@
sphinx>=4.0,<5.0
myst-parser>=0.15,<0.17
-nbsphinx>=0.8.5
+nbsphinx>=0.8.5, <=0.8.9
ipython[notebook]
-pandoc>=1.0
-docutils>=0.16
-sphinxcontrib-fulltoc>=1.0
+pandoc>=1.0, <=2.2
+docutils>=0.16, <0.19
+sphinxcontrib-fulltoc>=1.0, <=1.2.0
sphinxcontrib-mockautodoc
pt-lightning-sphinx-theme @ https://github.com/Lightning-AI/lightning_sphinx_theme/archive/master.zip
-sphinx-autodoc-typehints>=1.11,<1.15 # v1.15 failing on master (#11405)
-sphinx-paramlinks>=0.5.1
-sphinx-togglebutton>=0.2
-sphinx-copybutton>=0.3
+sphinx-autodoc-typehints>=1.11,<1.15 # strict; v1.15 failing on master (#11405)
+sphinx-paramlinks>=0.5.1, <=0.5.4
+sphinx-togglebutton>=0.2, <=0.3.2
+sphinx-copybutton>=0.3, <=0.5.0
typing-extensions # already in `requirements.txt` but the docs CI job does not install it
jinja2>=3.0.0,<3.1.0
diff --git a/requirements/pytorch/test.txt b/requirements/pytorch/test.txt
index ce54cd087b1de..c155400a3d35f 100644
--- a/requirements/pytorch/test.txt
+++ b/requirements/pytorch/test.txt
@@ -1,18 +1,17 @@
-coverage>=6.4
-codecov>=2.1
-pytest>=7.0
-pytest-cov
-pytest-forked
+coverage>=6.4, <=6.4.2
+codecov>=2.1, <=2.1.12
+pytest>=7.0, <=7.1.2
+pytest-cov <=3.0.0
+pytest-forked <=1.4.0
pytest-rerunfailures>=10.2
-mypy>=0.920
-flake8>=3.9.2
pre-commit>=1.0
+mypy==0.971
# needed in tests
-cloudpickle>=1.3
-scikit-learn>0.22.1
-onnxruntime
-psutil # for `DeviceStatsMonitor`
-pandas # needed in benchmarks
-fastapi
-uvicorn
+cloudpickle>=1.3, <=2.1.0
+scikit-learn>0.22.1, <=1.1.1
+onnxruntime<=1.12.0
+psutil<=5.9.1 # for `DeviceStatsMonitor`
+pandas>1.0, <=1.4.3 # needed in benchmarks
+fastapi<=0.79.0
+uvicorn<=0.18.2
From 76836a33cdfa63e2c85c6f4ea9b2a1f174c973e2 Mon Sep 17 00:00:00 2001
From: Carlos Mocholí
Date: Mon, 8 Aug 2022 10:06:41 +0200
Subject: [PATCH 02/59] Run mypy with PyTorch 1.12 (#14044)
---
.github/workflows/code-checks.yml | 2 +-
pyproject.toml | 1 -
.../plugins/precision/fully_sharded_native_amp.py | 2 +-
.../strategies/fully_sharded_native.py | 2 +-
.../strategies/launchers/multiprocessing.py | 2 +-
src/pytorch_lightning/utilities/cloud_io.py | 11 ++++++-----
6 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml
index 7b5f3f26602e8..15bd5e9911740 100644
--- a/.github/workflows/code-checks.yml
+++ b/.github/workflows/code-checks.yml
@@ -32,7 +32,7 @@ jobs:
- name: Install dependencies
run: |
- pip install torch==1.11 --find-links https://download.pytorch.org/whl/cpu/torch_stable.html
+ pip install torch==1.12 --find-links https://download.pytorch.org/whl/cpu/torch_stable.html
python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt
# todo: adjust requirements for both code-bases
pip install -r requirements/pytorch/devel.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html
diff --git a/pyproject.toml b/pyproject.toml
index 5473e73c52e19..9b8400ba27577 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -52,7 +52,6 @@ module = [
"pytorch_lightning.callbacks.progress.rich_progress",
"pytorch_lightning.callbacks.quantization",
"pytorch_lightning.core.datamodule",
- "pytorch_lightning.core.decorators",
"pytorch_lightning.core.module",
"pytorch_lightning.core.saving",
"pytorch_lightning.demos.boring_classes",
diff --git a/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py b/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py
index 8c693f2975bbd..60e53b880c84d 100644
--- a/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py
+++ b/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py
@@ -23,7 +23,7 @@
if _TORCH_GREATER_EQUAL_1_12:
from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision
else:
- MixedPrecision = None
+ MixedPrecision = None # type: ignore[misc,assignment]
class FullyShardedNativeMixedPrecisionPlugin(ShardedNativeMixedPrecisionPlugin):
diff --git a/src/pytorch_lightning/strategies/fully_sharded_native.py b/src/pytorch_lightning/strategies/fully_sharded_native.py
index 4c351f26fa3b9..d92931fb5cdb2 100644
--- a/src/pytorch_lightning/strategies/fully_sharded_native.py
+++ b/src/pytorch_lightning/strategies/fully_sharded_native.py
@@ -51,7 +51,7 @@
)
from torch.distributed.fsdp.wrap import enable_wrap
else:
- MixedPrecision = None
+ MixedPrecision = None # type: ignore[misc,assignment]
BackwardPrefetch = None # type: ignore[misc,assignment]
CPUOffload = None # type: ignore[misc,assignment]
diff --git a/src/pytorch_lightning/strategies/launchers/multiprocessing.py b/src/pytorch_lightning/strategies/launchers/multiprocessing.py
index 39bba092e9c60..2617e5fe27b10 100644
--- a/src/pytorch_lightning/strategies/launchers/multiprocessing.py
+++ b/src/pytorch_lightning/strategies/launchers/multiprocessing.py
@@ -144,7 +144,7 @@ def _recover_results_in_main_process(self, worker_output: "_WorkerOutput", train
# load last weights
if worker_output.weights_path is not None:
ckpt = self._strategy.checkpoint_io.load_checkpoint(worker_output.weights_path)
- trainer.lightning_module.load_state_dict(ckpt) # type: ignore[arg-type]
+ trainer.lightning_module.load_state_dict(ckpt)
self._strategy.checkpoint_io.remove_checkpoint(worker_output.weights_path)
trainer.state = worker_output.trainer_state
diff --git a/src/pytorch_lightning/utilities/cloud_io.py b/src/pytorch_lightning/utilities/cloud_io.py
index 81482a8ab24f9..ee3358be59541 100644
--- a/src/pytorch_lightning/utilities/cloud_io.py
+++ b/src/pytorch_lightning/utilities/cloud_io.py
@@ -22,14 +22,12 @@
from fsspec.core import url_to_fs
from fsspec.implementations.local import AbstractFileSystem
-from pytorch_lightning.utilities.types import _PATH
+from pytorch_lightning.utilities.types import _DEVICE, _PATH
def load(
path_or_url: Union[IO, _PATH],
- map_location: Optional[
- Union[str, Callable, torch.device, Dict[Union[str, torch.device], Union[str, torch.device]]]
- ] = None,
+ map_location: Optional[Union[_DEVICE, Callable[[_DEVICE], _DEVICE], Dict[_DEVICE, _DEVICE]]] = None,
) -> Any:
"""Loads a checkpoint.
@@ -41,7 +39,10 @@ def load(
# any sort of BytesIO or similar
return torch.load(path_or_url, map_location=map_location)
if str(path_or_url).startswith("http"):
- return torch.hub.load_state_dict_from_url(str(path_or_url), map_location=map_location)
+ return torch.hub.load_state_dict_from_url(
+ str(path_or_url),
+ map_location=map_location, # type: ignore[arg-type] # upstream annotation is not correct
+ )
fs = get_filesystem(path_or_url)
with fs.open(path_or_url, "rb") as f:
return torch.load(f, map_location=map_location)
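For reference, a minimal usage sketch of the retyped `load` helper; the checkpoint paths below are hypothetical:

    from pytorch_lightning.utilities.cloud_io import load

    # map_location as a plain device string
    ckpt = load("checkpoints/last.ckpt", map_location="cpu")

    # map_location as a {source: target} device mapping
    ckpt = load("s3://bucket/model.ckpt", map_location={"cuda:0": "cpu"})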
From aaeff90254aa0a1b91aaed759d15e66123533618 Mon Sep 17 00:00:00 2001
From: Carlos Mocholí
Date: Mon, 8 Aug 2022 10:07:54 +0200
Subject: [PATCH 03/59] Remove deprecated `DistributedType` and `DeviceType`
enum classes (#14045)
---
src/pytorch_lightning/CHANGELOG.md | 8 +-
src/pytorch_lightning/utilities/__init__.py | 1 -
src/pytorch_lightning/utilities/enums.py | 95 +------------------
.../deprecated_api/test_remove_1-8.py | 13 ---
4 files changed, 8 insertions(+), 109 deletions(-)
diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
index 5d77a3ad293b9..565ef0e8438b5 100644
--- a/src/pytorch_lightning/CHANGELOG.md
+++ b/src/pytorch_lightning/CHANGELOG.md
@@ -30,7 +30,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Deprecated `amp_level` from `Trainer` in favour of passing it explicitly via precision plugin ([#13898](https://github.com/Lightning-AI/lightning/pull/13898))
--
+- Deprecated the calls to `pytorch_lightning.utilities.meta` functions in favor of built-in https://github.com/pytorch/torchdistx support ([#13868](https://github.com/Lightning-AI/lightning/pull/13868))
### Removed
@@ -44,6 +44,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Removed the deprecated `DDP2Strategy` ([#14026](https://github.com/Lightning-AI/lightning/pull/14026))
+- Removed the deprecated `DistributedType` and `DeviceType` enum classes ([#14045](https://github.com/Lightning-AI/lightning/pull/14045))
+
+
+- Removed the experimental `pytorch_lightning.utilities.meta` functions in favor of built-in https://github.com/pytorch/torchdistx support ([#13868](https://github.com/Lightning-AI/lightning/pull/13868))
+
+
### Fixed
- Casted only floating point tensors to fp16 with IPUs ([#13983](https://github.com/Lightning-AI/lightning/pull/13983))
diff --git a/src/pytorch_lightning/utilities/__init__.py b/src/pytorch_lightning/utilities/__init__.py
index df5084dd85490..c849ba0a05d68 100644
--- a/src/pytorch_lightning/utilities/__init__.py
+++ b/src/pytorch_lightning/utilities/__init__.py
@@ -21,7 +21,6 @@
_AcceleratorType,
_StrategyType,
AMPType,
- DistributedType,
GradClipAlgorithmType,
LightningEnum,
)
diff --git a/src/pytorch_lightning/utilities/enums.py b/src/pytorch_lightning/utilities/enums.py
index e687d3f9f046b..06d616f87259f 100644
--- a/src/pytorch_lightning/utilities/enums.py
+++ b/src/pytorch_lightning/utilities/enums.py
@@ -15,11 +15,9 @@
from __future__ import annotations
import os
-from enum import Enum, EnumMeta
-from typing import Any
+from enum import Enum
from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from pytorch_lightning.utilities.warnings import rank_zero_deprecation
class LightningEnum(str, Enum):
@@ -43,37 +41,6 @@ def __hash__(self) -> int:
return hash(self.value.lower())
-class _DeprecatedEnumMeta(EnumMeta):
- """Enum that calls `deprecate()` whenever a member is accessed.
-
- Adapted from: https://stackoverflow.com/a/62309159/208880
- """
-
- def __getattribute__(cls, name: str) -> Any:
- obj = super().__getattribute__(name)
- # ignore __dunder__ names -- prevents potential recursion errors
- if not (name.startswith("__") and name.endswith("__")) and isinstance(obj, Enum):
- obj.deprecate()
- return obj
-
- def __getitem__(cls, name: str) -> Any:
- member: _DeprecatedEnumMeta = super().__getitem__(name)
- member.deprecate()
- return member
-
- def __call__(cls, *args: Any, **kwargs: Any) -> Any:
- obj = super().__call__(*args, **kwargs)
- if isinstance(obj, Enum):
- obj.deprecate()
- return obj
-
-
-class _DeprecatedEnum(LightningEnum, metaclass=_DeprecatedEnumMeta):
- """_DeprecatedEnum calls an enum's `deprecate()` method on member access."""
-
- pass
-
-
class AMPType(LightningEnum):
"""Type of Automatic Mixed Precission used for training.
@@ -110,66 +77,6 @@ def supported_types() -> list[str]:
return [x.value for x in PrecisionType]
-class DistributedType(_DeprecatedEnum):
- """Define type of training strategy.
-
- Deprecated since v1.6.0 and will be removed in v1.8.0.
-
- Use `_StrategyType` instead.
- """
-
- DP = "dp"
- DDP = "ddp"
- DDP_SPAWN = "ddp_spawn"
- TPU_SPAWN = "tpu_spawn"
- DEEPSPEED = "deepspeed"
- HOROVOD = "horovod"
- DDP_SHARDED = "ddp_sharded"
- DDP_SHARDED_SPAWN = "ddp_sharded_spawn"
- DDP_FULLY_SHARDED = "ddp_fully_sharded"
- HPU_PARALLEL = "hpu_parallel"
-
- @staticmethod
- def interactive_compatible_types() -> list[DistributedType]:
- """Returns a list containing interactive compatible DistributeTypes."""
- return [
- DistributedType.DP,
- DistributedType.DDP_SPAWN,
- DistributedType.DDP_SHARDED_SPAWN,
- DistributedType.TPU_SPAWN,
- ]
-
- def is_interactive_compatible(self) -> bool:
- """Returns whether self is interactive compatible."""
- return self in DistributedType.interactive_compatible_types()
-
- def deprecate(self) -> None:
- rank_zero_deprecation(
- "`DistributedType` Enum has been deprecated in v1.6 and will be removed in v1.8."
- f" Use the string value `{self.value!r}` instead."
- )
-
-
-class DeviceType(_DeprecatedEnum):
- """Define Device type by its nature - accelerators.
-
- Deprecated since v1.6.0 and will be removed in v1.8.0.
-
- Use `_AcceleratorType` instead.
- """
-
- CPU = "CPU"
- GPU = "GPU"
- IPU = "IPU"
- TPU = "TPU"
-
- def deprecate(self) -> None:
- rank_zero_deprecation(
- "`DeviceType` Enum has been deprecated in v1.6 and will be removed in v1.8."
- f" Use the string value `{self.value!r}` instead."
- )
-
-
class GradClipAlgorithmType(LightningEnum):
"""Define gradient_clip_algorithm types - training-tricks.
NORM type means "clipping gradients by norm". This is computed over all model parameters together.
diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py
index aa6c1a615f9d2..91be34c55078f 100644
--- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py
+++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py
@@ -36,7 +36,6 @@
from pytorch_lightning.trainer.states import RunningStage
from pytorch_lightning.utilities import device_parser
from pytorch_lightning.utilities.apply_func import move_data_to_device
-from pytorch_lightning.utilities.enums import DeviceType, DistributedType
from pytorch_lightning.utilities.imports import _TORCHTEXT_LEGACY
from pytorch_lightning.utilities.rank_zero import rank_zero_only, rank_zero_warn
from tests_pytorch.deprecated_api import no_deprecated_call
@@ -44,18 +43,6 @@
from tests_pytorch.helpers.torchtext_utils import get_dummy_torchtext_data_iterator
-def test_v1_8_0_deprecated_distributed_type_enum():
-
- with pytest.deprecated_call(match="has been deprecated in v1.6 and will be removed in v1.8."):
- _ = DistributedType.DDP
-
-
-def test_v1_8_0_deprecated_device_type_enum():
-
- with pytest.deprecated_call(match="has been deprecated in v1.6 and will be removed in v1.8."):
- _ = DeviceType.CPU
-
-
@pytest.mark.skipif(not _TORCHTEXT_LEGACY, reason="torchtext.legacy is deprecated.")
def test_v1_8_0_deprecated_torchtext_batch():
From 355fda3702d640330fddbe25ad127879b0a7cbfa Mon Sep 17 00:00:00 2001
From: Dan Dale
Date: Mon, 8 Aug 2022 01:16:53 -0700
Subject: [PATCH 04/59] Add Promoted CLI to API Reference Section (#14072)
---
docs/source-pytorch/api_references.rst | 14 ++++++++++++++
1 file changed, 14 insertions(+)
diff --git a/docs/source-pytorch/api_references.rst b/docs/source-pytorch/api_references.rst
index db4fc1e2c4cf8..8daed5ddcaf41 100644
--- a/docs/source-pytorch/api_references.rst
+++ b/docs/source-pytorch/api_references.rst
@@ -47,6 +47,20 @@ callbacks
Timer
TQDMProgressBar
+cli
+-----
+
+.. currentmodule:: pytorch_lightning.cli
+
+.. autosummary::
+ :toctree: api
+ :nosignatures:
+ :template: classtemplate.rst
+
+ LightningCLI
+ LightningArgumentParser
+ SaveConfigCallback
+
core
----
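The promoted classes need only a few lines to be useful; a minimal sketch using the in-tree demo model:

    from pytorch_lightning.cli import LightningCLI
    from pytorch_lightning.demos.boring_classes import BoringModel

    # parses command-line arguments, then builds and runs a Trainer around the model
    if __name__ == "__main__":
        LightningCLI(BoringModel)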
From 5271ed93e6823178d1698d150b7146fe7a288695 Mon Sep 17 00:00:00 2001
From: Krishna Kalyan
Date: Mon, 8 Aug 2022 10:03:52 +0100
Subject: [PATCH 05/59] Fix mypy errors attributed to
`pytorch_lightning.trainer.connectors.callback_connector.py` (#13750)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* Apply suggestions from code review
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: otaj <6065855+otaj@users.noreply.github.com>
Co-authored-by: Adrian Wälchli
Co-authored-by: Rohit Gupta
---
pyproject.toml | 1 -
.../trainer/connectors/callback_connector.py | 26 +++++++++++--------
2 files changed, 15 insertions(+), 12 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml
index 9b8400ba27577..2f0e290440f44 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -63,7 +63,6 @@ module = [
"pytorch_lightning.strategies.sharded",
"pytorch_lightning.strategies.sharded_spawn",
"pytorch_lightning.trainer.callback_hook",
- "pytorch_lightning.trainer.connectors.callback_connector",
"pytorch_lightning.trainer.connectors.data_connector",
"pytorch_lightning.trainer.supporters",
"pytorch_lightning.trainer.trainer",
diff --git a/src/pytorch_lightning/trainer/connectors/callback_connector.py b/src/pytorch_lightning/trainer/connectors/callback_connector.py
index 83881905beeb1..bb7f912420256 100644
--- a/src/pytorch_lightning/trainer/connectors/callback_connector.py
+++ b/src/pytorch_lightning/trainer/connectors/callback_connector.py
@@ -17,6 +17,7 @@
from datetime import timedelta
from typing import Dict, List, Optional, Sequence, Union
+import pytorch_lightning as pl
from pytorch_lightning.callbacks import (
Callback,
Checkpoint,
@@ -37,7 +38,7 @@
class CallbackConnector:
- def __init__(self, trainer):
+ def __init__(self, trainer: "pl.Trainer"):
self.trainer = trainer
def on_trainer_init(
@@ -50,7 +51,7 @@ def on_trainer_init(
enable_model_summary: bool,
max_time: Optional[Union[str, timedelta, Dict[str, int]]] = None,
accumulate_grad_batches: Optional[Union[int, Dict[int, int]]] = None,
- ):
+ ) -> None:
# init folder paths for checkpoint + weights save callbacks
self.trainer._default_root_dir = default_root_dir or os.getcwd()
if weights_save_path:
@@ -95,16 +96,18 @@ def on_trainer_init(
def _configure_accumulated_gradients(
self, accumulate_grad_batches: Optional[Union[int, Dict[int, int]]] = None
) -> None:
- grad_accum_callback = [cb for cb in self.trainer.callbacks if isinstance(cb, GradientAccumulationScheduler)]
+ grad_accum_callbacks: List[GradientAccumulationScheduler] = [
+ cb for cb in self.trainer.callbacks if isinstance(cb, GradientAccumulationScheduler)
+ ]
- if grad_accum_callback:
+ if grad_accum_callbacks:
if accumulate_grad_batches is not None:
raise MisconfigurationException(
"You have set both `accumulate_grad_batches` and passed an instance of "
"`GradientAccumulationScheduler` inside callbacks. Either remove `accumulate_grad_batches` "
"from trainer or remove `GradientAccumulationScheduler` from callbacks list."
)
- grad_accum_callback = grad_accum_callback[0]
+ grad_accum_callback = grad_accum_callbacks[0]
else:
if accumulate_grad_batches is None:
accumulate_grad_batches = 1
@@ -148,6 +151,7 @@ def _configure_model_summary_callback(self, enable_model_summary: bool) -> None:
progress_bar_callback = self.trainer.progress_bar_callback
is_progress_bar_rich = isinstance(progress_bar_callback, RichProgressBar)
+ model_summary: ModelSummary
if progress_bar_callback is not None and is_progress_bar_rich:
model_summary = RichModelSummary()
else:
@@ -188,7 +192,7 @@ def _configure_timer_callback(self, max_time: Optional[Union[str, timedelta, Dic
timer = Timer(duration=max_time, interval="step")
self.trainer.callbacks.append(timer)
- def _configure_fault_tolerance_callbacks(self):
+ def _configure_fault_tolerance_callbacks(self) -> None:
from pytorch_lightning.callbacks.fault_tolerance import _FaultToleranceCheckpoint
if any(isinstance(cb, _FaultToleranceCheckpoint) for cb in self.trainer.callbacks):
@@ -196,7 +200,7 @@ def _configure_fault_tolerance_callbacks(self):
# don't use `log_dir` to minimize the chances of failure
self.trainer.callbacks.append(_FaultToleranceCheckpoint(dirpath=self.trainer.default_root_dir))
- def _attach_model_logging_functions(self):
+ def _attach_model_logging_functions(self) -> None:
lightning_module = self.trainer.lightning_module
for callback in self.trainer.callbacks:
callback.log = lightning_module.log
@@ -243,7 +247,7 @@ def _reorder_callbacks(callbacks: List[Callback]) -> List[Callback]:
A new list in which the last elements are Checkpoint if there were any present in the
input.
"""
- checkpoints = [c for c in callbacks if isinstance(c, Checkpoint)]
+ checkpoints: List[Callback] = [c for c in callbacks if isinstance(c, Checkpoint)]
not_checkpoints = [c for c in callbacks if not isinstance(c, Checkpoint)]
return not_checkpoints + checkpoints
@@ -263,12 +267,12 @@ def _configure_external_callbacks() -> List[Callback]:
else:
from pkg_resources import iter_entry_points
- factories = iter_entry_points("pytorch_lightning.callbacks_factory")
+ factories = iter_entry_points("pytorch_lightning.callbacks_factory") # type: ignore[assignment]
- external_callbacks = []
+ external_callbacks: List[Callback] = []
for factory in factories:
callback_factory = factory.load()
- callbacks_list: List[Callback] = callback_factory()
+ callbacks_list: Union[List[Callback], Callback] = callback_factory()
callbacks_list = [callbacks_list] if isinstance(callbacks_list, Callback) else callbacks_list
_log.info(
f"Adding {len(callbacks_list)} callbacks from entry point '{factory.name}':"
From 5c9b352eea38d39360324f9740e119dc42b2078e Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 8 Aug 2022 09:25:15 +0000
Subject: [PATCH 06/59] Update wandb requirement from <0.12.20,>=0.10.22 to
>=0.10.22,<0.13.2 in /requirements (#14080)
---
requirements/pytorch/loggers.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/requirements/pytorch/loggers.txt b/requirements/pytorch/loggers.txt
index 48a15c30f842f..df83a077f8457 100644
--- a/requirements/pytorch/loggers.txt
+++ b/requirements/pytorch/loggers.txt
@@ -7,4 +7,4 @@ neptune-client>=0.10.0, <0.16.4
comet-ml>=3.1.12, <3.31.8
mlflow>=1.0.0, <1.28.0
test_tube>=0.7.5, <=0.7.5
-wandb>=0.10.22, <0.12.20
+wandb>=0.10.22, <0.13.2
From b4ade232c8d8889fcadbf9b7b49380a3690f8acd Mon Sep 17 00:00:00 2001
From: Rick Izzo
Date: Mon, 8 Aug 2022 07:13:25 -0400
Subject: [PATCH 07/59] Fix: Start Lightning App on Cloud if Repo Begins With
Name "Lightning" (#14025)
---
.../utilities/packaging/lightning_utils.py | 7 ++++++-
.../utilities/packaging/test_lightning_utils.py | 16 ++++++++++++++++
2 files changed, 22 insertions(+), 1 deletion(-)
diff --git a/src/lightning_app/utilities/packaging/lightning_utils.py b/src/lightning_app/utilities/packaging/lightning_utils.py
index 37f4ff22988eb..073d4d7ab613a 100644
--- a/src/lightning_app/utilities/packaging/lightning_utils.py
+++ b/src/lightning_app/utilities/packaging/lightning_utils.py
@@ -89,8 +89,13 @@ def get_dist_path_if_editable_install(project_name) -> str:
def _prepare_lightning_wheels_and_requirements(root: Path) -> Optional[Callable]:
+ """This function determines if lightning is installed in editable mode (for developers) and packages the
+ current lightning source along with the app.
- if "site-packages" in _PROJECT_ROOT:
+ For normal users who install via PyPI or Conda, this function does nothing.
+ """
+
+ if not get_dist_path_if_editable_install("lightning"):
return
# Packaging the Lightning codebase happens only inside the `lightning` repo.
diff --git a/tests/tests_app/utilities/packaging/test_lightning_utils.py b/tests/tests_app/utilities/packaging/test_lightning_utils.py
index b34e3162d5a0c..8f30aa21dd396 100644
--- a/tests/tests_app/utilities/packaging/test_lightning_utils.py
+++ b/tests/tests_app/utilities/packaging/test_lightning_utils.py
@@ -1,4 +1,5 @@
import os
+from unittest import mock
import pytest
@@ -21,6 +22,21 @@ def test_prepare_lightning_wheels_and_requirement(tmpdir):
assert os.listdir(tmpdir) == []
+def _mocked_get_dist_path_if_editable_install(*args, **kwargs):
+ return None
+
+
+@mock.patch(
+ "lightning_app.utilities.packaging.lightning_utils.get_dist_path_if_editable_install",
+ new=_mocked_get_dist_path_if_editable_install,
+)
+def test_prepare_lightning_wheels_and_requirement_for_packages_installed_in_editable_mode(tmpdir):
+ """This test ensures the source does not get packaged inside the lightning repo if not installed in editable
+ mode."""
+ cleanup_handle = _prepare_lightning_wheels_and_requirements(tmpdir)
+ assert cleanup_handle is None
+
+
@pytest.mark.skip(reason="TODO: Find a way to check for the latest version")
@RunIf(skip_windows=True)
def test_verify_lightning_version(monkeypatch):
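For context, a minimal sketch of how the editable-install check above is consumed; per the diff, the helper returns a falsy value for a regular site-packages install:

    from lightning_app.utilities.packaging.lightning_utils import get_dist_path_if_editable_install

    if get_dist_path_if_editable_install("lightning"):
        print("editable install: the local lightning source gets packaged with the app")
    else:
        print("regular PyPI/Conda install: nothing to package")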
From d072e4451a73f8fc2d7886086a220fbaf614b49e Mon Sep 17 00:00:00 2001
From: Carlos Mocholí
Date: Mon, 8 Aug 2022 13:35:06 +0200
Subject: [PATCH 08/59] Fix dtype inference during gradient norm computation
(#14051)
---
src/pytorch_lightning/CHANGELOG.md | 3 +++
src/pytorch_lightning/utilities/grads.py | 6 +++---
tests/tests_pytorch/utilities/test_grads.py | 14 ++++++++++++++
3 files changed, 20 insertions(+), 3 deletions(-)
diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
index 565ef0e8438b5..915436e5a0bcf 100644
--- a/src/pytorch_lightning/CHANGELOG.md
+++ b/src/pytorch_lightning/CHANGELOG.md
@@ -67,6 +67,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed MPS device being unrecognized ([#13992](https://github.com/Lightning-AI/lightning/pull/13992))
+- Fixed dtype inference during gradient norm computation ([#14051](https://github.com/Lightning-AI/lightning/pull/14051))
+
+
## [1.7.0] - 2022-08-02
### Added
diff --git a/src/pytorch_lightning/utilities/grads.py b/src/pytorch_lightning/utilities/grads.py
index 66c1b7d988522..76c3f39bdc013 100644
--- a/src/pytorch_lightning/utilities/grads.py
+++ b/src/pytorch_lightning/utilities/grads.py
@@ -41,12 +41,12 @@ def grad_norm(module: Module, norm_type: Union[float, int, str], group_separator
raise ValueError(f"`norm_type` must be a positive number or 'inf' (infinity norm). Got {norm_type}")
norms = {
- f"grad_{norm_type}_norm{group_separator}{name}": p.grad.data.norm(norm_type).item()
+ f"grad_{norm_type}_norm{group_separator}{name}": p.grad.data.norm(norm_type)
for name, p in module.named_parameters()
if p.grad is not None
}
if norms:
- total_norm = torch.tensor(list(norms.values())).norm(norm_type).item()
+ total_norm = torch.tensor(list(norms.values())).norm(norm_type)
norms[f"grad_{norm_type}_norm_total"] = total_norm
- norms = {k: round(v, 4) for k, v in norms.items()}
+ norms = {k: round(v.item(), 4) for k, v in norms.items()}
return norms
diff --git a/tests/tests_pytorch/utilities/test_grads.py b/tests/tests_pytorch/utilities/test_grads.py
index a548de66ab85d..49aab76403847 100644
--- a/tests/tests_pytorch/utilities/test_grads.py
+++ b/tests/tests_pytorch/utilities/test_grads.py
@@ -76,3 +76,17 @@ def __init__(self):
def test_grad_norm_invalid_norm_type(norm_type):
with pytest.raises(ValueError, match="`norm_type` must be a positive number or 'inf'"):
grad_norm(Mock(), norm_type)
+
+
+def test_grad_norm_with_double_dtype():
+ class Model(nn.Module):
+ def __init__(self):
+ super().__init__()
+ dtype = torch.double
+ self.param = nn.Parameter(torch.tensor(1.0, dtype=dtype))
+ # grad norm of this would become infinite
+ self.param.grad = torch.tensor(1e23, dtype=dtype)
+
+ model = Model()
+ norms = grad_norm(model, 2)
+ assert all(torch.isfinite(torch.tensor(v)) for v in norms.values()), norms
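A minimal usage sketch of `grad_norm` after the fix; the toy model is hypothetical:

    import torch
    from torch import nn

    from pytorch_lightning.utilities.grads import grad_norm

    model = nn.Linear(4, 2).double()
    model(torch.randn(8, 4, dtype=torch.double)).sum().backward()

    # per-parameter norms plus a "*_norm_total" entry, rounded to 4 decimals;
    # intermediate values now stay in the parameters' dtype (float64 here)
    norms = grad_norm(model, norm_type=2)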
From 61a9f3a9bc19272ed8117a9e4dd25bd9c0608105 Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Mon, 8 Aug 2022 12:53:57 +0000
Subject: [PATCH 09/59] Update tqdm requirement from <=4.63.0,>=4.57.0 to
>=4.57.0,<4.65.0 in /requirements (#13875)
Update tqdm requirement in /requirements
Updates the requirements on [tqdm](https://github.com/tqdm/tqdm) to permit the latest version.
- [Release notes](https://github.com/tqdm/tqdm/releases)
- [Commits](https://github.com/tqdm/tqdm/compare/v4.57.0...v4.64.0)
---
updated-dependencies:
- dependency-name: tqdm
dependency-type: direct:production
...
Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Jirka Borovec
---
requirements/pytorch/base.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt
index e8743b18c73b0..49e2243319206 100644
--- a/requirements/pytorch/base.txt
+++ b/requirements/pytorch/base.txt
@@ -3,7 +3,7 @@
numpy>=1.17.2, <1.23.1
torch>=1.9.*, <=1.12.0
-tqdm>=4.57.0, <=4.63.0
+tqdm>=4.57.0, <4.65.0
PyYAML>=5.4, <=6.0
fsspec[http]>=2021.05.0, !=2021.06.0, <2022.6.0
tensorboard>=2.9.1, <2.10.0
From 890156a0163668149a47943907694c40cad153d3 Mon Sep 17 00:00:00 2001
From: JongMok Lee
Date: Mon, 8 Aug 2022 22:16:56 +0900
Subject: [PATCH 10/59] Fix mypy errors in
`pytorch_lightning/strategies/ddp.py` (#13885)
Co-authored-by: awaelchli
---
pyproject.toml | 1 -
.../overrides/distributed.py | 2 -
src/pytorch_lightning/strategies/ddp.py | 69 +++++++++++++------
src/pytorch_lightning/strategies/ddp_spawn.py | 3 +-
src/pytorch_lightning/strategies/deepspeed.py | 4 +-
5 files changed, 51 insertions(+), 28 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml
index 2f0e290440f44..761c7be04cc0e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -59,7 +59,6 @@ module = [
"pytorch_lightning.profilers.base",
"pytorch_lightning.profilers.pytorch",
"pytorch_lightning.profilers.simple",
- "pytorch_lightning.strategies.ddp",
"pytorch_lightning.strategies.sharded",
"pytorch_lightning.strategies.sharded_spawn",
"pytorch_lightning.trainer.callback_hook",
diff --git a/src/pytorch_lightning/overrides/distributed.py b/src/pytorch_lightning/overrides/distributed.py
index f09a7b9e3ae08..929d1ed486f4a 100644
--- a/src/pytorch_lightning/overrides/distributed.py
+++ b/src/pytorch_lightning/overrides/distributed.py
@@ -45,8 +45,6 @@ def _find_tensors(
# https://github.com/pytorch/pytorch/blob/v1.7.1/torch/nn/parallel/distributed.py#L626-L638
def prepare_for_backward(model: DistributedDataParallel, output: Any) -> None:
# `prepare_for_backward` is `DistributedDataParallel` specific.
- if not isinstance(model, DistributedDataParallel):
- return
if torch.is_grad_enabled() and model.require_backward_grad_sync:
model.require_forward_param_sync = True # type: ignore[assignment]
# We'll return the output object verbatim since it is a freeform
diff --git a/src/pytorch_lightning/strategies/ddp.py b/src/pytorch_lightning/strategies/ddp.py
index 922730df35269..57ab3a151b011 100644
--- a/src/pytorch_lightning/strategies/ddp.py
+++ b/src/pytorch_lightning/strategies/ddp.py
@@ -32,6 +32,7 @@
import pytorch_lightning as pl
from pytorch_lightning.core.optimizer import LightningOptimizer
from pytorch_lightning.overrides import LightningDistributedModule
+from pytorch_lightning.overrides.base import _LightningPrecisionModuleWrapperBase
from pytorch_lightning.overrides.distributed import prepare_for_backward
from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE
from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment
@@ -39,6 +40,7 @@
from pytorch_lightning.plugins.precision import PrecisionPlugin
from pytorch_lightning.strategies.launchers.subprocess_script import _SubprocessScriptLauncher
from pytorch_lightning.strategies.parallel import ParallelStrategy
+from pytorch_lightning.strategies.strategy import TBroadcast
from pytorch_lightning.trainer.states import TrainerFn
from pytorch_lightning.utilities.distributed import (
_get_process_group_backend_from_env,
@@ -57,7 +59,7 @@
from pytorch_lightning.utilities.optimizer import optimizers_to_device
from pytorch_lightning.utilities.rank_zero import rank_zero_info, rank_zero_only, rank_zero_warn
from pytorch_lightning.utilities.seed import reset_seed
-from pytorch_lightning.utilities.types import STEP_OUTPUT
+from pytorch_lightning.utilities.types import PredictStep, STEP_OUTPUT, TestStep, ValidationStep
if _FAIRSCALE_AVAILABLE:
from fairscale.optim import OSS
@@ -83,12 +85,12 @@ def __init__(
checkpoint_io: Optional[CheckpointIO] = None,
precision_plugin: Optional[PrecisionPlugin] = None,
ddp_comm_state: Optional[object] = None,
- ddp_comm_hook: Optional[callable] = None,
- ddp_comm_wrapper: Optional[callable] = None,
+ ddp_comm_hook: Optional[Callable] = None,
+ ddp_comm_wrapper: Optional[Callable] = None,
model_averaging_period: Optional[int] = None,
process_group_backend: Optional[str] = None,
timeout: Optional[timedelta] = default_pg_timeout,
- **kwargs: Union[Any, Dict[str, Any]],
+ **kwargs: Any,
) -> None:
super().__init__(
accelerator=accelerator,
@@ -105,7 +107,7 @@ def __init__(
self._ddp_comm_wrapper = ddp_comm_wrapper
self._model_averaging_period = model_averaging_period
self._model_averager: Optional[ModelAverager] = None
- self._pids: Optional[List[int]] = None
+ self._pids: List[int] = []
self._sync_dir: Optional[str] = None
self._rank_0_will_call_children_scripts: bool = False
self._process_group_backend: Optional[str] = process_group_backend
@@ -117,6 +119,7 @@ def is_distributed(self) -> bool:
@property
def root_device(self) -> torch.device:
+ assert self.parallel_devices is not None
return self.parallel_devices[self.local_rank]
@property
@@ -129,11 +132,11 @@ def num_nodes(self, num_nodes: int) -> None:
self._num_nodes = num_nodes
@property
- def num_processes(self):
+ def num_processes(self) -> int:
return len(self.parallel_devices) if self.parallel_devices is not None else 0
@property
- def distributed_sampler_kwargs(self):
+ def distributed_sampler_kwargs(self) -> Dict[str, Any]:
distributed_sampler_kwargs = dict(num_replicas=(self.num_nodes * self.num_processes), rank=self.global_rank)
return distributed_sampler_kwargs
@@ -146,6 +149,7 @@ def process_group_backend(self) -> Optional[str]:
return self._process_group_backend
def _configure_launcher(self) -> None:
+ assert self.cluster_environment is not None
if not self.cluster_environment.creates_processes_externally:
self._launcher = _SubprocessScriptLauncher(self.cluster_environment, self.num_processes, self.num_nodes)
self._rank_0_will_call_children_scripts = True
@@ -156,10 +160,11 @@ def setup_environment(self) -> None:
def setup(self, trainer: "pl.Trainer") -> None:
# share ddp pids to all processes
- self._rank_0_will_call_children_scripts = self.broadcast(self._rank_0_will_call_children_scripts)
+ self._rank_0_will_call_children_scripts = bool(self.broadcast(self._rank_0_will_call_children_scripts))
if self._should_run_deadlock_detection():
self._share_information_to_prevent_deadlock()
+ assert self.accelerator is not None
self.accelerator.setup(trainer)
# move the model to the correct device
@@ -170,6 +175,7 @@ def setup(self, trainer: "pl.Trainer") -> None:
if trainer_fn == TrainerFn.FITTING:
if self._layer_sync:
+ assert self.model is not None
self.model = self._layer_sync.apply(self.model)
self.setup_precision_plugin()
@@ -193,7 +199,7 @@ def _setup_model(self, model: Module) -> DistributedDataParallel:
log.detail(f"setting up DDP model with device ids: {device_ids}, kwargs: {self._ddp_kwargs}")
return DistributedDataParallel(module=model, device_ids=device_ids, **self._ddp_kwargs)
- def setup_distributed(self):
+ def setup_distributed(self) -> None:
log.detail(f"{self.__class__.__name__}: setting up distributed...")
reset_seed()
@@ -204,6 +210,7 @@ def setup_distributed(self):
rank_zero_only.rank = self.global_rank
self._process_group_backend = self._get_process_group_backend()
+ assert self.cluster_environment is not None
init_dist_connection(self.cluster_environment, self._process_group_backend, timeout=self._timeout)
def _get_process_group_backend(self) -> str:
@@ -230,6 +237,7 @@ def pre_configure_ddp(self) -> None:
def _register_ddp_hooks(self) -> None:
log.detail(f"{self.__class__.__name__}: registering ddp hooks")
if self.root_device.type == "cuda" and self._is_single_process_single_device:
+ assert isinstance(self.model, DistributedDataParallel)
register_ddp_comm_hook(
model=self.model,
ddp_comm_state=self._ddp_comm_state,
@@ -262,6 +270,7 @@ def _enable_model_averaging(self) -> None:
f"{optimizer.__class__.__name__}."
)
+ assert self._ddp_comm_state is not None
self._model_averager = torch.distributed.algorithms.model_averaging.averagers.PeriodicModelAverager(
period=self._model_averaging_period, warmup_steps=self._ddp_comm_state.start_localSGD_iter
)
@@ -296,15 +305,16 @@ def optimizer_step(
def configure_ddp(self) -> None:
log.detail(f"{self.__class__.__name__}: configuring DistributedDataParallel")
self.pre_configure_ddp()
+ assert isinstance(self.model, (pl.LightningModule, _LightningPrecisionModuleWrapperBase))
self.model = self._setup_model(LightningDistributedModule(self.model))
self._register_ddp_hooks()
- def determine_ddp_device_ids(self):
+ def determine_ddp_device_ids(self) -> Optional[List[int]]:
if self.root_device.type == "cpu":
return None
return [self.root_device.index]
- def barrier(self, *args, **kwargs) -> None:
+ def barrier(self, *args: Any, **kwargs: Any) -> None:
if not distributed_available():
return
if torch.distributed.get_backend() == "nccl":
@@ -312,23 +322,29 @@ def barrier(self, *args, **kwargs) -> None:
else:
torch.distributed.barrier()
- def broadcast(self, obj: object, src: int = 0) -> object:
+ def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast:
obj = [obj]
if self.global_rank != src:
- obj = [None]
+ obj = [None] # type: ignore[list-item]
torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD)
return obj[0]
def pre_backward(self, closure_loss: Tensor) -> None:
"""Run before precision plugin executes backward."""
+ if not isinstance(self.model, DistributedDataParallel):
+ return
+ assert self.lightning_module is not None
if not self.lightning_module.automatic_optimization:
prepare_for_backward(self.model, closure_loss)
- def model_to_device(self):
+ def model_to_device(self) -> None:
log.detail(f"{self.__class__.__name__}: moving model to device [{self.root_device}]...")
+ assert self.model is not None
self.model.to(self.root_device)
- def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Union[ReduceOp, str] = "mean") -> Tensor:
+ def reduce(
+ self, tensor: Tensor, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = "mean"
+ ) -> Tensor:
"""Reduces a tensor from several distributed processes to one aggregated tensor.
Args:
@@ -344,30 +360,38 @@ def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Union[ReduceOp,
tensor = sync_ddp_if_available(tensor, group, reduce_op=reduce_op)
return tensor
- def training_step(self, *args, **kwargs) -> STEP_OUTPUT:
+ def training_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT:
+ assert self.model is not None
with self.precision_plugin.train_step_context():
return self.model(*args, **kwargs)
- def validation_step(self, *args, **kwargs) -> Optional[STEP_OUTPUT]:
+ def validation_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]:
with self.precision_plugin.val_step_context():
+ assert self.lightning_module is not None
+ assert self.model is not None
if self.lightning_module.trainer.state.fn == TrainerFn.FITTING:
# used when calling `trainer.fit`
return self.model(*args, **kwargs)
else:
# used when calling `trainer.validate`
+ assert isinstance(self.model, ValidationStep)
return self.model.validation_step(*args, **kwargs)
- def test_step(self, *args, **kwargs) -> Optional[STEP_OUTPUT]:
+ def test_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]:
with self.precision_plugin.test_step_context():
+ assert isinstance(self.model, TestStep)
return self.model.test_step(*args, **kwargs)
- def predict_step(self, *args, **kwargs) -> STEP_OUTPUT:
+ def predict_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT:
with self.precision_plugin.predict_step_context():
+ assert isinstance(self.model, PredictStep)
return self.model.predict_step(*args, **kwargs)
- def post_training_step(self):
+ def post_training_step(self) -> None:
+ assert self.lightning_module is not None
if not self.lightning_module.automatic_optimization:
- self.model.require_backward_grad_sync = True
+ assert self.model is not None
+ self.model.require_backward_grad_sync = True # type: ignore[assignment]
@classmethod
def register_strategies(cls, strategy_registry: Dict) -> None:
@@ -458,7 +482,7 @@ def teardown(self) -> None:
if (
_TORCH_GREATER_EQUAL_1_11
and not self.model.static_graph
- and self.model._get_ddp_logging_data().get("can_set_static_graph")
+ and self.model._get_ddp_logging_data().get("can_set_static_graph") # type: ignore[operator]
):
rank_zero_info(
"Your model can run with static graph optimizations. For future training runs, we suggest you"
@@ -475,6 +499,7 @@ def teardown(self) -> None:
and pl_module._trainer.state.fn == TrainerFn.FITTING
and self._layer_sync
):
+ assert self.model is not None
self.model = self._layer_sync.revert(self.model)
super().teardown()
diff --git a/src/pytorch_lightning/strategies/ddp_spawn.py b/src/pytorch_lightning/strategies/ddp_spawn.py
index 30bcef457c44a..21602e60a5754 100644
--- a/src/pytorch_lightning/strategies/ddp_spawn.py
+++ b/src/pytorch_lightning/strategies/ddp_spawn.py
@@ -254,9 +254,10 @@ def model_to_device(self) -> None:
def pre_backward(self, closure_loss: Tensor) -> None:
"""Run before precision plugin executes backward."""
+ if not isinstance(self.model, DistributedDataParallel):
+ return
assert self.lightning_module is not None
if not self.lightning_module.automatic_optimization:
- assert isinstance(self.model, DistributedDataParallel)
prepare_for_backward(self.model, closure_loss)
def reduce(
diff --git a/src/pytorch_lightning/strategies/deepspeed.py b/src/pytorch_lightning/strategies/deepspeed.py
index b0b55374ba1a9..3c31aeb7a7657 100644
--- a/src/pytorch_lightning/strategies/deepspeed.py
+++ b/src/pytorch_lightning/strategies/deepspeed.py
@@ -19,7 +19,7 @@
import platform
from collections import OrderedDict
from pathlib import Path
-from typing import Any, cast, Dict, Generator, List, Mapping, Optional, Tuple, Union
+from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple, Union
import torch
from torch import Tensor
@@ -831,7 +831,7 @@ def load_checkpoint(self, checkpoint_path: _PATH) -> Dict[str, Any]:
if self.load_full_weights and self.zero_stage_3:
# Broadcast to ensure we load from the rank 0 checkpoint
# This doesn't have to be the case when using deepspeed sharded checkpointing
- checkpoint_path = cast(_PATH, self.broadcast(checkpoint_path))
+ checkpoint_path = self.broadcast(checkpoint_path)
return super().load_checkpoint(checkpoint_path)
# Rely on deepspeed to load the checkpoint and necessary information
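For context, the `broadcast` override above reduces to the following pattern; a minimal sketch that assumes an already-initialized process group:

    import torch.distributed as dist

    def broadcast_obj(obj, src: int = 0):
        # every rank allocates a one-slot buffer; only the source rank fills it
        buf = [obj if dist.get_rank() == src else None]
        dist.broadcast_object_list(buf, src)
        return buf[0]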
From 7439f5d7491a87fbbb33f47ac18fa4ff8c7eeb23 Mon Sep 17 00:00:00 2001
From: Sean Naren
Date: Mon, 8 Aug 2022 14:23:07 +0100
Subject: [PATCH 11/59] Update CODEOWNERS (remove myself from defaults + some
specifics) (#14084)
Update CODEOWNERS
---
.github/CODEOWNERS | 14 +++++++-------
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index e40828557c2cf..05f7e91104589 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -5,7 +5,7 @@
# the repo. Unless a later match takes precedence,
# @global-owner1 and @global-owner2 will be requested for
# review when someone opens a pull request.
-* @williamfalcon @borda @tchaton @SeanNaren @carmocca @awaelchli @justusschock @kaushikb11 @rohitgr7
+* @williamfalcon @borda @tchaton @carmocca @awaelchli @justusschock @kaushikb11 @rohitgr7
# CI/CD and configs
/.github/ @borda @carmocca @akihironitta @tchaton
@@ -28,22 +28,22 @@
# Packages
/src/pytorch_lightning/accelerators @williamfalcon @tchaton @SeanNaren @awaelchli @justusschock @kaushikb11
/src/pytorch_lightning/callbacks @williamfalcon @tchaton @carmocca @borda @kaushikb11
-/src/pytorch_lightning/core @tchaton @SeanNaren @borda @carmocca @justusschock @kaushikb11
+/src/pytorch_lightning/core @tchaton @borda @carmocca @justusschock @kaushikb11
/src/pytorch_lightning/distributed @williamfalcon @tchaton @awaelchli @kaushikb11
/src/pytorch_lightning/lite @tchaton @awaelchli @carmocca
/src/pytorch_lightning/loggers @tchaton @awaelchli @borda
/src/pytorch_lightning/loggers/wandb.py @borisdayma
/src/pytorch_lightning/loggers/neptune.py @shnela @HubertJaworski @pkasprzyk @pitercl @Raalsky @aniezurawski @kamil-kaczmarek
/src/pytorch_lightning/loops @tchaton @awaelchli @justusschock @carmocca
-/src/pytorch_lightning/overrides @tchaton @SeanNaren @borda
-/src/pytorch_lightning/plugins @tchaton @SeanNaren @awaelchli @justusschock
+/src/pytorch_lightning/overrides @tchaton @borda
+/src/pytorch_lightning/plugins @tchaton @awaelchli @justusschock
/src/pytorch_lightning/profilers @williamfalcon @tchaton @borda @carmocca
/src/pytorch_lightning/profilers/pytorch.py @nbcsm @guotuofeng
/src/pytorch_lightning/strategies @tchaton @SeanNaren @awaelchli @justusschock @kaushikb11
-/src/pytorch_lightning/trainer @williamfalcon @borda @tchaton @SeanNaren @carmocca @awaelchli @justusschock @kaushikb11
-/src/pytorch_lightning/trainer/connectors @tchaton @SeanNaren @carmocca @borda
+/src/pytorch_lightning/trainer @williamfalcon @borda @tchaton @carmocca @awaelchli @justusschock @kaushikb11
+/src/pytorch_lightning/trainer/connectors @tchaton @carmocca @borda
/src/pytorch_lightning/tuner @SkafteNicki @borda @awaelchli
-/src/pytorch_lightning/utilities @borda @tchaton @SeanNaren @carmocca
+/src/pytorch_lightning/utilities @borda @tchaton @carmocca
/src/lightning_app @tchaton @awaelchli @manskx @hhsecond
From 55ae812dbf11f6568c73d5743aef0745715fb9fd Mon Sep 17 00:00:00 2001
From: thomas chaton
Date: Mon, 8 Aug 2022 15:48:50 +0200
Subject: [PATCH 12/59] Resolve increased time. (#14074)
---
src/lightning_app/CHANGELOG.md | 2 ++
src/lightning_app/utilities/proxies.py | 2 +-
tests/tests_app/core/test_lightning_app.py | 1 +
3 files changed, 4 insertions(+), 1 deletion(-)
diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md
index 07927a1b01f87..78a4e370e76ee 100644
--- a/src/lightning_app/CHANGELOG.md
+++ b/src/lightning_app/CHANGELOG.md
@@ -27,3 +27,5 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
### Fixed
- Resolved a bug where the work statuses would grow quickly and be duplicated ([#13970](https://github.com/Lightning-AI/lightning/pull/13970))
+
+- Resolved a race condition when sending the work state through the `caller_queue` ([#14074](https://github.com/Lightning-AI/lightning/pull/14074))
diff --git a/src/lightning_app/utilities/proxies.py b/src/lightning_app/utilities/proxies.py
index 2c93a6c89f38c..99ad6e2aad0cf 100644
--- a/src/lightning_app/utilities/proxies.py
+++ b/src/lightning_app/utilities/proxies.py
@@ -74,7 +74,7 @@ def _send_data_to_caller_queue(work: "LightningWork", caller_queue: "BaseQueue",
data.update({"state": work_state})
logger.debug(f"Sending to {work.name}: {data}")
- caller_queue.put(data)
+ caller_queue.put(deepcopy(data))
# Reset the calls entry.
work_state["calls"] = calls
diff --git a/tests/tests_app/core/test_lightning_app.py b/tests/tests_app/core/test_lightning_app.py
index a3a15085b98e3..e6c715f87ef03 100644
--- a/tests/tests_app/core/test_lightning_app.py
+++ b/tests/tests_app/core/test_lightning_app.py
@@ -896,6 +896,7 @@ def __init__(self, **kwargs):
def run(self, signal: int):
self.counter += 1
+ assert len(self._calls) == 2
class SizeFlow(LightningFlow):
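Why the `deepcopy` matters: the producer keeps mutating the same dict after `put`, so without a snapshot the consumer can observe a half-updated state. A minimal sketch of the race using a plain queue:

    from copy import deepcopy
    from queue import Queue

    q: Queue = Queue()
    state = {"calls": {"latest": 1}}

    q.put(deepcopy(state))            # snapshot at send time
    state["calls"]["latest"] = 2      # later mutation no longer leaks into the queue

    assert q.get()["calls"]["latest"] == 1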
From 34afde742ebe3acb56f5d0f14cd79d589d9771e0 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta
Date: Tue, 9 Aug 2022 00:00:46 +0900
Subject: [PATCH 13/59] CI: Enable Python 3.10 in full CPU testing (#13829)
* Update docker images to build
---
.github/workflows/README.md | 2 +-
.github/workflows/ci-pytorch_test-full.yml | 8 ++++++--
.github/workflows/ci-pytorch_test-slow.yml | 2 +-
3 files changed, 8 insertions(+), 4 deletions(-)
diff --git a/.github/workflows/README.md b/.github/workflows/README.md
index 8b9e7d173b03c..f559551e1237f 100644
--- a/.github/workflows/README.md
+++ b/.github/workflows/README.md
@@ -6,7 +6,7 @@
| workflow name | workflow file | action | accelerator\* | (Python, PyTorch) | OS |
| -------------------------- | ----------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | ------------------------------------------------ | ------------------- |
-| Test full | .github/workflows/ci_test-full.yml | Run all tests except for accelerator-specific, standalone and slow tests. | CPU | (3.7, 1.8), (3.7, 1.11), (3.9, 1.8), (3.9, 1.12) | linux, mac, windows |
+| Test full | .github/workflows/ci_test-full.yml | Run all tests except for accelerator-specific, standalone and slow tests. | CPU | (3.7, 1.9), (3.7, 1.12), (3.10, 1.12) | linux, mac, windows |
| Test with Conda | .github/workflows/ci_test-conda.yml | Same as ci_test-full.yml but with dependencies installed with conda. | CPU | (3.8, 1.8), (3.8, 1.9), (3.8, 1.10), (3.9, 1.12) | linux |
| Test slow | .github/workflows/ci_test-slow.yml | Run only slow tests. Slow tests usually need to spawn threads and cannot be speed up or simplified. | CPU | (3.7, 1.8) | linux, mac, windows |
| pytorch-lightning (IPUs) | .azure-pipelines/ipu-tests.yml | Run only IPU-specific tests. | IPU | (3.8, 1.9) | linux |
diff --git a/.github/workflows/ci-pytorch_test-full.yml b/.github/workflows/ci-pytorch_test-full.yml
index fb6916d1414fe..445707d340c4b 100644
--- a/.github/workflows/ci-pytorch_test-full.yml
+++ b/.github/workflows/ci-pytorch_test-full.yml
@@ -21,9 +21,13 @@ jobs:
fail-fast: false
matrix:
os: [ubuntu-20.04, windows-2019, macOS-11]
- python-version: ["3.7", "3.9"] # minimum, maximum
+ python-version: ["3.7", "3.10"] # minimum, maximum
requires: ["oldest", "latest"]
release: ["stable"]
+ exclude:
+ # There's no distribution of the oldest PyTorch 1.9 for Python 3.10.
+ # TODO: Remove the exclusion when dropping PyTorch 1.9 support.
+ - {python-version: "3.10", requires: "oldest"}
# TODO: re-enable RC testing
# include:
# - {os: ubuntu-20.04, python-version: "3.10", requires: "latest", release: "pre"}
@@ -41,7 +45,7 @@ jobs:
id: skip
shell: bash -l {0}
run: |
- FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*'
+ FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.github/workflows/ci-pytorch_test-full.yml'
echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt
MATCHES=$(cat changed_files.txt | grep -E $FILTER)
echo $MATCHES
diff --git a/.github/workflows/ci-pytorch_test-slow.yml b/.github/workflows/ci-pytorch_test-slow.yml
index 905f60aa85699..b3756bbe8c2f7 100644
--- a/.github/workflows/ci-pytorch_test-slow.yml
+++ b/.github/workflows/ci-pytorch_test-slow.yml
@@ -36,7 +36,7 @@ jobs:
id: skip
shell: bash -l {0}
run: |
- FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*'
+ FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.github/workflows/ci-pytorch_test-slow.yml'
echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt
MATCHES=$(cat changed_files.txt | grep -E $FILTER)
echo $MATCHES
From 82d2d1d85746c7743cca47e760422d3e13af6a6f Mon Sep 17 00:00:00 2001
From: Justin Goheen <26209687+JustinGoheen@users.noreply.github.com>
Date: Mon, 8 Aug 2022 16:21:26 -0400
Subject: [PATCH 14/59] Fix mypy errors attributed to
`pytorch_lightning.core.saving` (#13932)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Co-authored-by: Adrian Wälchli
Co-authored-by: otaj <6065855+otaj@users.noreply.github.com>
Co-authored-by: Rohit Gupta
---
pyproject.toml | 1 -
src/pytorch_lightning/core/saving.py | 39 +++++++++++----------
src/pytorch_lightning/utilities/cloud_io.py | 6 ++--
src/pytorch_lightning/utilities/parsing.py | 4 ++-
src/pytorch_lightning/utilities/types.py | 3 +-
5 files changed, 29 insertions(+), 24 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml
index 761c7be04cc0e..8db782df357d8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -53,7 +53,6 @@ module = [
"pytorch_lightning.callbacks.quantization",
"pytorch_lightning.core.datamodule",
"pytorch_lightning.core.module",
- "pytorch_lightning.core.saving",
"pytorch_lightning.demos.boring_classes",
"pytorch_lightning.demos.mnist_datamodule",
"pytorch_lightning.profilers.base",
diff --git a/src/pytorch_lightning/core/saving.py b/src/pytorch_lightning/core/saving.py
index da81e4c212560..ffdc0988a1a6e 100644
--- a/src/pytorch_lightning/core/saving.py
+++ b/src/pytorch_lightning/core/saving.py
@@ -20,10 +20,9 @@
from argparse import Namespace
from copy import deepcopy
from enum import Enum
-from typing import Any, Callable, Dict, IO, MutableMapping, Optional, Union
+from typing import Any, Callable, cast, Dict, IO, MutableMapping, Optional, Type, Union
from warnings import warn
-import torch
import yaml
import pytorch_lightning as pl
@@ -34,7 +33,7 @@
from pytorch_lightning.utilities.migration import pl_legacy_patch
from pytorch_lightning.utilities.parsing import parse_class_init_keys
from pytorch_lightning.utilities.rank_zero import rank_zero_warn
-from pytorch_lightning.utilities.types import _PATH
+from pytorch_lightning.utilities.types import _MAP_LOCATION_TYPE, _PATH
log = logging.getLogger(__name__)
PRIMITIVE_TYPES = (bool, int, float, str)
@@ -58,11 +57,11 @@ class ModelIO:
def load_from_checkpoint(
cls,
checkpoint_path: Union[str, IO],
- map_location: Optional[Union[Dict[str, str], str, torch.device, int, Callable]] = None,
+ map_location: _MAP_LOCATION_TYPE = None,
hparams_file: Optional[str] = None,
strict: bool = True,
- **kwargs,
- ):
+ **kwargs: Any,
+ ) -> Union["pl.LightningModule", "pl.LightningDataModule"]:
r"""
Primary way of loading a model from a checkpoint. When Lightning saves a checkpoint
it stores the arguments passed to ``__init__`` in the checkpoint under ``"hyper_parameters"``.
@@ -171,15 +170,15 @@ def on_hpc_load(self, checkpoint: Dict[str, Any]) -> None:
def _load_from_checkpoint(
- cls: Union["pl.LightningModule", "pl.LightningDataModule"],
+ cls: Union[Type["ModelIO"], Type["pl.LightningModule"], Type["pl.LightningDataModule"]],
checkpoint_path: Union[str, IO],
- map_location: Optional[Union[Dict[str, str], str, torch.device, int, Callable]] = None,
+ map_location: _MAP_LOCATION_TYPE = None,
hparams_file: Optional[str] = None,
- strict: Optional[bool] = None,
+ strict: bool = True,
**kwargs: Any,
-) -> Any:
+) -> Union["pl.LightningModule", "pl.LightningDataModule"]:
if map_location is None:
- map_location = lambda storage, loc: storage
+ map_location = cast(_MAP_LOCATION_TYPE, lambda storage, loc: storage)
with pl_legacy_patch():
checkpoint = pl_load(checkpoint_path, map_location=map_location)
@@ -202,15 +201,18 @@ def _load_from_checkpoint(
if issubclass(cls, pl.LightningDataModule):
return _load_state(cls, checkpoint, **kwargs)
- return _load_state(cls, checkpoint, strict=strict, **kwargs)
+ # allow cls to be evaluated as a subclassed LightningModule,
+ # or as LightningModule itself for internal tests
+ if issubclass(cls, pl.LightningModule):
+ return _load_state(cls, checkpoint, strict=strict, **kwargs)
def _load_state(
- cls: Union["pl.LightningModule", "pl.LightningDataModule"],
+ cls: Union[Type["pl.LightningModule"], Type["pl.LightningDataModule"]],
checkpoint: Dict[str, Any],
- strict: Optional[bool] = None,
+ strict: bool = True,
**cls_kwargs_new: Any,
-) -> Any:
+) -> Union["pl.LightningModule", "pl.LightningDataModule"]:
cls_spec = inspect.getfullargspec(cls.__init__)
cls_init_args_name = inspect.signature(cls.__init__).parameters.keys()
@@ -228,8 +230,7 @@ def _load_state(
cls_kwargs_loaded.update(checkpoint.get(_old_hparam_key, {}))
# 2. Try to restore model hparams from checkpoint using the new key
- _new_hparam_key = cls.CHECKPOINT_HYPER_PARAMS_KEY
- cls_kwargs_loaded.update(checkpoint.get(_new_hparam_key))
+ cls_kwargs_loaded.update(checkpoint.get(cls.CHECKPOINT_HYPER_PARAMS_KEY, {}))
# 3. Ensure that `cls_kwargs_old` has the right type, back compatibility between dict and Namespace
cls_kwargs_loaded = _convert_loaded_hparams(cls_kwargs_loaded, checkpoint.get(cls.CHECKPOINT_HYPER_PARAMS_TYPE))
@@ -271,7 +272,9 @@ def _load_state(
return obj
-def _convert_loaded_hparams(model_args: dict, hparams_type: Optional[Union[Callable, str]] = None) -> object:
+def _convert_loaded_hparams(
+ model_args: Dict[str, Any], hparams_type: Optional[Union[Callable, str]] = None
+) -> Dict[str, Any]:
"""Convert hparams according given type in callable or string (past) format."""
# if not hparams type define
if not hparams_type:
diff --git a/src/pytorch_lightning/utilities/cloud_io.py b/src/pytorch_lightning/utilities/cloud_io.py
index ee3358be59541..99629bcda8980 100644
--- a/src/pytorch_lightning/utilities/cloud_io.py
+++ b/src/pytorch_lightning/utilities/cloud_io.py
@@ -15,19 +15,19 @@
import io
from pathlib import Path
-from typing import Any, Callable, Dict, IO, Optional, Union
+from typing import Any, Dict, IO, Union
import fsspec
import torch
from fsspec.core import url_to_fs
from fsspec.implementations.local import AbstractFileSystem
-from pytorch_lightning.utilities.types import _DEVICE, _PATH
+from pytorch_lightning.utilities.types import _MAP_LOCATION_TYPE, _PATH
def load(
path_or_url: Union[IO, _PATH],
- map_location: Optional[Union[_DEVICE, Callable[[_DEVICE], _DEVICE], Dict[_DEVICE, _DEVICE]]] = None,
+ map_location: _MAP_LOCATION_TYPE = None,
) -> Any:
"""Loads a checkpoint.
diff --git a/src/pytorch_lightning/utilities/parsing.py b/src/pytorch_lightning/utilities/parsing.py
index 9f5fe2d6b6841..81877f1dffba7 100644
--- a/src/pytorch_lightning/utilities/parsing.py
+++ b/src/pytorch_lightning/utilities/parsing.py
@@ -108,7 +108,9 @@ def clean_namespace(hparams: Union[Dict[str, Any], Namespace]) -> None:
del hparams_dict[k]
-def parse_class_init_keys(cls: Type["pl.LightningModule"]) -> Tuple[str, Optional[str], Optional[str]]:
+def parse_class_init_keys(
+ cls: Union[Type["pl.LightningModule"], Type["pl.LightningDataModule"]]
+) -> Tuple[str, Optional[str], Optional[str]]:
"""Parse key words for standard ``self``, ``*args`` and ``**kwargs``.
Examples:
diff --git a/src/pytorch_lightning/utilities/types.py b/src/pytorch_lightning/utilities/types.py
index f6c14d366805f..18e2db6feb6c6 100644
--- a/src/pytorch_lightning/utilities/types.py
+++ b/src/pytorch_lightning/utilities/types.py
@@ -19,7 +19,7 @@
from contextlib import contextmanager
from dataclasses import dataclass
from pathlib import Path
-from typing import Any, Dict, Generator, Iterator, List, Mapping, Optional, Sequence, Type, Union
+from typing import Any, Callable, Dict, Generator, Iterator, List, Mapping, Optional, Sequence, Type, Union
import torch
from torch import Tensor
@@ -49,6 +49,7 @@
]
EVAL_DATALOADERS = Union[DataLoader, Sequence[DataLoader]]
_DEVICE = Union[torch.device, str, int]
+_MAP_LOCATION_TYPE = Optional[Union[_DEVICE, Callable[[_DEVICE], _DEVICE], Dict[_DEVICE, _DEVICE]]]
@runtime_checkable
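For context, a minimal sketch (not part of the patch) of the `map_location` values the new `_MAP_LOCATION_TYPE` alias is meant to cover when loading a checkpoint; the checkpoint path is hypothetical:

```python
import torch
from pytorch_lightning.utilities.cloud_io import load

# each of these satisfies _MAP_LOCATION_TYPE
ckpt = load("example.ckpt", map_location=None)                           # keep storages on their saved devices
ckpt = load("example.ckpt", map_location="cpu")                          # a device string
ckpt = load("example.ckpt", map_location=torch.device("cpu"))            # a torch.device
ckpt = load("example.ckpt", map_location={"cuda:1": "cuda:0"})           # a device-remapping dict
ckpt = load("example.ckpt", map_location=lambda storage, loc: storage)   # a callable, as in _load_from_checkpoint
```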
From 0cfc53d6b423531f598c9cfb386b3febbb8eb333 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Tue, 9 Aug 2022 10:26:02 +0200
Subject: [PATCH 15/59] Fix regression on default value for
`find_unused_parameters` (#14095)
---
src/pytorch_lightning/CHANGELOG.md | 3 +++
src/pytorch_lightning/strategies/ddp_spawn.py | 14 ++++++++++++--
tests/tests_pytorch/strategies/test_ddp.py | 12 ++++++++++++
.../strategies/test_ddp_spawn_strategy.py | 16 ++++++++++++++++
.../strategies/test_sharded_strategy.py | 14 ++++++++++++++
5 files changed, 57 insertions(+), 2 deletions(-)
diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
index 915436e5a0bcf..04eddf2c735f4 100644
--- a/src/pytorch_lightning/CHANGELOG.md
+++ b/src/pytorch_lightning/CHANGELOG.md
@@ -70,6 +70,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed dtype inference during gradient norm computation ([#14051](https://github.com/Lightning-AI/lightning/pull/14051))
- Fixed a bug that caused `ddp_find_unused_parameters` to be set to `False`, whereas the intended default is `True` ([#14095](https://github.com/Lightning-AI/lightning/pull/14095))
+
+
## [1.7.0] - 2022-08-02
### Added
diff --git a/src/pytorch_lightning/strategies/ddp_spawn.py b/src/pytorch_lightning/strategies/ddp_spawn.py
index 21602e60a5754..de34320f54093 100644
--- a/src/pytorch_lightning/strategies/ddp_spawn.py
+++ b/src/pytorch_lightning/strategies/ddp_spawn.py
@@ -315,10 +315,20 @@ def post_training_step(self) -> None:
def register_strategies(cls, strategy_registry: Dict) -> None:
entries = (
("ddp_spawn", "spawn"),
- ("ddp_spawn_find_unused_parameters_false", "spawn"),
("ddp_fork", "fork"),
- ("ddp_fork_find_unused_parameters_false", "fork"),
("ddp_notebook", "fork"),
+ )
+ for name, start_method in entries:
+ strategy_registry.register(
+ name,
+ cls,
+ description=f"DDP strategy with `start_method` '{start_method}'",
+ start_method=start_method,
+ )
+
+ entries = (
+ ("ddp_spawn_find_unused_parameters_false", "spawn"),
+ ("ddp_fork_find_unused_parameters_false", "fork"),
("ddp_notebook_find_unused_parameters_false", "fork"),
)
for name, start_method in entries:
diff --git a/tests/tests_pytorch/strategies/test_ddp.py b/tests/tests_pytorch/strategies/test_ddp.py
index 4610f6153386b..1a2a0475e7ed6 100644
--- a/tests/tests_pytorch/strategies/test_ddp.py
+++ b/tests/tests_pytorch/strategies/test_ddp.py
@@ -194,3 +194,15 @@ def root_device(self):
assert strategy._get_process_group_backend() == expected_process_group_backend
else:
assert strategy._get_process_group_backend() == expected_process_group_backend
+
+
+@pytest.mark.parametrize(
+ "strategy_name,expected_ddp_kwargs",
+ [
+ ("ddp", {}),
+ ("ddp_find_unused_parameters_false", {"find_unused_parameters": False}),
+ ],
+)
+def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs):
+ trainer = Trainer(strategy=strategy_name)
+ assert trainer.strategy._ddp_kwargs == expected_ddp_kwargs
diff --git a/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py b/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py
index 52427c2c8cc3a..f485060833320 100644
--- a/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py
+++ b/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py
@@ -178,3 +178,19 @@ def test_ddp_spawn_strategy_set_timeout(mock_init_process_group):
mock_init_process_group.assert_called_with(
process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta
)
+
+
+@pytest.mark.parametrize(
+ "strategy_name,expected_ddp_kwargs",
+ [
+ ("ddp_spawn", {}),
+ ("ddp_fork", {}),
+ ("ddp_notebook", {}),
+ ("ddp_spawn_find_unused_parameters_false", {"find_unused_parameters": False}),
+ ("ddp_fork_find_unused_parameters_false", {"find_unused_parameters": False}),
+ ("ddp_notebook_find_unused_parameters_false", {"find_unused_parameters": False}),
+ ],
+)
+def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs):
+ trainer = Trainer(strategy=strategy_name)
+ assert trainer.strategy._ddp_kwargs == expected_ddp_kwargs
diff --git a/tests/tests_pytorch/strategies/test_sharded_strategy.py b/tests/tests_pytorch/strategies/test_sharded_strategy.py
index a047a10df32e3..ad0673ed1a5fa 100644
--- a/tests/tests_pytorch/strategies/test_sharded_strategy.py
+++ b/tests/tests_pytorch/strategies/test_sharded_strategy.py
@@ -300,3 +300,17 @@ def test_block_backward_sync():
with strategy.block_backward_sync():
pass
model.no_sync.assert_called_once()
+
+
+@pytest.mark.parametrize(
+ "strategy_name,expected_ddp_kwargs",
+ [
+ ("ddp_sharded", {}),
+ ("ddp_sharded_find_unused_parameters_false", {"find_unused_parameters": False}),
+ ("ddp_sharded_spawn", {}),
+ ("ddp_sharded_spawn_find_unused_parameters_false", {"find_unused_parameters": False}),
+ ],
+)
+def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs):
+ trainer = Trainer(strategy=strategy_name)
+ assert trainer.strategy._ddp_kwargs == expected_ddp_kwargs
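A short usage sketch of what the restored registrations mean in practice: the plain aliases no longer inject `find_unused_parameters=False`; only the explicit `*_find_unused_parameters_false` variants do.

```python
from pytorch_lightning import Trainer

# the plain alias leaves the DDP kwargs untouched again
trainer = Trainer(strategy="ddp_spawn")
assert trainer.strategy._ddp_kwargs == {}

# the explicit variant opts out of unused-parameter detection
trainer = Trainer(strategy="ddp_spawn_find_unused_parameters_false")
assert trainer.strategy._ddp_kwargs == {"find_unused_parameters": False}
```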
From d29a552b3c701ebc14d608347c1dbf55c3dfaa6a Mon Sep 17 00:00:00 2001
From: Robert S Lee
Date: Tue, 9 Aug 2022 04:27:08 -0400
Subject: [PATCH 16/59] Fix import in doctest example (#14067)
---
src/lightning_app/structures/dict.py | 2 +-
src/lightning_app/structures/list.py | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/src/lightning_app/structures/dict.py b/src/lightning_app/structures/dict.py
index 93e2b161b2e7a..b414269b93eec 100644
--- a/src/lightning_app/structures/dict.py
+++ b/src/lightning_app/structures/dict.py
@@ -22,7 +22,7 @@ def __init__(self, **kwargs: T):
.. doctest::
>>> from lightning_app import LightningFlow, LightningWork
- >>> from lightning_app.core import Dict
+ >>> from lightning_app.structures import Dict
>>> class CounterWork(LightningWork):
... def __init__(self):
... super().__init__()
diff --git a/src/lightning_app/structures/list.py b/src/lightning_app/structures/list.py
index f5a7c5c9913ad..cf691c98a8c38 100644
--- a/src/lightning_app/structures/list.py
+++ b/src/lightning_app/structures/list.py
@@ -24,7 +24,7 @@ def __init__(self, *items: T):
.. doctest::
>>> from lightning_app import LightningFlow, LightningWork
- >>> from lightning_app.core import List
+ >>> from lightning_app.structures import List
>>> class CounterWork(LightningWork):
... def __init__(self):
... super().__init__()
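The corrected doctest imports, shown outside the diff for clarity: the `Dict` and `List` structures live in `lightning_app.structures`, not `lightning_app.core`.

```python
from lightning_app import LightningFlow, LightningWork
from lightning_app.structures import Dict, List  # previously (and wrongly) imported from lightning_app.core
```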
From c55fe7105b4d00735d22147612434ae9aebee4ab Mon Sep 17 00:00:00 2001
From: Anton Shevtsov <32237302+MrShevan@users.noreply.github.com>
Date: Tue, 9 Aug 2022 16:40:30 +0300
Subject: [PATCH 17/59] Prefix seed_everything log messages with rank info
(#14031)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Co-authored-by: Anton Shevtsov
Co-authored-by: Rohit Gupta
Co-authored-by: Carlos Mocholí
---
src/pytorch_lightning/CHANGELOG.md | 2 +-
src/pytorch_lightning/utilities/seed.py | 6 ++----
tests/tests_pytorch/utilities/test_seed.py | 18 ++++++++++++++++++
3 files changed, 21 insertions(+), 5 deletions(-)
diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
index 04eddf2c735f4..4cea5685cac6f 100644
--- a/src/pytorch_lightning/CHANGELOG.md
+++ b/src/pytorch_lightning/CHANGELOG.md
@@ -8,7 +8,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
### Added
--
+- Added prefix to log message in `seed_everything` with rank info ([#13290](https://github.com/Lightning-AI/lightning/issues/13290))
-
diff --git a/src/pytorch_lightning/utilities/seed.py b/src/pytorch_lightning/utilities/seed.py
index 6648b5a56b2b1..8fce6a1debfcf 100644
--- a/src/pytorch_lightning/utilities/seed.py
+++ b/src/pytorch_lightning/utilities/seed.py
@@ -24,7 +24,7 @@
import numpy as np
import torch
-from pytorch_lightning.utilities.rank_zero import rank_zero_only, rank_zero_warn
+from pytorch_lightning.utilities.rank_zero import _get_rank, rank_zero_only, rank_zero_warn
log = logging.getLogger(__name__)
@@ -66,9 +66,7 @@ def seed_everything(seed: Optional[int] = None, workers: bool = False) -> int:
rank_zero_warn(f"{seed} is not in bounds, numpy accepts from {min_seed_value} to {max_seed_value}")
seed = _select_seed_randomly(min_seed_value, max_seed_value)
- # using `log.info` instead of `rank_zero_info`,
- # so users can verify the seed is properly set in distributed training.
- log.info(f"Global seed set to {seed}")
+ log.info(f"[rank: {_get_rank()}] Global seed set to {seed}")
os.environ["PL_GLOBAL_SEED"] = str(seed)
random.seed(seed)
np.random.seed(seed)
diff --git a/tests/tests_pytorch/utilities/test_seed.py b/tests/tests_pytorch/utilities/test_seed.py
index 7f162bd605640..6908badf1a037 100644
--- a/tests/tests_pytorch/utilities/test_seed.py
+++ b/tests/tests_pytorch/utilities/test_seed.py
@@ -1,6 +1,8 @@
import os
import random
+from typing import Mapping
from unittest import mock
+from unittest.mock import MagicMock
import numpy as np
import pytest
@@ -96,3 +98,19 @@ def test_isolate_rng():
with isolate_rng():
generated = [random.random() for _ in range(3)]
assert random.random() == generated[0]
+
+
+@mock.patch("pytorch_lightning.utilities.seed.log.info")
+@pytest.mark.parametrize("env_vars", [{"RANK": "0"}, {"RANK": "1"}, {"RANK": "4"}])
+def test_seed_everything_log_info(log_mock: MagicMock, env_vars: Mapping[str, str]):
+ """Test that log message prefix with correct rank info."""
+ with mock.patch.dict(os.environ, env_vars, clear=True):
+ from pytorch_lightning.utilities.rank_zero import _get_rank
+
+ rank = _get_rank()
+
+ seed_utils.seed_everything(123)
+
+ expected_log = f"[rank: {rank}] Global seed set to 123"
+
+ log_mock.assert_called_once_with(expected_log)
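A small sketch of the new log format; each process now identifies itself, which is why `log.info` is used instead of `rank_zero_info`:

```python
import logging
from pytorch_lightning import seed_everything

logging.basicConfig(level=logging.INFO)
seed_everything(123)
# emitted once per process, e.g. on a 2-process DDP run:
#   [rank: 0] Global seed set to 123
#   [rank: 1] Global seed set to 123
```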
From 9722127a741e9e108e49d9cffbc10a2842302c9c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Tue, 9 Aug 2022 16:03:36 +0200
Subject: [PATCH 18/59] Add missing codeowners for app package (#13542)
---
.github/CODEOWNERS | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 05f7e91104589..f83924b9566ce 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -45,7 +45,13 @@
/src/pytorch_lightning/tuner @SkafteNicki @borda @awaelchli
/src/pytorch_lightning/utilities @borda @tchaton @carmocca
-/src/lightning_app @tchaton @awaelchli @manskx @hhsecond
+/src/lightning_app @tchaton @manskx
+/src/lightning_app/cli/pl-app-template @awaelchli @tchaton @Borda
+/src/lightning_app/core @tchaton @awaelchli @manskx
+/src/lightning_app/core/queues.py @tchaton @hhsecond @manskx
+/src/lightning_app/runners/cloud.py @tchaton @hhsecond
+/src/lightning_app/testing @tchaton @manskx
+/src/lightning_app/__about__.py @nohalon @edenlightning @lantiga
# Examples
/examples/app_* @tchaton @awaelchli @manskx @hhsecond
From ac369f5570d0a492d08ac9c2ba6622e451d7e131 Mon Sep 17 00:00:00 2001
From: Rohit Gupta
Date: Tue, 9 Aug 2022 21:25:23 +0530
Subject: [PATCH 19/59] Fix incorrect `precision="mixed"` being used with
`DeepSpeedStrategy` and `IPUStrategy` (#14041)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Co-authored-by: Carlos Mocholí
---
src/pytorch_lightning/CHANGELOG.md | 3 +++
.../plugins/precision/deepspeed.py | 2 +-
src/pytorch_lightning/plugins/precision/ipu.py | 3 ++-
src/pytorch_lightning/strategies/deepspeed.py | 2 +-
src/pytorch_lightning/strategies/ipu.py | 2 +-
src/pytorch_lightning/strategies/utils.py | 2 +-
tests/tests_pytorch/accelerators/test_ipu.py | 2 +-
.../strategies/test_deepspeed_strategy.py | 13 ++++++-------
8 files changed, 16 insertions(+), 13 deletions(-)
diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
index 4cea5685cac6f..dac5533a6cb17 100644
--- a/src/pytorch_lightning/CHANGELOG.md
+++ b/src/pytorch_lightning/CHANGELOG.md
@@ -67,6 +67,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed MPS device being unrecognized ([#13992](https://github.com/Lightning-AI/lightning/pull/13992))
+- Fixed incorrect `precision="mixed"` being used with `DeepSpeedStrategy` and `IPUStrategy` ([#14041](https://github.com/Lightning-AI/lightning/pull/14041))
+
+
- Fixed dtype inference during gradient norm computation ([#14051](https://github.com/Lightning-AI/lightning/pull/14051))
diff --git a/src/pytorch_lightning/plugins/precision/deepspeed.py b/src/pytorch_lightning/plugins/precision/deepspeed.py
index 01d3017760b0e..456bba1e77823 100644
--- a/src/pytorch_lightning/plugins/precision/deepspeed.py
+++ b/src/pytorch_lightning/plugins/precision/deepspeed.py
@@ -60,7 +60,7 @@ def __init__(self, precision: Union[str, int], amp_type: str, amp_level: Optiona
amp_level = amp_level or "O2"
- supported_precision = (PrecisionType.HALF, PrecisionType.FLOAT, PrecisionType.BFLOAT, PrecisionType.MIXED)
+ supported_precision = (PrecisionType.HALF, PrecisionType.FLOAT, PrecisionType.BFLOAT)
if precision not in supported_precision:
raise ValueError(
f"`Trainer(strategy='deepspeed', precision={precision!r})` is not supported."
diff --git a/src/pytorch_lightning/plugins/precision/ipu.py b/src/pytorch_lightning/plugins/precision/ipu.py
index 89f544575f63f..67e5e373e9f52 100644
--- a/src/pytorch_lightning/plugins/precision/ipu.py
+++ b/src/pytorch_lightning/plugins/precision/ipu.py
@@ -19,6 +19,7 @@
import pytorch_lightning as pl
from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin
from pytorch_lightning.utilities import GradClipAlgorithmType
+from pytorch_lightning.utilities.enums import PrecisionType
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.model_helpers import is_overridden
from pytorch_lightning.utilities.warnings import WarningCache
@@ -35,7 +36,7 @@ class IPUPrecisionPlugin(PrecisionPlugin):
"""
def __init__(self, precision: int) -> None:
- supported_precision_values = (16, 32)
+ supported_precision_values = (PrecisionType.HALF, PrecisionType.FLOAT)
if precision not in supported_precision_values:
raise ValueError(
f"`Trainer(accelerator='ipu', precision={precision!r})` is not supported."
diff --git a/src/pytorch_lightning/strategies/deepspeed.py b/src/pytorch_lightning/strategies/deepspeed.py
index 3c31aeb7a7657..8acbc80257bd1 100644
--- a/src/pytorch_lightning/strategies/deepspeed.py
+++ b/src/pytorch_lightning/strategies/deepspeed.py
@@ -696,7 +696,7 @@ def _auto_select_batch_size(self) -> int:
def _format_precision_config(self) -> None:
assert isinstance(self.config, dict)
- if self.precision_plugin.precision in (PrecisionType.HALF, PrecisionType.MIXED):
+ if self.precision_plugin.precision == PrecisionType.HALF:
if "fp16" not in self.config and self.precision_plugin.amp_type == AMPType.NATIVE:
# FP16 is a DeepSpeed standalone AMP implementation
rank_zero_info("Enabling DeepSpeed FP16.")
diff --git a/src/pytorch_lightning/strategies/ipu.py b/src/pytorch_lightning/strategies/ipu.py
index c40addd4244b2..4bedbfd6d70fc 100644
--- a/src/pytorch_lightning/strategies/ipu.py
+++ b/src/pytorch_lightning/strategies/ipu.py
@@ -58,7 +58,7 @@ def __init__(
self.precision = precision
def forward(self, *inputs: Any, **kwargs: Any) -> Any:
- if self.precision in (PrecisionType.MIXED, PrecisionType.HALF):
+ if self.precision == PrecisionType.HALF:
inputs = self._move_float_tensors_to_half(inputs)
return super().forward(*inputs, **kwargs)
diff --git a/src/pytorch_lightning/strategies/utils.py b/src/pytorch_lightning/strategies/utils.py
index b71458bfc30d3..cdae7bf434eca 100644
--- a/src/pytorch_lightning/strategies/utils.py
+++ b/src/pytorch_lightning/strategies/utils.py
@@ -24,7 +24,7 @@ def on_colab_kaggle() -> bool:
def _fp_to_half(tensor: torch.Tensor, precision: PrecisionType) -> torch.Tensor:
if torch.is_floating_point(tensor):
- if precision in (PrecisionType.MIXED, PrecisionType.HALF):
+ if precision == PrecisionType.HALF:
return tensor.half()
if precision == PrecisionType.BFLOAT:
return tensor.bfloat16()
diff --git a/tests/tests_pytorch/accelerators/test_ipu.py b/tests/tests_pytorch/accelerators/test_ipu.py
index 33d59d9a835ca..db3b9d1f91952 100644
--- a/tests/tests_pytorch/accelerators/test_ipu.py
+++ b/tests/tests_pytorch/accelerators/test_ipu.py
@@ -185,7 +185,7 @@ def test_optimization(tmpdir):
@RunIf(ipu=True)
-def test_mixed_precision(tmpdir):
+def test_half_precision(tmpdir):
class TestCallback(Callback):
def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[str] = None) -> None:
assert trainer.strategy.model.precision == 16
diff --git a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py
index 4f2cc14b6c62d..272b03a846688 100644
--- a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py
+++ b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py
@@ -171,12 +171,11 @@ def test_deepspeed_strategy_env(tmpdir, monkeypatch, deepspeed_config):
@RunIf(deepspeed=True)
@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=1)
-@pytest.mark.parametrize("precision", [16, "mixed"])
@pytest.mark.parametrize(
"amp_backend",
["native", pytest.param("apex", marks=RunIf(amp_apex=True))],
)
-def test_deepspeed_precision_choice(_, amp_backend, precision, tmpdir):
+def test_deepspeed_precision_choice(_, amp_backend, tmpdir):
"""Test to ensure precision plugin is also correctly chosen.
DeepSpeed handles precision via Custom DeepSpeedPrecisionPlugin
@@ -188,16 +187,16 @@ def test_deepspeed_precision_choice(_, amp_backend, precision, tmpdir):
accelerator="gpu",
strategy="deepspeed",
amp_backend=amp_backend,
- precision=precision,
+ precision=16,
)
assert isinstance(trainer.strategy, DeepSpeedStrategy)
assert isinstance(trainer.strategy.precision_plugin, DeepSpeedPrecisionPlugin)
- assert trainer.strategy.precision_plugin.precision == precision
+ assert trainer.strategy.precision_plugin.precision == 16
@RunIf(deepspeed=True)
-def test_deepspeed_with_invalid_config_path(tmpdir):
+def test_deepspeed_with_invalid_config_path():
"""Test to ensure if we pass an invalid config path we throw an exception."""
with pytest.raises(
@@ -218,7 +217,7 @@ def test_deepspeed_with_env_path(tmpdir, monkeypatch, deepspeed_config):
@RunIf(deepspeed=True)
-def test_deepspeed_defaults(tmpdir):
+def test_deepspeed_defaults():
"""Ensure that defaults are correctly set as a config for DeepSpeed if no arguments are passed."""
strategy = DeepSpeedStrategy()
assert strategy.config is not None
@@ -663,7 +662,7 @@ def training_step(self, batch, batch_idx):
@RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True)
-def test_deepspeed_multigpu_stage_3(tmpdir, deepspeed_config):
+def test_deepspeed_multigpu_stage_3(tmpdir):
"""Test to ensure ZeRO Stage 3 works with a parallel model."""
model = ModelParallelBoringModel()
trainer = Trainer(
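A sketch of the tightened validation, assuming the deepspeed extras are installed: `"mixed"` is no longer in `DeepSpeedPrecisionPlugin`'s supported set, so constructing the plugin with it raises the `ValueError` shown in the diff, while 16/32/bf16 remain valid.

```python
from pytorch_lightning.plugins.precision.deepspeed import DeepSpeedPrecisionPlugin

plugin = DeepSpeedPrecisionPlugin(precision=16, amp_type="native")  # still supported

try:
    DeepSpeedPrecisionPlugin(precision="mixed", amp_type="native")  # now rejected
except ValueError as err:
    print(err)  # `Trainer(strategy='deepspeed', precision='mixed')` is not supported. ...
```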
From 56abd60f048f43a7abd036380a6b5297baaa3854 Mon Sep 17 00:00:00 2001
From: Gautier Dagan
Date: Tue, 9 Aug 2022 17:32:18 +0100
Subject: [PATCH 20/59] Fix assert wandb Run when mode="disabled" (#14112)
---
src/pytorch_lightning/loggers/wandb.py | 8 +++++---
1 file changed, 5 insertions(+), 3 deletions(-)
diff --git a/src/pytorch_lightning/loggers/wandb.py b/src/pytorch_lightning/loggers/wandb.py
index 8e30827759b99..530fb58fabe5e 100644
--- a/src/pytorch_lightning/loggers/wandb.py
+++ b/src/pytorch_lightning/loggers/wandb.py
@@ -328,7 +328,7 @@ def __getstate__(self) -> Dict[str, Any]:
@property # type: ignore[misc]
@rank_zero_experiment
- def experiment(self) -> Run:
+ def experiment(self) -> Union[Run, RunDisabled]:
r"""
Actual wandb object. To use wandb features in your
@@ -361,11 +361,13 @@ def experiment(self) -> Run:
self._experiment = wandb.init(**self._wandb_init)
# define default x-axis
- if isinstance(self._experiment, Run) and getattr(self._experiment, "define_metric", None):
+ if isinstance(self._experiment, (Run, RunDisabled)) and getattr(
+ self._experiment, "define_metric", None
+ ):
self._experiment.define_metric("trainer/global_step")
self._experiment.define_metric("*", step_metric="trainer/global_step", step_sync=True)
- assert isinstance(self._experiment, Run)
+ assert isinstance(self._experiment, (Run, RunDisabled))
return self._experiment
def watch(self, model: nn.Module, log: str = "gradients", log_freq: int = 100, log_graph: bool = True) -> None:
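A minimal repro of the scenario being fixed, assuming a working `wandb` install: extra `WandbLogger` kwargs are forwarded to `wandb.init`, and `mode="disabled"` makes it return a `RunDisabled` rather than a `Run`, which previously tripped the `isinstance` assert.

```python
from pytorch_lightning.loggers import WandbLogger

logger = WandbLogger(mode="disabled")  # kwargs are passed through to wandb.init
experiment = logger.experiment         # no longer raises AssertionError
experiment.log({"loss": 0.1})          # a no-op while disabled
```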
From d85085479d9248d2f6a06821a0cb41c4a2eb02fc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?=
Date: Tue, 9 Aug 2022 19:31:11 +0200
Subject: [PATCH 21/59] Reset all results on epoch end (#14061)
---
src/pytorch_lightning/CHANGELOG.md | 3 ++
.../logger_connector/logger_connector.py | 3 +-
.../logging_/test_train_loop_logging.py | 29 +++++++++++++++++--
3 files changed, 31 insertions(+), 4 deletions(-)
diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
index dac5533a6cb17..5dfd871f933f5 100644
--- a/src/pytorch_lightning/CHANGELOG.md
+++ b/src/pytorch_lightning/CHANGELOG.md
@@ -61,6 +61,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed the `NeptuneLogger` dependency being unrecognized ([#13988](https://github.com/Lightning-AI/lightning/pull/13988))
+- Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061))
+
+
- Fixed an issue where users would be warned about unset `max_epochs` even when `fast_dev_run` was set ([#13262](https://github.com/Lightning-AI/lightning/pull/13262))
diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
index ff882912625d0..02e17a8d93494 100644
--- a/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
+++ b/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py
@@ -163,8 +163,7 @@ def update_train_epoch_metrics(self) -> None:
self.log_metrics(self.metrics["log"])
# reset result collection for next epoch
- assert self.trainer._results is not None
- self.trainer._results.reset(metrics=True)
+ self.reset_results()
"""
Utilities and properties
diff --git a/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py b/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py
index 5855eba4c86af..d16be306b9365 100644
--- a/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py
+++ b/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py
@@ -569,11 +569,12 @@ def on_train_epoch_end(self, trainer, pl_module):
"accelerator",
[
pytest.param("gpu", marks=RunIf(min_cuda_gpus=1)),
+ "cpu",
],
)
def test_metric_are_properly_reduced(tmpdir, accelerator):
class TestingModel(BoringModel):
- def __init__(self, *args, **kwargs) -> None:
+ def __init__(self) -> None:
super().__init__()
self.val_acc = Accuracy()
@@ -592,7 +593,6 @@ def validation_step(self, batch, batch_idx):
return super().validation_step(batch, batch_idx)
early_stop = EarlyStopping(monitor="val_acc", mode="max")
-
checkpoint = ModelCheckpoint(monitor="val_acc", save_last=True, save_top_k=2, mode="max")
model = TestingModel()
@@ -812,3 +812,28 @@ def training_step(self, batch, batch_idx):
call(metrics={"foo_epoch": 0.0, "epoch": 1}, step=3),
]
)
+
+
+@mock.patch("pytorch_lightning.loggers.TensorBoardLogger.log_metrics")
+def test_log_on_train_start(mock_log_metrics, tmpdir):
+ """Tests that logged metrics on_train_start get reset after the first epoch."""
+
+ class MyModel(BoringModel):
+ def on_train_start(self):
+ self.log("foo", 123)
+
+ model = MyModel()
+ trainer = Trainer(
+ default_root_dir=tmpdir,
+ limit_train_batches=1,
+ limit_val_batches=0,
+ max_epochs=2,
+ log_every_n_steps=1,
+ enable_model_summary=False,
+ enable_checkpointing=False,
+ enable_progress_bar=False,
+ )
+ trainer.fit(model)
+
+ assert mock_log_metrics.mock_calls == [call(metrics={"foo": 123.0, "epoch": 0}, step=0)]
+ assert trainer.max_epochs > 1
From 619c2ff05827872973b2eed18d06651f7cd8bd4e Mon Sep 17 00:00:00 2001
From: Raphael Randschau
Date: Tue, 9 Aug 2022 12:17:57 -0700
Subject: [PATCH 22/59] [CLI] fix cluster creation CLI requiring instance-type
selection (#14056)
fix cluster creation CLI requiring instance-type selection
We've marked `instance_types` as `required=False`, but the CLI unconditionally calls `split` on the value.
So if nothing is provided, we actually get a runtime error, effectively making the flag required.
Co-authored-by: thomas chaton
---
src/lightning_app/cli/lightning_cli_create.py | 2 +-
tests/tests_app/cli/test_cli.py | 19 +++++++++++++------
2 files changed, 14 insertions(+), 7 deletions(-)
diff --git a/src/lightning_app/cli/lightning_cli_create.py b/src/lightning_app/cli/lightning_cli_create.py
index 7e45fe7e7c078..d400db4b6f337 100644
--- a/src/lightning_app/cli/lightning_cli_create.py
+++ b/src/lightning_app/cli/lightning_cli_create.py
@@ -79,7 +79,7 @@ def create_cluster(
region=region,
role_arn=role_arn,
external_id=external_id,
- instance_types=instance_types.split(","),
+ instance_types=instance_types.split(",") if instance_types is not None else None,
edit_before_creation=edit_before_creation,
cost_savings=cost_savings,
wait=wait,
diff --git a/tests/tests_app/cli/test_cli.py b/tests/tests_app/cli/test_cli.py
index 16e641ac38f23..8cc5dd50f836e 100644
--- a/tests/tests_app/cli/test_cli.py
+++ b/tests/tests_app/cli/test_cli.py
@@ -70,7 +70,15 @@ def test_main_lightning_cli_help():
@mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock())
@mock.patch("lightning_app.cli.cmd_clusters.AWSClusterManager.create")
-def test_create_cluster(create: mock.MagicMock):
+@pytest.mark.parametrize(
+ "instance_types,expected_instance_types",
+ [
+ (["--instance-types", "t3.xlarge"], ["t3.xlarge"]),
+ (["--instance-types", "t3.xlarge,t3.2xlarge"], ["t3.xlarge", "t3.2xlarge"]),
+ ([], None),
+ ],
+)
+def test_create_cluster(create_command: mock.MagicMock, instance_types, expected_instance_types):
runner = CliRunner()
runner.invoke(
create_cluster,
@@ -82,17 +90,16 @@ def test_create_cluster(create: mock.MagicMock):
"dummy",
"--role-arn",
"arn:aws:iam::1234567890:role/lai-byoc",
- "--instance-types",
- "t2.small",
- ],
+ ]
+ + instance_types,
)
- create.assert_called_once_with(
+ create_command.assert_called_once_with(
cluster_name="test-7",
region="us-east-1",
role_arn="arn:aws:iam::1234567890:role/lai-byoc",
external_id="dummy",
- instance_types=["t2.small"],
+ instance_types=expected_instance_types,
edit_before_creation=False,
cost_savings=False,
wait=False,
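A sketch mirroring the updated test (some required options may be elided; the cluster name and ARN are placeholders): invoking the command without `--instance-types` now passes `instance_types=None` through instead of crashing on `None.split(",")`.

```python
from unittest import mock
from click.testing import CliRunner
from lightning_app.cli.lightning_cli_create import create_cluster

# mock out authentication and the AWS manager, as the test suite does
with mock.patch("lightning_cloud.login.Auth.authenticate"), mock.patch(
    "lightning_app.cli.cmd_clusters.AWSClusterManager.create"
) as create_command:
    CliRunner().invoke(
        create_cluster,
        ["test-7", "--external-id", "dummy", "--role-arn", "arn:aws:iam::1234567890:role/lai-byoc"],
    )
    # the flag was omitted, so the manager receives None rather than a crash
    assert create_command.call_args.kwargs["instance_types"] is None
```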
From 06c255c5c1889e6ecc640dc82a24193ce388511a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 10 Aug 2022 00:54:10 +0200
Subject: [PATCH 23/59] Skip ddp fork tests on windows (#14121)
---
.../strategies/test_ddp_spawn_strategy.py | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py b/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py
index f485060833320..7fb22206c45c6 100644
--- a/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py
+++ b/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py
@@ -184,11 +184,17 @@ def test_ddp_spawn_strategy_set_timeout(mock_init_process_group):
"strategy_name,expected_ddp_kwargs",
[
("ddp_spawn", {}),
- ("ddp_fork", {}),
- ("ddp_notebook", {}),
+ pytest.param("ddp_fork", {}, marks=RunIf(skip_windows=True)),
+ pytest.param("ddp_notebook", {}, marks=RunIf(skip_windows=True)),
("ddp_spawn_find_unused_parameters_false", {"find_unused_parameters": False}),
- ("ddp_fork_find_unused_parameters_false", {"find_unused_parameters": False}),
- ("ddp_notebook_find_unused_parameters_false", {"find_unused_parameters": False}),
+ pytest.param(
+ "ddp_fork_find_unused_parameters_false", {"find_unused_parameters": False}, marks=RunIf(skip_windows=True)
+ ),
+ pytest.param(
+ "ddp_notebook_find_unused_parameters_false",
+ {"find_unused_parameters": False},
+ marks=RunIf(skip_windows=True),
+ ),
],
)
def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs):
From 975a4fc2f1daf5a1662a0d1f47212e7dcdae8b2b Mon Sep 17 00:00:00 2001
From: Adam Reeve
Date: Tue, 9 Aug 2022 16:18:21 -0700
Subject: [PATCH 24/59] Support checkpoint save and load with Stochastic Weight
Averaging (#9938)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Co-authored-by: thomas chaton
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Adrian Wälchli
Co-authored-by: Carlos Mocholi
Co-authored-by: Kushashwa Ravi Shrimali
Co-authored-by: Jirka
Co-authored-by: Rohit Gupta
---
src/pytorch_lightning/CHANGELOG.md | 3 +
.../callbacks/stochastic_weight_avg.py | 78 ++++++++++-
.../callbacks/test_stochastic_weight_avg.py | 128 +++++++++++++++++-
3 files changed, 195 insertions(+), 14 deletions(-)
diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
index 5dfd871f933f5..8852367a116f6 100644
--- a/src/pytorch_lightning/CHANGELOG.md
+++ b/src/pytorch_lightning/CHANGELOG.md
@@ -73,6 +73,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed incorrect `precision="mixed"` being used with `DeepSpeedStrategy` and `IPUStrategy` ([#14041](https://github.com/Lightning-AI/lightning/pull/14041))
+- Fixed resuming from a checkpoint when using Stochastic Weight Averaging (SWA) ([#9938](https://github.com/Lightning-AI/lightning/pull/9938))
+
+
- Fixed dtype inference during gradient norm computation ([#14051](https://github.com/Lightning-AI/lightning/pull/14051))
diff --git a/src/pytorch_lightning/callbacks/stochastic_weight_avg.py b/src/pytorch_lightning/callbacks/stochastic_weight_avg.py
index 20a3dcc3f0f26..6650bb3f0c479 100644
--- a/src/pytorch_lightning/callbacks/stochastic_weight_avg.py
+++ b/src/pytorch_lightning/callbacks/stochastic_weight_avg.py
@@ -16,7 +16,7 @@
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
"""
from copy import deepcopy
-from typing import Any, Callable, cast, List, Optional, Union
+from typing import Any, Callable, cast, Dict, List, Optional, Union
import torch
from torch import nn, Tensor
@@ -24,6 +24,7 @@
import pytorch_lightning as pl
from pytorch_lightning.callbacks.callback import Callback
+from pytorch_lightning.strategies import DDPFullyShardedStrategy, DeepSpeedStrategy
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.rank_zero import rank_zero_info, rank_zero_warn
from pytorch_lightning.utilities.types import _LRScheduler, LRSchedulerConfig
@@ -112,15 +113,22 @@ def __init__(
if device is not None and not isinstance(device, (torch.device, str)):
raise MisconfigurationException(f"device is expected to be a torch.device or a str. Found {device}")
+ self.n_averaged: Optional[torch.Tensor] = None
self._swa_epoch_start = swa_epoch_start
self._swa_lrs = swa_lrs
self._annealing_epochs = annealing_epochs
self._annealing_strategy = annealing_strategy
self._avg_fn = avg_fn or self.avg_fn
self._device = device
- self._max_epochs: int
- self._model_contains_batch_norm: bool
+ self._model_contains_batch_norm: Optional[bool] = None
self._average_model: "pl.LightningModule"
+ self._initialized = False
+ self._swa_scheduler: Optional[_LRScheduler] = None
+ self._scheduler_state: Optional[Dict] = None
+ self._init_n_averaged = 0
+ self._latest_update_epoch = -1
+ self.momenta: Optional[Dict[nn.modules.batchnorm._BatchNorm, float]] = None
+ self._max_epochs: int
@property
def swa_start(self) -> int:
@@ -147,6 +155,9 @@ def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -
if len(trainer.lr_scheduler_configs) > 1:
raise MisconfigurationException("SWA currently not supported for more than 1 `lr_scheduler`.")
+ if isinstance(trainer.strategy, (DDPFullyShardedStrategy, DeepSpeedStrategy)):
+ raise MisconfigurationException("SWA does not currently support sharded models.")
+
if isinstance(self._swa_epoch_start, float):
self._swa_epoch_start = int(trainer.max_epochs * self._swa_epoch_start)
@@ -158,8 +169,13 @@ def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -
assert trainer.fit_loop.max_epochs is not None
trainer.fit_loop.max_epochs += 1
+ if self._scheduler_state is not None:
+ self._clear_schedulers(trainer)
+
def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None:
- if trainer.current_epoch == self.swa_start:
+ if (not self._initialized) and (self.swa_start <= trainer.current_epoch <= self.swa_end):
+ self._initialized = True
+
# move average model to request device.
self._average_model = self._average_model.to(self._device or pl_module.device)
@@ -180,6 +196,17 @@ def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningMo
last_epoch=trainer.max_epochs if self._annealing_strategy == "cos" else -1,
),
)
+ if self._scheduler_state is not None:
+ # Restore scheduler state from checkpoint
+ self._swa_scheduler.load_state_dict(self._scheduler_state)
+ elif trainer.current_epoch != self.swa_start:
+ # Log a warning if we're initializing after swa_start without any checkpoint data,
+ # as behaviour will be different compared to having checkpoint data.
+ rank_zero_warn(
+ "SWA is initializing after swa_start without any checkpoint data. "
+ "This may be caused by loading a checkpoint from an older version of PyTorch Lightning."
+ )
+
# We assert that there is only one optimizer on fit start, so know opt_idx is always 0
default_scheduler_cfg = LRSchedulerConfig(self._swa_scheduler, opt_idx=0)
assert default_scheduler_cfg.interval == "epoch" and default_scheduler_cfg.frequency == 1
@@ -196,14 +223,18 @@ def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningMo
else:
trainer.lr_scheduler_configs.append(default_scheduler_cfg)
- self.n_averaged = torch.tensor(0, dtype=torch.long, device=pl_module.device)
+ if self.n_averaged is None:
+ self.n_averaged = torch.tensor(self._init_n_averaged, dtype=torch.long, device=pl_module.device)
- if self.swa_start <= trainer.current_epoch <= self.swa_end:
+ if (self.swa_start <= trainer.current_epoch <= self.swa_end) and (
+ trainer.current_epoch > self._latest_update_epoch
+ ):
+ assert self.n_averaged is not None
self.update_parameters(self._average_model, pl_module, self.n_averaged, self._avg_fn)
+ self._latest_update_epoch = trainer.current_epoch
# Note: No > here in case the callback is saved with the model and training continues
if trainer.current_epoch == self.swa_end + 1:
-
# Transfer weights from average model to pl_module
self.transfer_weights(self._average_model, pl_module)
@@ -265,6 +296,7 @@ def reset_batch_norm_and_save_state(self, pl_module: "pl.LightningModule") -> No
def reset_momenta(self) -> None:
"""Adapted from https://github.com/pytorch/pytorch/blob/v1.7.1/torch/optim/swa_utils.py#L164-L165."""
+ assert self.momenta is not None
for bn_module in self.momenta:
bn_module.momentum = self.momenta[bn_module]
@@ -285,3 +317,35 @@ def update_parameters(
def avg_fn(averaged_model_parameter: Tensor, model_parameter: Tensor, num_averaged: Tensor) -> Tensor:
"""Adapted from https://github.com/pytorch/pytorch/blob/v1.7.1/torch/optim/swa_utils.py#L95-L97."""
return averaged_model_parameter + (model_parameter - averaged_model_parameter) / (num_averaged + 1)
+
+ def state_dict(self) -> Dict[str, Any]:
+ return {
+ "n_averaged": 0 if self.n_averaged is None else self.n_averaged.item(),
+ "latest_update_epoch": self._latest_update_epoch,
+ "scheduler_state": None if self._swa_scheduler is None else self._swa_scheduler.state_dict(),
+ "average_model_state": None if self._average_model is None else self._average_model.state_dict(),
+ }
+
+ def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
+ self._init_n_averaged = state_dict["n_averaged"]
+ self._latest_update_epoch = state_dict["latest_update_epoch"]
+ self._scheduler_state = state_dict["scheduler_state"]
+ self._load_average_model_state(state_dict["average_model_state"])
+
+ @staticmethod
+ def _clear_schedulers(trainer: "pl.Trainer") -> None:
+ # If we have scheduler state saved, clear the scheduler configs so that we don't try to
+ # load state into the wrong type of schedulers when restoring scheduler checkpoint state.
+ # We'll configure the scheduler and re-load its state in on_train_epoch_start.
+ # Note that this relies on the callback state being restored before the scheduler state is
+ # restored, and doesn't work if restore_checkpoint_after_setup is True, but at the time of
+ # writing that is only True for deepspeed which is already not supported by SWA.
+ # See https://github.com/PyTorchLightning/pytorch-lightning/issues/11665 for background.
+ if trainer.lr_scheduler_configs:
+ assert len(trainer.lr_scheduler_configs) == 1
+ trainer.lr_scheduler_configs.clear()
+
+ def _load_average_model_state(self, model_state: Any) -> None:
+ if self._average_model is None:
+ return
+ self._average_model.load_state_dict(model_state)
diff --git a/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py b/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py
index 859cf2fa98c0c..65a0fea2fb4a5 100644
--- a/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py
+++ b/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py
@@ -12,11 +12,15 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
+import os
+from pathlib import Path
+from typing import ContextManager, Optional
from unittest import mock
import pytest
import torch
from torch import nn
+from torch.optim.lr_scheduler import LambdaLR
from torch.optim.swa_utils import SWALR
from torch.utils.data import DataLoader
@@ -30,7 +34,9 @@
class SwaTestModel(BoringModel):
- def __init__(self, batchnorm: bool = True, interval: str = "epoch", iterable_dataset: bool = False):
+ def __init__(
+ self, batchnorm: bool = True, interval: str = "epoch", iterable_dataset: bool = False, crash_on_epoch=None
+ ):
super().__init__()
layers = [nn.Linear(32, 32)]
if batchnorm:
@@ -39,17 +45,18 @@ def __init__(self, batchnorm: bool = True, interval: str = "epoch", iterable_dat
self.layer = nn.Sequential(*layers)
self.interval = interval
self.iterable_dataset = iterable_dataset
+ self.crash_on_epoch = crash_on_epoch
def training_step(self, batch, batch_idx):
+ if self.crash_on_epoch and self.trainer.current_epoch >= self.crash_on_epoch:
+ raise Exception("SWA crash test")
output = self.forward(batch)
loss = self.loss(batch, output)
return {"loss": loss}
def train_dataloader(self):
-
dset_cls = RandomIterableDataset if self.iterable_dataset else RandomDataset
dset = dset_cls(32, 64)
-
return DataLoader(dset, batch_size=2)
def configure_optimizers(self):
@@ -66,6 +73,8 @@ def configure_optimizers(self):
class SwaTestCallback(StochasticWeightAveraging):
update_parameters_calls: int = 0
transfer_weights_calls: int = 0
+ # Record the first epoch; if we are resuming from a checkpoint, this may not be equal to 0
+ first_epoch: Optional[int] = None
def update_parameters(self, *args, **kwargs):
self.update_parameters_calls += 1
@@ -77,6 +86,11 @@ def transfer_weights(self, *args, **kwargs):
def on_train_epoch_start(self, trainer, *args):
super().on_train_epoch_start(trainer, *args)
+ if self.first_epoch is None and not trainer.fit_loop.restarting:
+ # since the checkpoint loaded was saved `on_train_epoch_end`, the first `FitLoop` iteration will
+ # not update the model and just call the epoch-level hooks, for that reason, we check that we are not
+ # restarting before choosing the first epoch
+ self.first_epoch = trainer.current_epoch
assert trainer.fit_loop._skip_backward == (trainer.current_epoch > self.swa_end)
if self.swa_start <= trainer.current_epoch:
assert isinstance(trainer.lr_scheduler_configs[0].scheduler, SWALR)
@@ -88,6 +102,7 @@ def on_train_epoch_end(self, trainer, *args):
if self.swa_start <= trainer.current_epoch <= self.swa_end:
swa_epoch = trainer.current_epoch - self.swa_start
assert self.n_averaged == swa_epoch + 1
+ assert self._swa_scheduler is not None
# Scheduler is stepped once on initialization and then at the end of each epoch
assert self._swa_scheduler._step_count == swa_epoch + 2
elif trainer.current_epoch > self.swa_end:
@@ -103,10 +118,13 @@ def on_train_end(self, trainer, pl_module):
if not isinstance(trainer.strategy, DDPSpawnStrategy):
# check backward call count. the batchnorm update epoch should not backward
- assert trainer.strategy.backward.call_count == trainer.max_epochs * trainer.limit_train_batches
+ assert trainer.strategy.backward.call_count == (
+ (trainer.max_epochs - self.first_epoch) * trainer.limit_train_batches
+ )
# check call counts
- assert self.update_parameters_calls == trainer.max_epochs - (self._swa_epoch_start - 1)
+ first_swa_epoch = max(self.first_epoch, self.swa_start)
+ assert self.update_parameters_calls == trainer.max_epochs - first_swa_epoch
assert self.transfer_weights_calls == 1
@@ -140,7 +158,7 @@ def train_with_swa(
devices=devices,
)
- with mock.patch.object(Strategy, "backward", wraps=trainer.strategy.backward):
+ with _backward_patch(trainer):
trainer.fit(model)
# check the model is the expected
@@ -226,9 +244,10 @@ def test_swa_multiple_lrs(tmpdir):
class TestModel(BoringModel):
def __init__(self):
- super(BoringModel, self).__init__()
+ super().__init__()
self.layer1 = torch.nn.Linear(32, 32)
self.layer2 = torch.nn.Linear(32, 2)
+ self.on_train_epoch_start_called = False
def forward(self, x):
x = self.layer1(x)
@@ -255,3 +274,98 @@ def on_train_epoch_start(self):
)
trainer.fit(model)
assert model.on_train_epoch_start_called
+
+
+def _swa_resume_training_from_checkpoint(tmpdir, model, resume_model, ddp=False):
+ swa_start = 3
+ trainer_kwargs = {
+ "default_root_dir": tmpdir,
+ "max_epochs": 5,
+ "accelerator": "cpu",
+ "strategy": "ddp_spawn_find_unused_parameters_false" if ddp else None,
+ "devices": 2 if ddp else 1,
+ "limit_train_batches": 5,
+ "limit_val_batches": 0,
+ "accumulate_grad_batches": 2,
+ "enable_progress_bar": False,
+ }
+ trainer = Trainer(callbacks=SwaTestCallback(swa_epoch_start=swa_start, swa_lrs=0.1), **trainer_kwargs)
+
+ with _backward_patch(trainer), pytest.raises(Exception, match="SWA crash test"):
+ trainer.fit(model)
+
+ checkpoint_dir = Path(tmpdir) / "lightning_logs" / "version_0" / "checkpoints"
+ checkpoint_files = os.listdir(checkpoint_dir)
+ assert len(checkpoint_files) == 1
+ ckpt_path = str(checkpoint_dir / checkpoint_files[0])
+
+ trainer = Trainer(callbacks=SwaTestCallback(swa_epoch_start=swa_start, swa_lrs=0.1), **trainer_kwargs)
+
+ with _backward_patch(trainer):
+ trainer.fit(resume_model, ckpt_path=ckpt_path)
+
+
+class CustomSchedulerModel(SwaTestModel):
+ def configure_optimizers(self):
+ optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1)
+
+ def lr_lambda(current_step: int):
+ return 0.1
+
+ scheduler = LambdaLR(optimizer, lr_lambda, -1)
+ return {
+ "optimizer": optimizer,
+ "lr_scheduler": {
+ "scheduler": scheduler,
+ "interval": self.interval,
+ },
+ }
+
+
+@pytest.mark.parametrize("crash_on_epoch", [1, 3])
+def test_swa_resume_training_from_checkpoint(tmpdir, crash_on_epoch):
+ model = SwaTestModel(crash_on_epoch=crash_on_epoch)
+ resume_model = SwaTestModel()
+ _swa_resume_training_from_checkpoint(tmpdir, model, resume_model)
+
+
+@pytest.mark.parametrize("crash_on_epoch", [1, 3])
+def test_swa_resume_training_from_checkpoint_custom_scheduler(tmpdir, crash_on_epoch):
+ # Reproduces the bug reported in https://github.com/PyTorchLightning/pytorch-lightning/issues/11665
+ model = CustomSchedulerModel(crash_on_epoch=crash_on_epoch)
+ resume_model = CustomSchedulerModel()
+ _swa_resume_training_from_checkpoint(tmpdir, model, resume_model)
+
+
+@RunIf(skip_windows=True)
+def test_swa_resume_training_from_checkpoint_ddp(tmpdir):
+ model = SwaTestModel(crash_on_epoch=3)
+ resume_model = SwaTestModel()
+ _swa_resume_training_from_checkpoint(tmpdir, model, resume_model, ddp=True)
+
+
+@pytest.mark.parametrize(
+ "strategy",
+ [
+ pytest.param("fsdp", marks=RunIf(fairscale_fully_sharded=True, min_cuda_gpus=1)),
+ pytest.param("deepspeed", marks=RunIf(deepspeed=True, min_cuda_gpus=1)),
+ ],
+)
+def test_misconfiguration_error_with_sharded_model(tmpdir, strategy: str):
+ model = SwaTestModel()
+ swa_callback = SwaTestCallback(swa_epoch_start=2, swa_lrs=0.1)
+ trainer = Trainer(
+ default_root_dir=tmpdir,
+ enable_progress_bar=False,
+ max_epochs=5,
+ callbacks=[swa_callback],
+ strategy=strategy,
+ accelerator="gpu",
+ devices=1,
+ )
+ with pytest.raises(MisconfigurationException, match="SWA does not currently support sharded models"):
+ trainer.fit(model)
+
+
+def _backward_patch(trainer: Trainer) -> ContextManager:
+ return mock.patch.object(Strategy, "backward", wraps=trainer.strategy.backward)
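A usage sketch of the new behavior, with a hypothetical `model`, `resume_model` and checkpoint path: SWA's `n_averaged`, scheduler state and average-model weights are now captured via `state_dict`/`load_state_dict`, so an interrupted SWA run can be resumed with `ckpt_path`.

```python
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import StochasticWeightAveraging

def make_trainer():
    return Trainer(max_epochs=5, callbacks=[StochasticWeightAveraging(swa_lrs=0.1, swa_epoch_start=3)])

make_trainer().fit(model)  # `model` is any LightningModule; suppose this run is interrupted

# resuming restores the SWA callback state alongside the rest of the checkpoint
make_trainer().fit(resume_model, ckpt_path="lightning_logs/version_0/checkpoints/last.ckpt")
```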
From 8fa9e8651d044e6122a4380f570fd2451a665f8c Mon Sep 17 00:00:00 2001
From: Luca Medeiros <67411094+luca-medeiros@users.noreply.github.com>
Date: Wed, 10 Aug 2022 09:38:18 +0900
Subject: [PATCH 25/59] Update collect env details and issue template (#14017)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Co-authored-by: Carlos Mocholí
---
.github/ISSUE_TEMPLATE/bug_report.md | 8 ++++++
requirements/collect_env_details.py | 41 ++++++++++------------------
2 files changed, 22 insertions(+), 27 deletions(-)
diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
index f08865180ba1d..de4eacde1f39e 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.md
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -41,8 +41,16 @@ You can get the script and run it with:
```bash
wget https://raw.githubusercontent.com/Lightning-AI/lightning/master/requirements/collect_env_details.py
python collect_env_details.py
+
```
+
+
+ <details>
+   <summary>Details</summary>
+     Paste the output here and move this toggle outside of the comment block.
+ </details>
+
+
+
You can also fill out the list below manually.
-->
diff --git a/requirements/collect_env_details.py b/requirements/collect_env_details.py
index 1d65753a55553..b0c47efc43859 100644
--- a/requirements/collect_env_details.py
+++ b/requirements/collect_env_details.py
@@ -20,27 +20,17 @@
import platform
import sys
-import numpy
+import pkg_resources
import torch
-import tqdm
sys.path += [os.path.abspath(".."), os.path.abspath("")]
-import pytorch_lightning # noqa: E402
-try:
- import lightning
-except ModuleNotFoundError:
- pass
-try:
- import lightning_app
-except ModuleNotFoundError:
- pass
LEVEL_OFFSET = "\t"
KEY_PADDING = 20
-def info_system():
+def info_system() -> dict:
return {
"OS": platform.system(),
"architecture": platform.architecture(),
@@ -50,28 +40,24 @@ def info_system():
}
-def info_cuda():
+def info_cuda() -> dict:
return {
- "GPU": [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())],
- # 'nvidia_driver': get_nvidia_driver_version(run_lambda),
+ "GPU": [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())] or None,
"available": torch.cuda.is_available(),
"version": torch.version.cuda,
}
-def info_packages():
- return {
- "numpy": numpy.__version__,
- "pyTorch_version": torch.__version__,
- "pyTorch_debug": torch.version.debug,
- "pytorch-lightning": pytorch_lightning.__version__,
- "lightning": lightning.__version__ if "lightning" in sys.modules else None,
- "lightning_app": lightning_app.__version__ if "lightning_app" in sys.modules else None,
- "tqdm": tqdm.__version__,
- }
+def info_packages() -> dict:
+ """Get name and version of all installed packages."""
+ packages = {}
+ for dist in pkg_resources.working_set:
+ package = dist.as_requirement()
+ packages[package.key] = package.specs[0][1]
+ return packages
-def nice_print(details, level=0):
+def nice_print(details: dict, level: int = 0) -> list:
lines = []
for k in sorted(details):
key = f"* {k}:" if level == 0 else f"- {k}:"
@@ -88,8 +74,9 @@ def nice_print(details, level=0):
return lines
-def main():
+def main() -> None:
details = {"System": info_system(), "CUDA": info_cuda(), "Packages": info_packages()}
+ details["Lightning"] = {k: v for k, v in details["Packages"].items() if "torch" in k or "lightning" in k}
lines = nice_print(details)
text = os.linesep.join(lines)
print(text)
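The new package listing in brief, using the same `pkg_resources` calls as the patch: every installed distribution is reported, replacing the hand-maintained import list.

```python
import pkg_resources

# name -> version for everything in the current environment
packages = {dist.as_requirement().key: dist.as_requirement().specs[0][1] for dist in pkg_resources.working_set}
# the "Lightning" section of the report is just the torch/lightning subset
print({k: v for k, v in packages.items() if "torch" in k or "lightning" in k})
```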
From d211d46e1db3ca4c4c938cafbfe51704f51b8ab4 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta
Date: Wed, 10 Aug 2022 11:35:41 +0900
Subject: [PATCH 26/59] CI: Replace `_` in GHA workflow filenames with `-`
(#13917)
* Rename workflow files
* Update docs
* Fix azure badges
* Update the main readme
* bad rebase
* Update doc
---
.actions/setup_tools.py | 5 ++-
.github/workflows/README.md | 34 +++++++++----------
...e2e_test.yml => ci-app-cloud-e2e-test.yml} | 0
...i-app_examples.yml => ci-app-examples.yml} | 0
.../{ci-app_tests.yml => ci-app-tests.yml} | 0
...{ci_pkg-install.yml => ci-pkg-install.yml} | 0
...pr-gatekeeper.yml => ci-pr-gatekeeper.yml} | 0
...st-conda.yml => ci-pytorch-test-conda.yml} | 0
...test-full.yml => ci-pytorch-test-full.yml} | 0
...test-slow.yml => ci-pytorch-test-slow.yml} | 0
.../{ci_schema.yml => ci-schema.yml} | 0
...h_dockers.yml => cicd-pytorch-dockers.yml} | 0
README.md | 25 ++++++++------
src/pytorch_lightning/README.md | 26 +++++++-------
14 files changed, 46 insertions(+), 44 deletions(-)
rename .github/workflows/{ci-app_cloud_e2e_test.yml => ci-app-cloud-e2e-test.yml} (100%)
rename .github/workflows/{ci-app_examples.yml => ci-app-examples.yml} (100%)
rename .github/workflows/{ci-app_tests.yml => ci-app-tests.yml} (100%)
rename .github/workflows/{ci_pkg-install.yml => ci-pkg-install.yml} (100%)
rename .github/workflows/{ci_pr-gatekeeper.yml => ci-pr-gatekeeper.yml} (100%)
rename .github/workflows/{ci-pytorch_test-conda.yml => ci-pytorch-test-conda.yml} (100%)
rename .github/workflows/{ci-pytorch_test-full.yml => ci-pytorch-test-full.yml} (100%)
rename .github/workflows/{ci-pytorch_test-slow.yml => ci-pytorch-test-slow.yml} (100%)
rename .github/workflows/{ci_schema.yml => ci-schema.yml} (100%)
rename .github/workflows/{cicd-pytorch_dockers.yml => cicd-pytorch-dockers.yml} (100%)
diff --git a/.actions/setup_tools.py b/.actions/setup_tools.py
index 5088be2020738..a76e81246798c 100644
--- a/.actions/setup_tools.py
+++ b/.actions/setup_tools.py
@@ -94,11 +94,10 @@ def load_readme_description(path_dir: str, homepage: str, version: str) -> str:
text = text.replace("pytorch-lightning.readthedocs.io/en/stable/", f"pytorch-lightning.readthedocs.io/en/{version}")
# codecov badge
text = text.replace("/branch/master/graph/badge.svg", f"/release/{version}/graph/badge.svg")
- # replace github badges for release ones
+ # github actions badge
text = text.replace("badge.svg?branch=master&event=push", f"badge.svg?tag={version}")
- # Azure...
+ # azure pipelines badge
text = text.replace("?branchName=master", f"?branchName=refs%2Ftags%2F{version}")
- text = re.sub(r"\?definitionId=\d+&branchName=master", f"?definitionId=2&branchName=refs%2Ftags%2F{version}", text)
skip_begin = r""
skip_end = r""
diff --git a/.github/workflows/README.md b/.github/workflows/README.md
index f559551e1237f..4ed903c0f3a93 100644
--- a/.github/workflows/README.md
+++ b/.github/workflows/README.md
@@ -4,16 +4,16 @@
## Unit and Integration Testing
-| workflow name | workflow file | action | accelerator\* | (Python, PyTorch) | OS |
-| -------------------------- | ----------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | ------------------------------------------------ | ------------------- |
-| Test full | .github/workflows/ci_test-full.yml | Run all tests except for accelerator-specific, standalone and slow tests. | CPU | (3.7, 1.9), (3.7, 1.12), (3.10, 1.12) | linux, mac, windows |
-| Test with Conda | .github/workflows/ci_test-conda.yml | Same as ci_test-full.yml but with dependencies installed with conda. | CPU | (3.8, 1.8), (3.8, 1.9), (3.8, 1.10), (3.9, 1.12) | linux |
-| Test slow | .github/workflows/ci_test-slow.yml | Run only slow tests. Slow tests usually need to spawn threads and cannot be speed up or simplified. | CPU | (3.7, 1.8) | linux, mac, windows |
-| pytorch-lightning (IPUs) | .azure-pipelines/ipu-tests.yml | Run only IPU-specific tests. | IPU | (3.8, 1.9) | linux |
-| pytorch-lightning (HPUs) | .azure-pipelines/hpu-tests.yml | Run only HPU-specific tests. | HPU | (3.8, 1.10) | linux |
-| pytorch-lightning (GPUs) | .azure-pipelines/gpu-tests.yml | Run all CPU and GPU-specific tests, standalone, and examples. Each standalone test needs to be run in separate processes to avoid unwanted interactions between test cases. | GPU | (3.9, 1.12) | linux |
-| PyTorchLightning.Benchmark | .azure-pipelines/gpu-benchmark.yml | Run speed/memory benchmarks for parity with pure PyTorch. | GPU | (3.9, 1.12) | linux |
-| test-on-tpus | .circleci/config.yml | Run only TPU-specific tests. | TPU | (3.7, 1.12) | linux |
+| workflow name | workflow file | action | accelerator\* | (Python, PyTorch) | OS |
+| -------------------------- | ------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | ------------------------------------------------- | ------------------- |
+| Test PyTorch full | .github/workflows/ci-pytorch-test-full.yml | Run all tests except for accelerator-specific, standalone and slow tests. | CPU | (3.7, 1.9), (3.7, 1.12), (3.9, 1.9), (3.9, 1.12) | linux, mac, windows |
+| Test PyTorch with Conda | .github/workflows/ci-pytorch-test-conda.yml | Same as ci-pytorch-test-full.yml but with dependencies installed with conda. | CPU | (3.8, 1.9), (3.8, 1.10), (3.8, 1.11), (3.9, 1.12) | linux |
+| Test slow                  | .github/workflows/ci-pytorch-test-slow.yml  | Run only slow tests. Slow tests usually need to spawn threads and cannot be sped up or simplified.                                                                             | CPU           | (3.7, 1.11)                                        | linux, mac, windows |
+| pytorch-lightning (IPUs) | .azure-pipelines/ipu-tests.yml | Run only IPU-specific tests. | IPU | (3.8, 1.9) | linux |
+| pytorch-lightning (HPUs) | .azure-pipelines/hpu-tests.yml | Run only HPU-specific tests. | HPU | (3.8, 1.10) | linux |
+| pytorch-lightning (GPUs) | .azure-pipelines/gpu-tests.yml | Run all CPU and GPU-specific tests, standalone, and examples. Each standalone test needs to be run in separate processes to avoid unwanted interactions between test cases. | GPU | (3.9, 1.12) | linux |
+| PyTorchLightning.Benchmark | .azure-pipelines/gpu-benchmark.yml | Run speed/memory benchmarks for parity with pure PyTorch. | GPU | (3.9, 1.12) | linux |
+| test-on-tpus | .circleci/config.yml | Run only TPU-specific tests. | TPU | (3.7, 1.12) | linux |
- \*Accelerators used in CI
- GPU: 2 x NVIDIA Tesla V100
@@ -33,15 +33,15 @@
| --------------------------------- | ----------------------------------------------------------------------------------------- |
| .codecov.yml | Measure test coverage with [codecov.io](https://app.codecov.io/gh/Lightning-AI/lightning) |
| .github/workflows/code-checks.yml | Check Python typing with [MyPy](https://mypy.readthedocs.io/en/stable/). |
-| .github/workflows/ci_schema.yml | Validate the syntax of workflow files. |
+| .github/workflows/ci-schema.yml | Validate the syntax of workflow files. |
## Others
-| workflow file | action |
-| -------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| .github/workflows/ci_dockers.yml | Build docker images used for testing in CI without pushing to the [Docker Hub](https://hub.docker.com/r/pytorchlightning/pytorch_lightning). Publishing these built images takes place in `.github/workflows/release-docker.yml` which only runs in master. |
-| .github/workflows/ci_pkg-install.yml | Test if pytorch-lightning is successfully installed using pip. |
-| .github/workflows/events-recurrent.yml | Terminate TPU jobs that live more than one hour to avoid possible resource exhaustion due to hangs. |
+| workflow file | action |
+| ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| .github/workflows/cicd-pytorch-dockers.yml | Build docker images used for testing in CI. If run on nightly schedule, push to the [Docker Hub](https://hub.docker.com/r/pytorchlightning/pytorch_lightning). |
+| .github/workflows/ci-pkg-install.yml | Test if pytorch-lightning is successfully installed using pip. |
+| .github/workflows/events-recurrent.yml | Terminate TPU jobs that live more than one hour to avoid possible resource exhaustion due to hangs. |
## Deployment
@@ -60,4 +60,4 @@
| .github/stale.yml | Close inactive issues/PRs sometimes after adding the "won't fix" label to them. |
| .github/workflows/probot-auto-cc.yml, .github/lightning-probot.yml | Notify maintainers of interest depending on labels added to an issue. We utilize lightning-probot forked from PyTorch’s probot. |
| .pre-commit-config.yaml | pre-commit.ci runs a set of linters and formatters, such as black, flake8 and isort. When formatting is applied, the bot pushes a commit with its change. This configuration is also used for running pre-commit locally. |
-| .github/workflows/ci_pr-gatekeeper.yml | Prevent PRs from merging into master without any Grid.ai employees’ approval. |
+| .github/workflows/ci-pr-gatekeeper.yml | Prevent PRs from merging into master without any Grid.ai employees’ approval. |
diff --git a/.github/workflows/ci-app_cloud_e2e_test.yml b/.github/workflows/ci-app-cloud-e2e-test.yml
similarity index 100%
rename from .github/workflows/ci-app_cloud_e2e_test.yml
rename to .github/workflows/ci-app-cloud-e2e-test.yml
diff --git a/.github/workflows/ci-app_examples.yml b/.github/workflows/ci-app-examples.yml
similarity index 100%
rename from .github/workflows/ci-app_examples.yml
rename to .github/workflows/ci-app-examples.yml
diff --git a/.github/workflows/ci-app_tests.yml b/.github/workflows/ci-app-tests.yml
similarity index 100%
rename from .github/workflows/ci-app_tests.yml
rename to .github/workflows/ci-app-tests.yml
diff --git a/.github/workflows/ci_pkg-install.yml b/.github/workflows/ci-pkg-install.yml
similarity index 100%
rename from .github/workflows/ci_pkg-install.yml
rename to .github/workflows/ci-pkg-install.yml
diff --git a/.github/workflows/ci_pr-gatekeeper.yml b/.github/workflows/ci-pr-gatekeeper.yml
similarity index 100%
rename from .github/workflows/ci_pr-gatekeeper.yml
rename to .github/workflows/ci-pr-gatekeeper.yml
diff --git a/.github/workflows/ci-pytorch_test-conda.yml b/.github/workflows/ci-pytorch-test-conda.yml
similarity index 100%
rename from .github/workflows/ci-pytorch_test-conda.yml
rename to .github/workflows/ci-pytorch-test-conda.yml
diff --git a/.github/workflows/ci-pytorch_test-full.yml b/.github/workflows/ci-pytorch-test-full.yml
similarity index 100%
rename from .github/workflows/ci-pytorch_test-full.yml
rename to .github/workflows/ci-pytorch-test-full.yml
diff --git a/.github/workflows/ci-pytorch_test-slow.yml b/.github/workflows/ci-pytorch-test-slow.yml
similarity index 100%
rename from .github/workflows/ci-pytorch_test-slow.yml
rename to .github/workflows/ci-pytorch-test-slow.yml
diff --git a/.github/workflows/ci_schema.yml b/.github/workflows/ci-schema.yml
similarity index 100%
rename from .github/workflows/ci_schema.yml
rename to .github/workflows/ci-schema.yml
diff --git a/.github/workflows/cicd-pytorch_dockers.yml b/.github/workflows/cicd-pytorch-dockers.yml
similarity index 100%
rename from .github/workflows/cicd-pytorch_dockers.yml
rename to .github/workflows/cicd-pytorch-dockers.yml
diff --git a/README.md b/README.md
index 2fef343425f17..9c03e3707ec24 100644
--- a/README.md
+++ b/README.md
@@ -80,21 +80,24 @@ ______________________________________________________________________
## Continuous Integration
-Lightning is rigorously tested across multiple GPUs, TPUs CPUs and against major Python and PyTorch versions.
+Lightning is rigorously tested across multiple CPUs, GPUs, TPUs, IPUs, and HPUs and against major Python and PyTorch versions.
Current build statuses
-| System / PyTorch ver. | 1.8 (LTS, min. req.) | 1.9 | 1.10 (latest) |
-| :------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
-| Linux py3.7 \[GPUs\*\*\] | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=6&branchName=master) | - | - |
-| Linux py3.7 \[TPUs\*\*\*\] | [![CircleCI](https://circleci.com/gh/Lightning-AI/lightning/tree/master.svg?style=svg)](https://circleci.com/gh/Lightning-AI/lightning/tree/master) | - | - |
-| Linux py3.8 (with Conda | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) |
-| Linux py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) |
-| OSX py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) |
-| Windows py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) |
+| System / PyTorch ver. | 1.9 | 1.10 | 1.12 (latest) |
+| :------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| Linux py3.7 \[GPUs\*\*\] | - | - | - |
+| Linux py3.7 \[TPUs\*\*\*\] | [![CircleCI](https://circleci.com/gh/Lightning-AI/lightning/tree/master.svg?style=svg)](https://circleci.com/gh/Lightning-AI/lightning/tree/master) | - | - |
+| Linux py3.8 \[IPUs\] | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=25&branchName=master) | - | - |
+| Linux py3.8 \[HPUs\] | - | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=26&branchName=master) | - |
+| Linux py3.8 (with Conda) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | - |
+| Linux py3.9 (with Conda) | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) |
+| Linux py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) |
+| OSX py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) |
+| Windows py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) |
- _\*\* tests run on two NVIDIA P100_
- _\*\*\* tests run on Google GKE TPUv2/3. TPU py3.7 means we support Colab and Kaggle env._
@@ -136,8 +139,8 @@ conda install pytorch-lightning -c conda-forge
The actual status of 1.7 \[stable\] is the following:
-[![Test PyTorch full](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-full.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-full.yml?query=branch%3Arelease%2Fpytorch)
-[![Test PyTorch with Conda](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-conda.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-conda.yml?query=branch%3Arelease%2Fpytorch)
+[![Test PyTorch full](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml?query=branch%3Arelease%2Fpytorch)
+[![Test PyTorch with Conda](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml?query=branch%3Arelease%2Fpytorch)
[![TPU tests](https://dl.circleci.com/status-badge/img/gh/Lightning-AI/lightning/tree/release%2Fpytorch.svg?style=shield)](https://dl.circleci.com/status-badge/redirect/gh/Lightning-AI/lightning/tree/release%2Fpytorch)
[![Check Docs](https://github.com/Lightning-AI/lightning/actions/workflows/docs-checks.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/docs-checks.yml?query=branch%3Arelease%2Fpytorch)
diff --git a/src/pytorch_lightning/README.md b/src/pytorch_lightning/README.md
index eb1a42730b5f0..b57aea6fae147 100644
--- a/src/pytorch_lightning/README.md
+++ b/src/pytorch_lightning/README.md
@@ -78,17 +78,17 @@ Lightning is rigorously tested across multiple CPUs, GPUs, TPUs, IPUs, and HPUs
-| System / PyTorch ver. | 1.9 | 1.10 | 1.12 (latest) |
-| :------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
-| Linux py3.7 \[GPUs\*\*\] | - | - | - |
-| Linux py3.7 \[TPUs\*\*\*\] | [![CircleCI](https://circleci.com/gh/Lightning-AI/lightning/tree/master.svg?style=svg)](https://circleci.com/gh/Lightning-AI/lightning/tree/master) | - | - |
-| Linux py3.8 \[IPUs\] | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=6&branchName=master) | - | - |
-| Linux py3.8 \[HPUs\] | - | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=6&branchName=master) | - |
-| Linux py3.8 (with Conda) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | - |
-| Linux py3.9 (with Conda) | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) |
-| Linux py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) |
-| OSX py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) |
-| Windows py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) |
+| System / PyTorch ver. | 1.9 | 1.10 | 1.12 (latest) |
+| :------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: |
+| Linux py3.7 \[GPUs\*\*\] | - | - | - |
+| Linux py3.7 \[TPUs\*\*\*\] | [![CircleCI](https://circleci.com/gh/Lightning-AI/lightning/tree/master.svg?style=svg)](https://circleci.com/gh/Lightning-AI/lightning/tree/master) | - | - |
+| Linux py3.8 \[IPUs\] | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=25&branchName=master) | - | - |
+| Linux py3.8 \[HPUs\] | - | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=26&branchName=master) | - |
+| Linux py3.8 (with Conda) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | - |
+| Linux py3.9 (with Conda) | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) |
+| Linux py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) |
+| OSX py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) |
+| Windows py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) |
- _\*\* tests run on two NVIDIA P100_
- _\*\*\* tests run on Google GKE TPUv2/3. TPU py3.7 means we support Colab and Kaggle env._
@@ -130,8 +130,8 @@ conda install pytorch-lightning -c conda-forge
The actual status of stable is the following:
-[![Test PyTorch full](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-full.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-full.yml)
-[![Test PyTorch with Conda](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-conda.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-conda.yml)
+[![Test PyTorch full](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml)
+[![Test PyTorch with Conda](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml)
[![GPU]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=24&branchName=release%2Fpytorch)
[![TPU](https://dl.circleci.com/status-badge/img/gh/Lightning-AI/lightning/tree/release%2Fpytorch.svg?style=svg)](https://dl.circleci.com/status-badge/redirect/gh/Lightning-AI/lightning/tree/release%2Fpytorch)
[![IPU]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=25&branchName=release%2Fpytorch)
From dfda3f384e020ab3955a2cf3fe29dcd831d8a969 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta
Date: Wed, 10 Aug 2022 18:02:54 +0900
Subject: [PATCH 27/59] CI: Update Windows version from 2019 to 2022 (#14129)
Update windows
---
.github/workflows/ci-app-examples.yml | 2 +-
.github/workflows/ci-app-tests.yml | 6 +++---
.github/workflows/ci-pkg-install.yml | 6 +++---
.github/workflows/ci-pytorch-test-full.yml | 2 +-
.github/workflows/ci-pytorch-test-slow.yml | 2 +-
5 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/.github/workflows/ci-app-examples.yml b/.github/workflows/ci-app-examples.yml
index ec8becd5f70d1..01570f59c2c77 100644
--- a/.github/workflows/ci-app-examples.yml
+++ b/.github/workflows/ci-app-examples.yml
@@ -17,7 +17,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- os: [ubuntu-20.04, macOS-11, windows-2019]
+ os: [ubuntu-20.04, macOS-11, windows-2022]
python-version: [3.8]
requires: ["oldest", "latest"]
diff --git a/.github/workflows/ci-app-tests.yml b/.github/workflows/ci-app-tests.yml
index 1678dab257301..fe3cc36dc16d3 100644
--- a/.github/workflows/ci-app-tests.yml
+++ b/.github/workflows/ci-app-tests.yml
@@ -21,7 +21,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- os: [ubuntu-20.04, macOS-11, windows-2019]
+ os: [ubuntu-20.04, macOS-11, windows-2022]
python-version: [3.8]
requires: ["oldest", "latest"]
@@ -126,7 +126,7 @@ jobs:
# - name: Clone Quick Start Example Repo
# uses: actions/checkout@v3
# # TODO: this needs to be git submodule
-# if: matrix.os == 'windows-2019' # because the install doesn't work on windows
+# if: matrix.os == 'windows-2022' # because the install doesn't work on windows
# with:
# repository: Lightning-AI/lightning-quick-start
# ref: 'main'
@@ -134,6 +134,6 @@ jobs:
#
# - name: Lightning Install quick-start
# shell: bash
-# if: matrix.os != 'windows-2019' # because the install doesn't work on windows
+# if: matrix.os != 'windows-2022' # because the install doesn't work on windows
# run: |
# python -m lightning install app lightning/quick-start -y
diff --git a/.github/workflows/ci-pkg-install.yml b/.github/workflows/ci-pkg-install.yml
index 342e027b07cfe..a9fdd36693a67 100644
--- a/.github/workflows/ci-pkg-install.yml
+++ b/.github/workflows/ci-pkg-install.yml
@@ -33,7 +33,7 @@ jobs:
fail-fast: true
max-parallel: 1
matrix:
- os: [ubuntu-20.04, macOS-11, windows-2019]
+ os: [ubuntu-20.04, macOS-11, windows-2022]
pkg: ["app", "pytorch"]
python-version: [3.8] # , 3.9
@@ -67,7 +67,7 @@ jobs:
fail-fast: false
# max-parallel: 1
matrix:
- os: [ubuntu-20.04, macOS-11, windows-2019]
+ os: [ubuntu-20.04, macOS-11, windows-2022]
pkg: ["", "lightning"]
python-version: [3.8] # , 3.9
@@ -100,7 +100,7 @@ jobs:
fail-fast: false
# max-parallel: 1
matrix:
- os: [ubuntu-20.04, macOS-11, windows-2019]
+ os: [ubuntu-20.04, macOS-11, windows-2022]
python-version: [3.8] # , 3.9
steps:
diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml
index 445707d340c4b..7409ce25a5128 100644
--- a/.github/workflows/ci-pytorch-test-full.yml
+++ b/.github/workflows/ci-pytorch-test-full.yml
@@ -20,7 +20,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- os: [ubuntu-20.04, windows-2019, macOS-11]
+ os: [ubuntu-20.04, windows-2022, macOS-11]
python-version: ["3.7", "3.10"] # minimum, maximum
requires: ["oldest", "latest"]
release: ["stable"]
diff --git a/.github/workflows/ci-pytorch-test-slow.yml b/.github/workflows/ci-pytorch-test-slow.yml
index b3756bbe8c2f7..36007d3311451 100644
--- a/.github/workflows/ci-pytorch-test-slow.yml
+++ b/.github/workflows/ci-pytorch-test-slow.yml
@@ -19,7 +19,7 @@ jobs:
strategy:
fail-fast: false
matrix:
- os: [ubuntu-20.04, windows-2019, macOS-11]
+ os: [ubuntu-20.04, windows-2022, macOS-11]
# same config as '.azure-pipelines/gpu-tests.yml'
python-version: ["3.7"]
pytorch-version: ["1.11"]
From dc8ff5ed2699b2ab9d21ee1ea6270191e290f620 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 10 Aug 2022 11:23:20 +0200
Subject: [PATCH 28/59] Fix device placement when `.cuda()` called without
specifying index (#14128)
---
src/pytorch_lightning/CHANGELOG.md | 3 +++
.../core/mixins/device_dtype_mixin.py | 10 ++++----
.../utilities/test_dtype_device_mixin.py | 24 ++++++++++++++++++-
3 files changed, 32 insertions(+), 5 deletions(-)
diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
index 8852367a116f6..b405665b9df88 100644
--- a/src/pytorch_lightning/CHANGELOG.md
+++ b/src/pytorch_lightning/CHANGELOG.md
@@ -82,6 +82,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed a bug that caused `ddp_find_unused_parameters` to be set `False`, whereas the intended default is `True` ([#14095](https://github.com/Lightning-AI/lightning/pull/14095))
+- Fixed the device placement when `LightningModule.cuda()` gets called without specifying a device index and the current cuda device was not 0 ([#14128](https://github.com/Lightning-AI/lightning/pull/14128))
+
+
## [1.7.0] - 2022-08-02
### Added
diff --git a/src/pytorch_lightning/core/mixins/device_dtype_mixin.py b/src/pytorch_lightning/core/mixins/device_dtype_mixin.py
index 62e81e4839da6..2916d8b07cb4e 100644
--- a/src/pytorch_lightning/core/mixins/device_dtype_mixin.py
+++ b/src/pytorch_lightning/core/mixins/device_dtype_mixin.py
@@ -116,14 +116,16 @@ def cuda(self, device: Optional[Union[torch.device, int]] = None) -> Self: # ty
while being optimized.
Arguments:
- device: if specified, all parameters will be
- copied to that device
+ device: If specified, all parameters will be copied to that device. If `None`, the current CUDA device
+ index will be used.
Returns:
Module: self
"""
- if device is None or isinstance(device, int):
- device = torch.device("cuda", index=(device or 0))
+ if device is None:
+ device = torch.device("cuda", torch.cuda.current_device())
+ elif isinstance(device, int):
+ device = torch.device("cuda", index=device)
self.__update_properties(device=device)
return super().cuda(device=device)
diff --git a/tests/tests_pytorch/utilities/test_dtype_device_mixin.py b/tests/tests_pytorch/utilities/test_dtype_device_mixin.py
index 38f72b555d52d..7c17b3d9f7642 100644
--- a/tests/tests_pytorch/utilities/test_dtype_device_mixin.py
+++ b/tests/tests_pytorch/utilities/test_dtype_device_mixin.py
@@ -113,7 +113,7 @@ def test_submodules_multi_gpu_ddp_spawn(tmpdir):
],
)
@RunIf(min_cuda_gpus=1)
-def test_gpu_cuda_device(device):
+def test_cuda_device(device):
model = TopModule()
model.cuda(device)
@@ -122,3 +122,25 @@ def test_gpu_cuda_device(device):
assert device.type == "cuda"
assert device.index is not None
assert device.index == torch.cuda.current_device()
+
+
+@RunIf(min_cuda_gpus=2)
+def test_cuda_current_device():
+ """Test that calling .cuda() moves the model to the correct device and respects current cuda device setting."""
+
+ class CudaModule(DeviceDtypeModuleMixin):
+ def __init__(self):
+ super().__init__()
+ self.layer = nn.Linear(1, 1)
+
+ model = CudaModule()
+
+ torch.cuda.set_device(0)
+ model.cuda(1)
+ assert model.device == torch.device("cuda", 1)
+ assert model.layer.weight.device == torch.device("cuda", 1)
+
+ torch.cuda.set_device(1)
+ model.cuda() # model is already on device 1, and calling .cuda() without device index should not move model
+ assert model.device == torch.device("cuda", 1)
+ assert model.layer.weight.device == torch.device("cuda", 1)
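A standalone sketch of the behavior this patch fixes, assuming a machine with at least two CUDA devices: the old `device or 0` branch collapsed both `None` and `0` to index 0, while the new logic defers to `torch.cuda.current_device()`.

```python
import torch


def old_resolve(device):
    # pre-patch logic: `device or 0` maps None (and 0) to index 0,
    # ignoring any device selected via torch.cuda.set_device()
    if device is None or isinstance(device, int):
        device = torch.device("cuda", index=(device or 0))
    return device


def new_resolve(device):
    # patched logic: None resolves to the current CUDA device
    if device is None:
        device = torch.device("cuda", torch.cuda.current_device())
    elif isinstance(device, int):
        device = torch.device("cuda", index=device)
    return device


if torch.cuda.device_count() >= 2:
    torch.cuda.set_device(1)
    print(old_resolve(None))  # cuda:0 -- wrong, ignores the current device
    print(new_resolve(None))  # cuda:1 -- respects torch.cuda.set_device(1)
```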
From ddb476d334f501a655586ae3809587e09f71b9c8 Mon Sep 17 00:00:00 2001
From: Adam Bobowski <100693297+adam-lightning@users.noreply.github.com>
Date: Wed, 10 Aug 2022 11:48:06 +0200
Subject: [PATCH 29/59] [App] Application logs in CLI (#13634)
---
src/lightning_app/CHANGELOG.md | 2 +
src/lightning_app/cli/lightning_cli.py | 89 +++++++++++++
src/lightning_app/testing/testing.py | 2 +-
src/lightning_app/utilities/app_logs.py | 125 ++++++++++++++++++
.../utilities/logs_socket_api.py | 95 +++++++++++++
tests/tests_app/cli/test_cmd_show_logs.py | 61 +++++++++
tests/tests_app_examples/test_boring_app.py | 15 +++
.../test_collect_failures.py | 1 +
tests/tests_app_examples/test_commands.py | 1 +
.../test_custom_work_dependencies.py | 2 +-
tests/tests_app_examples/test_drive.py | 1 +
tests/tests_app_examples/test_idle_timeout.py | 1 +
tests/tests_app_examples/test_payload.py | 2 +-
tests/tests_app_examples/test_quick_start.py | 2 +-
.../test_template_react_ui.py | 1 +
.../test_template_streamlit_ui.py | 1 +
tests/tests_app_examples/test_v0_app.py | 1 +
17 files changed, 398 insertions(+), 4 deletions(-)
create mode 100644 src/lightning_app/utilities/app_logs.py
create mode 100644 src/lightning_app/utilities/logs_socket_api.py
create mode 100644 tests/tests_app/cli/test_cmd_show_logs.py
diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md
index 78a4e370e76ee..ba8cdd796c5bb 100644
--- a/src/lightning_app/CHANGELOG.md
+++ b/src/lightning_app/CHANGELOG.md
@@ -13,6 +13,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Add support to run Lightning apps on Lightning AI BYOC clusters ([#13894](https://github.com/Lightning-AI/lightning/pull/13894))
- Add support for listing Lightning AI apps ([#13987](https://github.com/Lightning-AI/lightning/pull/13987))
- Adds `LightningTrainingComponent`. `LightningTrainingComponent` orchestrates multi-node training in the cloud ([#13830](https://github.com/Lightning-AI/lightning/pull/13830))
+- Add support for printing application logs using CLI `lightning show logs [components]` ([#13634](https://github.com/Lightning-AI/lightning/pull/13634))
+
### Changed
diff --git a/src/lightning_app/cli/lightning_cli.py b/src/lightning_app/cli/lightning_cli.py
index fb4c40330dfd9..45c80d4dcc357 100644
--- a/src/lightning_app/cli/lightning_cli.py
+++ b/src/lightning_app/cli/lightning_cli.py
@@ -8,7 +8,9 @@
import click
import requests
+import rich
from requests.exceptions import ConnectionError
+from rich.color import ANSI_COLOR_NAMES
from lightning_app import __version__ as ver
from lightning_app.cli import cmd_init, cmd_install, cmd_pl_init, cmd_react_ui_init
@@ -18,12 +20,15 @@
from lightning_app.core.constants import get_lightning_cloud_url, LOCAL_LAUNCH_ADMIN_VIEW
from lightning_app.runners.runtime import dispatch
from lightning_app.runners.runtime_type import RuntimeType
+from lightning_app.utilities.app_logs import _app_logs_reader
from lightning_app.utilities.cli_helpers import (
_format_input_env_variables,
_retrieve_application_url_and_available_commands,
)
+from lightning_app.utilities.cloud import _get_project
from lightning_app.utilities.install_components import register_all_external_components
from lightning_app.utilities.login import Auth
+from lightning_app.utilities.network import LightningClient
from lightning_app.utilities.state import headers_for
logger = logging.getLogger(__name__)
@@ -50,9 +55,93 @@ def main():
@click.version_option(ver)
def _main():
register_all_external_components()
+
+
+@_main.group()
+def show():
+ """Show given resource."""
pass
+@show.command()
+@click.argument("app_name", required=False)
+@click.argument("components", nargs=-1, required=False)
+@click.option("-f", "--follow", required=False, is_flag=True, help="Wait for new logs, to exit use CTRL+C.")
+def logs(app_name: str, components: List[str], follow: bool) -> None:
+ """Show cloud application logs. By default prints logs for all currently available components.
+
+ Example uses:
+
+ Print all application logs:
+
+ $ lightning show logs my-application
+
+
+ Print logs only from the flow (no work):
+
+ $ lightning show logs my-application flow
+
+
+ Print logs only from selected works:
+
+ $ lightning show logs my-application root.work_a root.work_b
+ """
+
+ client = LightningClient()
+ project = _get_project(client)
+
+ apps = {
+ app.name: app
+ for app in client.lightningapp_instance_service_list_lightningapp_instances(project.project_id).lightningapps
+ }
+
+ if not apps:
+ raise click.ClickException(
+ "You don't have any application in the cloud. Please, run an application first with `--cloud`."
+ )
+
+ if not app_name:
+ raise click.ClickException(
+ f"You have not specified any Lightning App. Please select one of available: [{', '.join(apps.keys())}]"
+ )
+
+ if app_name not in apps:
+ raise click.ClickException(
+ f"The Lightning App '{app_name}' does not exist. Please select one of following: [{', '.join(apps.keys())}]"
+ )
+
+ # Fetch all lightning works from given application
+    # The 'flow' component is implicit - there is only one per app and it is
+    # not listed by the lightningwork API - so we add it to the list directly
+ works = client.lightningwork_service_list_lightningwork(
+ project_id=project.project_id, app_id=apps[app_name].id
+ ).lightningworks
+ app_component_names = ["flow"] + [f.name for f in apps[app_name].spec.flow_servers] + [w.name for w in works]
+
+ if not components:
+ components = app_component_names
+
+ for component in components:
+ if component not in app_component_names:
+ raise click.ClickException(f"Component '{component}' does not exist in app {app_name}.")
+
+ log_reader = _app_logs_reader(
+ client=client,
+ project_id=project.project_id,
+ app_id=apps[app_name].id,
+ component_names=components,
+ follow=follow,
+ )
+
+ rich_colors = list(ANSI_COLOR_NAMES)
+ colors = {c: rich_colors[i + 1] for i, c in enumerate(components)}
+
+ for component_name, log_event in log_reader:
+ date = log_event.timestamp.strftime("%m/%d/%Y %H:%M:%S")
+ color = colors[component_name]
+ rich.print(f"[{color}]{component_name}[/{color}] {date} {log_event.message}")
+
+
@_main.command()
def login():
"""Log in to your Lightning.ai account."""
diff --git a/src/lightning_app/testing/testing.py b/src/lightning_app/testing/testing.py
index e1cc2e180dab5..74d57db38c427 100644
--- a/src/lightning_app/testing/testing.py
+++ b/src/lightning_app/testing/testing.py
@@ -318,7 +318,7 @@ def fetch_logs() -> str:
)
try:
- yield admin_page, view_page, fetch_logs
+ yield admin_page, view_page, fetch_logs, name
except KeyboardInterrupt:
pass
finally:
diff --git a/src/lightning_app/utilities/app_logs.py b/src/lightning_app/utilities/app_logs.py
new file mode 100644
index 0000000000000..4a7af9b5c5143
--- /dev/null
+++ b/src/lightning_app/utilities/app_logs.py
@@ -0,0 +1,125 @@
+import json
+import queue
+import sys
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+from json import JSONDecodeError
+from threading import Thread
+from typing import Iterator, List, Optional, Tuple
+
+import dateutil.parser
+from websocket import WebSocketApp
+
+from lightning_app.utilities.logs_socket_api import _LightningLogsSocketAPI
+from lightning_app.utilities.network import LightningClient
+
+
+@dataclass
+class _LogEventLabels:
+ app: str
+ container: str
+ filename: str
+ job: str
+ namespace: str
+ node_name: str
+ pod: str
+ stream: Optional[str] = None
+
+
+@dataclass
+class _LogEvent:
+ message: str
+ timestamp: datetime
+ labels: _LogEventLabels
+
+
+def _push_logevents_to_read_queue_callback(component_name: str, read_queue: queue.PriorityQueue):
+ """Pushes _LogEvents from websocket to read_queue.
+
+ Returns callback function used with `on_message_callback` of websocket.WebSocketApp.
+ """
+
+ def callback(ws_app: WebSocketApp, msg: str):
+        # We assume the API contract holds: each message is a well-formed JSON log event
+ event_dict = json.loads(msg)
+ labels = _LogEventLabels(**event_dict["labels"])
+ if "message" in event_dict:
+ event = _LogEvent(
+ message=event_dict["message"],
+ timestamp=dateutil.parser.isoparse(event_dict["timestamp"]),
+ labels=labels,
+ )
+ read_queue.put((event.timestamp, component_name, event))
+
+ return callback
+
+
+def _error_callback(ws_app: WebSocketApp, error: Exception):
+ errors = {
+ KeyError: "Malformed log message, missing key",
+ JSONDecodeError: "Malformed log message",
+ TypeError: "Malformed log format",
+ ValueError: "Malformed date format",
+ }
+ print(f"Error while reading logs ({errors.get(type(error), 'Unknown')})", file=sys.stderr)
+ ws_app.close()
+
+
+def _app_logs_reader(
+ client: LightningClient, project_id: str, app_id: str, component_names: List[str], follow: bool
+) -> Iterator[Tuple[str, _LogEvent]]:
+
+ read_queue = queue.PriorityQueue()
+ logs_api_client = _LightningLogsSocketAPI(client.api_client)
+
+ # We will use a socket per component
+ log_sockets = [
+ logs_api_client.create_lightning_logs_socket(
+ project_id=project_id,
+ app_id=app_id,
+ component=component_name,
+ on_message_callback=_push_logevents_to_read_queue_callback(component_name, read_queue),
+ on_error_callback=_error_callback,
+ )
+ for component_name in component_names
+ ]
+
+    # And each socket runs on a separate thread, pushing log events to the read queue
+ # run_forever() will run until we close() the connection from outside
+ log_threads = [Thread(target=work.run_forever) for work in log_sockets]
+
+    # Establish connections and begin pushing logs to the read queue
+ for th in log_threads:
+ th.start()
+
+ user_log_start = "<<< BEGIN USER_RUN_FLOW SECTION >>>"
+ start_timestamp = None
+
+ # Print logs from queue when log event is available
+ try:
+ while True:
+ _, component_name, log_event = read_queue.get(timeout=None if follow else 1.0)
+ log_event: _LogEvent
+
+ if user_log_start in log_event.message:
+ start_timestamp = log_event.timestamp + timedelta(seconds=0.5)
+
+ if start_timestamp and log_event.timestamp > start_timestamp:
+ yield component_name, log_event
+
+ except queue.Empty:
+        # Empty is raised by queue.get if the timeout is reached (the follow=False case)
+ pass
+
+ except KeyboardInterrupt:
+        # User pressed CTRL+C to exit, we should respect that
+ pass
+
+ finally:
+        # Close connections - this makes run_forever() return, so the threads finish as well
+ for socket in log_sockets:
+ socket.close()
+
+        # Because all sockets were closed, we can just wait for the threads to finish.
+ for th in log_threads:
+ th.join()
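The reader above merges several concurrent log streams by pushing `(timestamp, component, event)` tuples into a `queue.PriorityQueue`, so the consumer pops them globally ordered by timestamp. A minimal sketch of that merge pattern, with hypothetical in-memory producers standing in for the websocket threads:

```python
import queue
from datetime import datetime, timedelta

read_queue = queue.PriorityQueue()
t0 = datetime(2022, 8, 10, 12, 0, 0)

# Producers (one per component socket) may push out of order across components...
read_queue.put((t0 + timedelta(seconds=2), "root.work_a", "work_a line"))
read_queue.put((t0 + timedelta(seconds=1), "flow", "flow line"))
read_queue.put((t0 + timedelta(seconds=3), "root.work_b", "work_b line"))

# ...but the consumer sees them sorted by the tuple's first element, the timestamp.
while not read_queue.empty():
    timestamp, component, message = read_queue.get()
    print(timestamp.isoformat(), component, message)
```

In the real reader, events are additionally filtered: nothing is yielded until the `<<< BEGIN USER_RUN_FLOW SECTION >>>` sentinel has been seen, and only events more than half a second after it are printed.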
diff --git a/src/lightning_app/utilities/logs_socket_api.py b/src/lightning_app/utilities/logs_socket_api.py
new file mode 100644
index 0000000000000..0ab9a5c24f3e5
--- /dev/null
+++ b/src/lightning_app/utilities/logs_socket_api.py
@@ -0,0 +1,95 @@
+from typing import Callable, Optional
+from urllib.parse import urlparse
+
+from lightning_cloud.openapi import ApiClient, AuthServiceApi, V1LoginRequest
+from websocket import WebSocketApp
+
+from lightning_app.utilities.login import Auth
+
+
+class _LightningLogsSocketAPI:
+ def __init__(self, api_client: ApiClient):
+ self.api_client = api_client
+ self._auth = Auth()
+ self._auth.authenticate()
+ self._auth_service = AuthServiceApi(api_client)
+
+ def _get_api_token(self) -> str:
+ token_resp = self._auth_service.auth_service_login(
+ body=V1LoginRequest(
+ username=self._auth.username,
+ api_key=self._auth.api_key,
+ )
+ )
+ return token_resp.token
+
+ @staticmethod
+ def _socket_url(host: str, project_id: str, app_id: str, token: str, component: str) -> str:
+ return (
+ f"wss://{host}/v1/projects/{project_id}/appinstances/{app_id}/logs?"
+ f"token={token}&component={component}&follow=true"
+ )
+
+ def create_lightning_logs_socket(
+ self,
+ project_id: str,
+ app_id: str,
+ component: str,
+ on_message_callback: Callable[[WebSocketApp, str], None],
+ on_error_callback: Optional[Callable[[Exception, str], None]] = None,
+ ) -> WebSocketApp:
+ """Creates and returns WebSocketApp to listen to lightning app logs.
+
+ .. code-block:: python
+ # Synchronous reading, run_forever() is blocking
+
+
+ def print_log_msg(ws_app, msg):
+ print(msg)
+
+
+ flow_logs_socket = client.create_lightning_logs_socket("project_id", "app_id", "flow", print_log_msg)
+            flow_logs_socket.run_forever()
+
+ .. code-block:: python
+ # Asynchronous reading (with Threads)
+
+
+ def print_log_msg(ws_app, msg):
+ print(msg)
+
+
+ flow_logs_socket = client.create_lightning_logs_socket("project_id", "app_id", "flow", print_log_msg)
+ work_logs_socket = client.create_lightning_logs_socket("project_id", "app_id", "work_1", print_log_msg)
+
+ flow_logs_thread = Thread(target=flow_logs_socket.run_forever)
+ work_logs_thread = Thread(target=work_logs_socket.run_forever)
+
+ flow_logs_thread.start()
+ work_logs_thread.start()
+ # .......
+
+ flow_logs_socket.close()
+            work_logs_socket.close()
+
+ Arguments:
+ project_id: Project ID.
+ app_id: Application ID.
+            component: Component name, e.g. "flow".
+            on_message_callback: Callback invoked when a message is received.
+            on_error_callback: Callback invoked when an error occurs.
+
+ Returns:
+            The WebSocketApp for the requested logs socket
+ """
+ _token = self._get_api_token()
+ clean_ws_host = urlparse(self.api_client.configuration.host).netloc
+ socket_url = self._socket_url(
+ host=clean_ws_host,
+ project_id=project_id,
+ app_id=app_id,
+ token=_token,
+ component=component,
+ )
+
+ return WebSocketApp(socket_url, on_message=on_message_callback, on_error=on_error_callback)
diff --git a/tests/tests_app/cli/test_cmd_show_logs.py b/tests/tests_app/cli/test_cmd_show_logs.py
new file mode 100644
index 0000000000000..0dc06025151fa
--- /dev/null
+++ b/tests/tests_app/cli/test_cmd_show_logs.py
@@ -0,0 +1,61 @@
+from unittest import mock
+
+from click.testing import CliRunner
+
+from lightning_app.cli.lightning_cli import logs
+
+
+@mock.patch("lightning_app.cli.lightning_cli.LightningClient")
+@mock.patch("lightning_app.cli.lightning_cli._get_project")
+def test_show_logs_errors(project, client):
+ """Test that the CLI prints the errors for the show logs command."""
+
+ runner = CliRunner()
+
+ # Response prep
+ app = mock.MagicMock()
+ app.name = "MyFakeApp"
+ work = mock.MagicMock()
+ work.name = "MyFakeWork"
+ flow = mock.MagicMock()
+ flow.name = "MyFakeFlow"
+
+ # No apps ever run
+ apps = {}
+ client.return_value.lightningapp_instance_service_list_lightningapp_instances.return_value.lightningapps = apps
+
+ result = runner.invoke(logs, ["NonExistentApp"])
+
+ assert result.exit_code == 1
+ assert "Error: You don't have any application in the cloud" in result.output
+
+ # App not specified
+ apps = {app}
+ client.return_value.lightningapp_instance_service_list_lightningapp_instances.return_value.lightningapps = apps
+
+ result = runner.invoke(logs)
+
+ assert result.exit_code == 1
+ assert "Please select one of available: [MyFakeApp]" in str(result.output)
+
+    # App does not exist
+ apps = {app}
+ client.return_value.lightningapp_instance_service_list_lightningapp_instances.return_value.lightningapps = apps
+
+ result = runner.invoke(logs, ["ThisAppDoesNotExist"])
+
+ assert result.exit_code == 1
+ assert "The Lightning App 'ThisAppDoesNotExist' does not exist." in str(result.output)
+
+ # Component does not exist
+ apps = {app}
+ works = {work}
+ flows = {flow}
+ client.return_value.lightningapp_instance_service_list_lightningapp_instances.return_value.lightningapps = apps
+ client.return_value.lightningwork_service_list_lightningwork.return_value.lightningworks = works
+ app.spec.flow_servers = flows
+
+ result = runner.invoke(logs, ["MyFakeApp", "NonExistentComponent"])
+
+ assert result.exit_code == 1
+ assert "Component 'NonExistentComponent' does not exist in app MyFakeApp." in result.output
diff --git a/tests/tests_app_examples/test_boring_app.py b/tests/tests_app_examples/test_boring_app.py
index 1f681260de5c2..f8143b1db1a88 100644
--- a/tests/tests_app_examples/test_boring_app.py
+++ b/tests/tests_app_examples/test_boring_app.py
@@ -1,8 +1,10 @@
import os
import pytest
+from click.testing import CliRunner
from tests_app import _PROJECT_ROOT
+from lightning_app.cli.lightning_cli import logs
from lightning_app.testing.testing import run_app_in_cloud, wait_for
@@ -12,6 +14,7 @@ def test_boring_app_example_cloud() -> None:
_,
view_page,
_,
+ name,
):
def check_hello_there(*_, **__):
@@ -21,3 +24,15 @@ def check_hello_there(*_, **__):
return True
wait_for(view_page, check_hello_there)
+
+ runner = CliRunner()
+ result = runner.invoke(logs, [name])
+ lines = result.output.splitlines()
+
+ assert result.exit_code == 0
+ assert result.exception is None
+ assert len(lines) > 1, result.output
+    # We know that at some point we need to install lightning, so we check for that
+ assert any(
+ "Successfully built lightning" in line for line in lines
+ ), f"Did not find logs with lightning installation: {result.output}"
diff --git a/tests/tests_app_examples/test_collect_failures.py b/tests/tests_app_examples/test_collect_failures.py
index f263ebb1a9f58..c149211e10774 100644
--- a/tests/tests_app_examples/test_collect_failures.py
+++ b/tests/tests_app_examples/test_collect_failures.py
@@ -26,6 +26,7 @@ def test_collect_failures_example_cloud() -> None:
_,
_,
fetch_logs,
+ _,
):
last_found_log_index = -1
while len(expected_logs) != 0:
diff --git a/tests/tests_app_examples/test_commands.py b/tests/tests_app_examples/test_commands.py
index 5116b1b9d54bb..266f0305c7604 100644
--- a/tests/tests_app_examples/test_commands.py
+++ b/tests/tests_app_examples/test_commands.py
@@ -16,6 +16,7 @@ def test_commands_example_cloud() -> None:
admin_page,
_,
fetch_logs,
+ _,
):
app_id = admin_page.url.split("/")[-1]
cmd = f"lightning trigger_with_client_command --name=something --app_id {app_id}"
diff --git a/tests/tests_app_examples/test_custom_work_dependencies.py b/tests/tests_app_examples/test_custom_work_dependencies.py
index 8390233e2eee3..d7c9db5ef610a 100644
--- a/tests/tests_app_examples/test_custom_work_dependencies.py
+++ b/tests/tests_app_examples/test_custom_work_dependencies.py
@@ -13,7 +13,7 @@ def test_custom_work_dependencies_example_cloud() -> None:
with run_app_in_cloud(
os.path.join(_PROJECT_ROOT, "tests/tests_app_examples/custom_work_dependencies/"),
app_name="app.py",
- ) as (_, _, fetch_logs):
+ ) as (_, _, fetch_logs, _):
has_logs = False
while not has_logs:
for log in fetch_logs():
diff --git a/tests/tests_app_examples/test_drive.py b/tests/tests_app_examples/test_drive.py
index 9cebca9cf1072..14efc3458716e 100644
--- a/tests/tests_app_examples/test_drive.py
+++ b/tests/tests_app_examples/test_drive.py
@@ -13,6 +13,7 @@ def test_drive_example_cloud() -> None:
_,
view_page,
fetch_logs,
+ _,
):
has_logs = False
diff --git a/tests/tests_app_examples/test_idle_timeout.py b/tests/tests_app_examples/test_idle_timeout.py
index fb58a83aefc93..a39ae3f693f7a 100644
--- a/tests/tests_app_examples/test_idle_timeout.py
+++ b/tests/tests_app_examples/test_idle_timeout.py
@@ -13,6 +13,7 @@ def test_idle_timeout_example_cloud() -> None:
_,
_,
fetch_logs,
+ _,
):
has_logs = False
while not has_logs:
diff --git a/tests/tests_app_examples/test_payload.py b/tests/tests_app_examples/test_payload.py
index 28d2391c18a2a..58fc28a4a8d3c 100644
--- a/tests/tests_app_examples/test_payload.py
+++ b/tests/tests_app_examples/test_payload.py
@@ -9,7 +9,7 @@
@pytest.mark.cloud
def test_payload_example_cloud() -> None:
- with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "examples/app_payload")) as (_, _, fetch_logs):
+ with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "examples/app_payload")) as (_, _, fetch_logs, _):
has_logs = False
while not has_logs:
diff --git a/tests/tests_app_examples/test_quick_start.py b/tests/tests_app_examples/test_quick_start.py
index 9db693a5dc3d6..454c1084ca1bb 100644
--- a/tests/tests_app_examples/test_quick_start.py
+++ b/tests/tests_app_examples/test_quick_start.py
@@ -51,7 +51,7 @@ def test_quick_start_example(caplog, monkeypatch):
@pytest.mark.cloud
def test_quick_start_example_cloud() -> None:
- with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "lightning-quick-start/")) as (_, view_page, _):
+ with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "lightning-quick-start/")) as (_, view_page, _, _):
def click_gradio_demo(*_, **__):
button = view_page.locator('button:has-text("Interactive demo")')
diff --git a/tests/tests_app_examples/test_template_react_ui.py b/tests/tests_app_examples/test_template_react_ui.py
index 2e348035fe6e5..4b4588d2397e5 100644
--- a/tests/tests_app_examples/test_template_react_ui.py
+++ b/tests/tests_app_examples/test_template_react_ui.py
@@ -14,6 +14,7 @@ def test_template_react_ui_example_cloud() -> None:
_,
view_page,
fetch_logs,
+ _,
):
def click_button(*_, **__):
diff --git a/tests/tests_app_examples/test_template_streamlit_ui.py b/tests/tests_app_examples/test_template_streamlit_ui.py
index a8ba93794f2a0..e2c33305298f7 100644
--- a/tests/tests_app_examples/test_template_streamlit_ui.py
+++ b/tests/tests_app_examples/test_template_streamlit_ui.py
@@ -14,6 +14,7 @@ def test_template_streamlit_ui_example_cloud() -> None:
_,
view_page,
fetch_logs,
+ _,
):
def click_button(*_, **__):
diff --git a/tests/tests_app_examples/test_v0_app.py b/tests/tests_app_examples/test_v0_app.py
index d34a92d6102f8..acc9e285c4d79 100644
--- a/tests/tests_app_examples/test_v0_app.py
+++ b/tests/tests_app_examples/test_v0_app.py
@@ -74,5 +74,6 @@ def test_v0_app_example_cloud() -> None:
_,
view_page,
fetch_logs,
+ _,
):
run_v0_app(fetch_logs, view_page)
From d5f35ece72fd253adeb8e9947fd9be4a5992f8f8 Mon Sep 17 00:00:00 2001
From: Akihiro Nitta
Date: Wed, 10 Aug 2022 19:37:50 +0900
Subject: [PATCH 30/59] CI/CD: Add CUDA version to docker image tags (#13831)
* append cuda version to tags
* revertme: push to hub
* Update docker readme
* Build base-conda-py3.9-torch1.12-cuda11.3.1
* Use new images in conda tests
* revertme: push to hub
* Revert "revertme: push to hub"
This reverts commit 0f7d534b2ae41e4bd227961a929c333c88e35f59.
* Revert "revertme: push to hub"
This reverts commit 46a05fccbb9b596aa98d5d68424917b5811c5b4f.
* Run conda if workflow edited
* Run gpu testing if workflow edited
* Use new tags in release/Dockerfile
* Build base-cuda and PL release images with all combinations
* Update release docker
* Update conda from py3.9-torch1.12 to py3.10-torch.1.12
* Fix ubuntu version
* Revert conda
* revertme: push to hub
* Don't build Python 3.10 for now...
* Fix pl release builder
* does updating the version contribute to the error? https://github.com/docker/buildx/issues/456
* Update actions' versions
* Update slack user to notify
* Don't use 11.6.0 to avoid bagua incompatibility
* Don't use 11.1, and use 11.1.1
* Update .github/workflows/ci-pytorch_test-conda.yml
Co-authored-by: Luca Medeiros <67411094+luca-medeiros@users.noreply.github.com>
* Update trigger
* Ignore artifacts from tutorials
* Trim docker images to distribute
* Add an image for tutorials
* Update conda image 3.8x1.10
* Try different conda variants
* No need to set cuda for conda jobs
* Update who to notify ipu failure
* Don't push
* update filename
Co-authored-by: Luca Medeiros <67411094+luca-medeiros@users.noreply.github.com>
---
.azure/gpu-benchmark.yml | 2 +-
.azure/gpu-tests.yml | 4 +-
.github/workflows/ci-pytorch-test-conda.yml | 4 +-
.github/workflows/cicd-pytorch-dockers.yml | 80 +++++++++++----------
.github/workflows/release-docker.yml | 31 +++++---
.gitignore | 6 ++
dockers/README.md | 45 +++---------
dockers/release/Dockerfile | 3 +-
8 files changed, 87 insertions(+), 88 deletions(-)
diff --git a/.azure/gpu-benchmark.yml b/.azure/gpu-benchmark.yml
index ac5ca6f60a6b4..0de590f2c54a6 100644
--- a/.azure/gpu-benchmark.yml
+++ b/.azure/gpu-benchmark.yml
@@ -28,7 +28,7 @@ jobs:
cancelTimeoutInMinutes: "2"
pool: azure-jirka-spot
container:
- image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12"
+ image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.3.1"
options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g"
workspace:
clean: all
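The new image tags encode all three versions. A trivial sketch of the naming scheme, using values from the matrix in this patch:

```python
python_version, pytorch_version, cuda_version = "3.9", "1.12", "11.3.1"
tag = f"base-cuda-py{python_version}-torch{pytorch_version}-cuda{cuda_version}"
print(f"pytorchlightning/pytorch_lightning:{tag}")
# pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.3.1
```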
diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml
index f37c17613affc..68ba6974a3527 100644
--- a/.azure/gpu-tests.yml
+++ b/.azure/gpu-tests.yml
@@ -26,7 +26,7 @@ jobs:
strategy:
matrix:
'PyTorch - stable':
- image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12"
+ image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.3.1"
# how long to run the job before automatically cancelling
timeoutInMinutes: "80"
# how much time to give 'run always even if cancelled tasks' before stopping them
@@ -44,7 +44,7 @@ jobs:
- bash: |
CHANGED_FILES=$(git diff --name-status origin/master -- . | awk '{print $2}')
- FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.azure/*'
+ FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.azure/gpu-tests.yml'
echo $CHANGED_FILES > changed_files.txt
MATCHES=$(cat changed_files.txt | grep -E $FILTER)
echo $MATCHES
diff --git a/.github/workflows/ci-pytorch-test-conda.yml b/.github/workflows/ci-pytorch-test-conda.yml
index 777ec2af759a0..2bbdb699c2c1e 100644
--- a/.github/workflows/ci-pytorch-test-conda.yml
+++ b/.github/workflows/ci-pytorch-test-conda.yml
@@ -22,13 +22,11 @@ jobs:
strategy:
fail-fast: false
matrix:
- # nightly: add when there's a release candidate
include:
- {python-version: "3.8", pytorch-version: "1.9"}
- {python-version: "3.8", pytorch-version: "1.10"}
- {python-version: "3.9", pytorch-version: "1.11"}
- {python-version: "3.9", pytorch-version: "1.12"}
-
timeout-minutes: 30
steps:
@@ -45,7 +43,7 @@ jobs:
id: skip
shell: bash -l {0}
run: |
- FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*'
+ FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.github/workflows/ci-pytorch-test-conda.yml'
echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt
MATCHES=$(cat changed_files.txt | grep -E $FILTER)
echo $MATCHES
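Both filter tweaks above (in `.azure/gpu-tests.yml` and this conda workflow) narrow the trigger so that editing a workflow file re-runs only its own job. A rough Python equivalent of the `grep -E $FILTER` check, with a hypothetical changed-file list:

```python
import re

# mirrors the FILTER used by the conda workflow's grep -E
FILTER = (
    r"src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch"
    r"|examples/pl_*|.github/workflows/ci-pytorch-test-conda.yml"
)

changed_files = [  # hypothetical output of the changed-files step
    ".github/workflows/ci-pytorch-test-conda.yml",
    "docs/source-pytorch/index.rst",
]

matches = [f for f in changed_files if re.search(FILTER, f)]
print(matches)  # non-empty -> the job proceeds instead of being skipped
```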
diff --git a/.github/workflows/cicd-pytorch-dockers.yml b/.github/workflows/cicd-pytorch-dockers.yml
index a6ba2ac4aa5f4..84051cafd82d8 100644
--- a/.github/workflows/cicd-pytorch-dockers.yml
+++ b/.github/workflows/cicd-pytorch-dockers.yml
@@ -29,17 +29,22 @@ jobs:
strategy:
fail-fast: false
matrix:
- # the config used in '.azure-pipelines/gpu-tests.yml' since the Dockerfile uses the cuda image
- python_version: ["3.9"]
- pytorch_version: ["1.12"]
+ include:
+ # We only release one docker image per PyTorch version.
+ # The matrix here is the same as the one in release-docker.yml.
+ - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"}
+ - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
+ - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
+ - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"}
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- uses: docker/setup-buildx-action@v2
- - uses: docker/build-push-action@v2
+ - uses: docker/build-push-action@v3
with:
build-args: |
PYTHON_VERSION=${{ matrix.python_version }}
PYTORCH_VERSION=${{ matrix.pytorch_version }}
+ CUDA_VERSION=${{ matrix.cuda_version }}
file: dockers/release/Dockerfile
push: false # pushed in release-docker.yml only when PL is released
timeout-minutes: 50
@@ -53,14 +58,14 @@ jobs:
python_version: ["3.7"]
xla_version: ["1.12"]
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- uses: docker/setup-buildx-action@v2
- - uses: docker/login-action@v1
+ - uses: docker/login-action@v2
if: env.PUSH_TO_HUB == 'true'
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
- - uses: docker/build-push-action@v2
+ - uses: docker/build-push-action@v3
with:
build-args: |
PYTHON_VERSION=${{ matrix.python_version }}
@@ -85,30 +90,31 @@ jobs:
fail-fast: false
matrix:
include:
- # the config used in '.azure-pipelines/gpu-tests.yml'
- - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1", ubuntu_version: "20.04"}
- # latest (used in Tutorials)
- - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1", ubuntu_version: "20.04"}
- - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.1.1", ubuntu_version: "20.04"}
- - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"}
+ # These are the base images for PL release docker images,
+ # so include at least all of the combinations in release-docker.yml.
+ - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"}
+ - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
+ - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
+ - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"}
+ # Used in Lightning-AI/tutorials
+ - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"}
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- uses: docker/setup-buildx-action@v2
- - uses: docker/login-action@v1
+ - uses: docker/login-action@v2
if: env.PUSH_TO_HUB == 'true'
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
- - uses: docker/build-push-action@v2
+ - uses: docker/build-push-action@v3
with:
build-args: |
PYTHON_VERSION=${{ matrix.python_version }}
PYTORCH_VERSION=${{ matrix.pytorch_version }}
CUDA_VERSION=${{ matrix.cuda_version }}
- UBUNTU_VERSION=${{ matrix.ubuntu_version }}
file: dockers/base-cuda/Dockerfile
push: ${{ env.PUSH_TO_HUB }}
- tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
+ tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }}
timeout-minutes: 95
- uses: ravsamhq/notify-slack-action@v1
if: failure() && env.PUSH_TO_HUB == 'true'
@@ -126,25 +132,23 @@ jobs:
fail-fast: false
matrix:
include:
- - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"}
- - {python_version: "3.8", pytorch_version: "1.10", cuda_version: "11.1.1"}
- - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
- # nightly: add when there's a release candidate
- # - {python_version: "3.9", pytorch_version: "1.12"}
+ - {python_version: "3.8", pytorch_version: "1.9"}
+ - {python_version: "3.8", pytorch_version: "1.10"}
+ - {python_version: "3.9", pytorch_version: "1.11"}
+ - {python_version: "3.9", pytorch_version: "1.12"}
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- uses: docker/setup-buildx-action@v2
- - uses: docker/login-action@v1
+ - uses: docker/login-action@v2
if: env.PUSH_TO_HUB == 'true'
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
- - uses: docker/build-push-action@v2
+ - uses: docker/build-push-action@v3
with:
build-args: |
PYTHON_VERSION=${{ matrix.python_version }}
PYTORCH_VERSION=${{ matrix.pytorch_version }}
- CUDA_VERSION=${{ matrix.cuda_version }}
file: dockers/base-conda/Dockerfile
push: ${{ env.PUSH_TO_HUB }}
tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
@@ -168,14 +172,14 @@ jobs:
# the config used in 'dockers/ci-runner-ipu/Dockerfile'
- {python_version: "3.9", pytorch_version: "1.9"}
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- uses: docker/setup-buildx-action@v2
- - uses: docker/login-action@v1
+ - uses: docker/login-action@v2
if: env.PUSH_TO_HUB == 'true'
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
- - uses: docker/build-push-action@v2
+ - uses: docker/build-push-action@v3
with:
build-args: |
PYTHON_VERSION=${{ matrix.python_version }}
@@ -184,7 +188,7 @@ jobs:
push: ${{ env.PUSH_TO_HUB }}
tags: pytorchlightning/pytorch_lightning:base-ipu-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}
timeout-minutes: 100
- - uses: docker/build-push-action@v2
+ - uses: docker/build-push-action@v3
with:
build-args: |
PYTHON_VERSION=${{ matrix.python_version }}
@@ -199,7 +203,7 @@ jobs:
status: ${{ job.status }}
token: ${{ secrets.GITHUB_TOKEN }}
notification_title: ${{ format('IPU; {0} py{1} for *{2}*', runner.os, matrix.python_version, matrix.pytorch_version) }}
- message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U01BULUS2BG>' # SeanNaren
+ message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U01GD29QCAV>' # kaushikb11
env:
SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
@@ -212,14 +216,14 @@ jobs:
# the config used in 'dockers/ci-runner-hpu/Dockerfile'
- {gaudi_version: "1.5.0", pytorch_version: "1.11.0"}
steps:
- - uses: actions/checkout@v2
+ - uses: actions/checkout@v3
- uses: docker/setup-buildx-action@v2
- - uses: docker/login-action@v1
+ - uses: docker/login-action@v2
if: env.PUSH_TO_HUB == 'true'
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
- - uses: docker/build-push-action@v2
+ - uses: docker/build-push-action@v3
with:
build-args: |
DIST=latest
@@ -243,10 +247,10 @@ jobs:
runs-on: ubuntu-20.04
steps:
- name: Checkout
- uses: actions/checkout@v2
+ uses: actions/checkout@v3
- name: Build Conda Docker
# publish master/release
- uses: docker/build-push-action@v2
+ uses: docker/build-push-action@v3
with:
file: dockers/nvidia/Dockerfile
push: false
diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml
index 9d87f1a582fb1..6901a24204683 100644
--- a/.github/workflows/release-docker.yml
+++ b/.github/workflows/release-docker.yml
@@ -1,6 +1,5 @@
name: Docker
-# https://www.docker.com/blog/first-docker-github-action-is-here
-# https://github.com/docker/build-push-action
+
on:
push:
branches: [master, "release/*"]
@@ -15,8 +14,12 @@ jobs:
strategy:
fail-fast: false
matrix:
- python_version: ["3.7", "3.8", "3.9"]
- pytorch_version: ["1.9", "1.10"]
+ include:
+ # We only release one docker image per PyTorch version.
+ - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"}
+ - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"}
+ - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"}
+ - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"}
steps:
- name: Checkout
uses: actions/checkout@v2
@@ -32,19 +35,29 @@ jobs:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
dockerfile: dockers/release/Dockerfile
- build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }}
- tags: "${{ steps.get_version.outputs.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }},latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}"
+ build_args: |
+ PYTHON_VERSION=${{ matrix.python_version }}
+ PYTORCH_VERSION=${{ matrix.pytorch_version }}
+ CUDA_VERSION=${{ matrix.cuda_version }}
+ LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }}
+ tags: |
+ ${{ steps.get_version.outputs.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }}
+ latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }}
timeout-minutes: 55
- name: Publish Latest to Docker
uses: docker/build-push-action@v1.1.0
- # only on releases and latest Python and PyTorch
- if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.10'
+ # Only latest Python and PyTorch
+ if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.12'
with:
repository: pytorchlightning/pytorch_lightning
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
dockerfile: dockers/release/Dockerfile
- build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }}
+ build_args: |
+ PYTHON_VERSION=${{ matrix.python_version }}
+ PYTORCH_VERSION=${{ matrix.pytorch_version }}
+ CUDA_VERSION=${{ matrix.cuda_version }}
+ LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }}
tags: "latest"
timeout-minutes: 55
diff --git a/.gitignore b/.gitignore
index 719f291a492ca..259d9f271189c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -165,3 +165,9 @@ hars*
artifacts/*
*docs/examples*
*docs/source-app/api*
+
+# tutorials
+our_model.tar
+test.png
+saved_models
+data/
diff --git a/dockers/README.md b/dockers/README.md
index 533c85739f528..b1ff9826b6c1f 100644
--- a/dockers/README.md
+++ b/dockers/README.md
@@ -1,36 +1,17 @@
# Docker images
-## Builds images form attached Dockerfiles
+## Build images from Dockerfiles
You can build the images yourself; note that it takes a lot of time, so be prepared.
```bash
-git clone
-docker image build -t pytorch-lightning:latest -f dockers/conda/Dockerfile .
-```
-
-or with specific arguments
-
-```bash
-git clone
-docker image build \
- -t pytorch-lightning:base-cuda-py3.9-pt1.10 \
- -f dockers/base-cuda/Dockerfile \
- --build-arg PYTHON_VERSION=3.9 \
- --build-arg PYTORCH_VERSION=1.10 \
- .
-```
+git clone https://github.com/Lightning-AI/lightning.git
-or nightly version from Conda
+# build with the default arguments
+docker image build -t pytorch-lightning:latest -f dockers/base-cuda/Dockerfile .
-```bash
-git clone
-docker image build \
- -t pytorch-lightning:base-conda-py3.9-pt1.11 \
- -f dockers/base-conda/Dockerfile \
- --build-arg PYTHON_VERSION=3.9 \
- --build-arg PYTORCH_VERSION=1.11 \
- .
+# build with specific arguments
+docker image build -t pytorch-lightning:base-cuda-py3.9-torch1.11-cuda11.3.1 -f dockers/base-cuda/Dockerfile --build-arg PYTHON_VERSION=3.9 --build-arg PYTORCH_VERSION=1.11 --build-arg CUDA_VERSION=11.3.1 .
```
To run your docker image, use
@@ -49,7 +30,7 @@ docker image rm pytorch-lightning:latest
## Run docker image with GPUs
-To run docker image with access to you GPUs you need to install
+To run the docker image with access to your GPUs, you need to install the NVIDIA Container Toolkit:
```bash
# Add the package repositories
@@ -61,10 +42,10 @@ sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit
sudo systemctl restart docker
```
-and later run the docker image with `--gpus all` so for example
+and later run the docker image with `--gpus all`. For example,
```
-docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.10
+docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11-cuda11.3.1
```
## Run Jupyter server
@@ -73,15 +54,11 @@ Inspiration comes from https://u.group/thinking/how-to-put-jupyter-notebooks-in-
1. Build the docker image:
```bash
- docker image build \
- -t pytorch-lightning:v1.3.1 \
- -f dockers/nvidia/Dockerfile \
- --build-arg LIGHTNING_VERSION=1.3.1 \
- .
+ docker image build -t pytorch-lightning:v1.6.5 -f dockers/nvidia/Dockerfile --build-arg LIGHTNING_VERSION=1.6.5 .
```
1. start the server and map ports:
```bash
- docker run --rm -it --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all -p 8888:8888 pytorch-lightning:v1.3.1
+ docker run --rm -it --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all -p 8888:8888 pytorch-lightning:v1.6.5
```
1. Connect in local browser:
- copy the generated path e.g. `http://hostname:8888/?token=0719fa7e1729778b0cec363541a608d5003e26d4910983c6`
diff --git a/dockers/release/Dockerfile b/dockers/release/Dockerfile
index cb393c91dfbe0..c39e66509188c 100644
--- a/dockers/release/Dockerfile
+++ b/dockers/release/Dockerfile
@@ -14,8 +14,9 @@
ARG PYTHON_VERSION=3.9
ARG PYTORCH_VERSION=1.11
+ARG CUDA_VERSION=11.3.1
-FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION}
+FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION}-cuda${CUDA_VERSION}
LABEL maintainer="Lightning-AI "
From 2f7daac4b80bc13135f7e14dffcdd0bd3d50a654 Mon Sep 17 00:00:00 2001
From: thomas chaton
Date: Wed, 10 Aug 2022 13:17:29 +0200
Subject: [PATCH 31/59] Use websockets in e2es (#14138)
---
src/lightning_app/cli/lightning_cli.py | 6 +-
src/lightning_app/testing/testing.py | 72 ++++++++++++-------
src/lightning_app/utilities/app_logs.py | 41 +++++++----
tests/tests_app/utilities/test_app_logs.py | 11 +++
tests/tests_app_examples/test_commands.py | 2 +-
.../test_custom_work_dependencies.py | 2 +-
tests/tests_app_examples/test_drive.py | 4 +-
tests/tests_app_examples/test_idle_timeout.py | 2 +-
tests/tests_app_examples/test_payload.py | 2 +-
tests/tests_app_examples/test_v0_app.py | 2 +-
10 files changed, 97 insertions(+), 47 deletions(-)
create mode 100644 tests/tests_app/utilities/test_app_logs.py
diff --git a/src/lightning_app/cli/lightning_cli.py b/src/lightning_app/cli/lightning_cli.py
index 45c80d4dcc357..babe0aa2b2abc 100644
--- a/src/lightning_app/cli/lightning_cli.py
+++ b/src/lightning_app/cli/lightning_cli.py
@@ -136,10 +136,10 @@ def logs(app_name: str, components: List[str], follow: bool) -> None:
rich_colors = list(ANSI_COLOR_NAMES)
colors = {c: rich_colors[i + 1] for i, c in enumerate(components)}
- for component_name, log_event in log_reader:
+ for log_event in log_reader:
date = log_event.timestamp.strftime("%m/%d/%Y %H:%M:%S")
- color = colors[component_name]
- rich.print(f"[{color}]{component_name}[/{color}] {date} {log_event.message}")
+ color = colors[log_event.component_name]
+ rich.print(f"[{color}]{log_event.component_name}[/{color}] {date} {log_event.message}")
@_main.command()
diff --git a/src/lightning_app/testing/testing.py b/src/lightning_app/testing/testing.py
index 74d57db38c427..884c02a0521c1 100644
--- a/src/lightning_app/testing/testing.py
+++ b/src/lightning_app/testing/testing.py
@@ -1,26 +1,30 @@
import asyncio
import json
+import logging
import os
import shutil
import subprocess
import sys
import tempfile
import time
+import traceback
from contextlib import contextmanager
from subprocess import Popen
from time import sleep
-from typing import Any, Callable, Dict, Generator, List, Type
+from typing import Any, Callable, Dict, Generator, List, Optional, Type
import requests
from lightning_cloud.openapi.rest import ApiException
from requests import Session
from rich import print
+from rich.color import ANSI_COLOR_NAMES
from lightning_app import LightningApp, LightningFlow
from lightning_app.cli.lightning_cli import run_app
from lightning_app.core.constants import LIGHTNING_CLOUD_PROJECT_ID
from lightning_app.runners.multiprocess import MultiProcessRuntime
from lightning_app.testing.config import Config
+from lightning_app.utilities.app_logs import _app_logs_reader
from lightning_app.utilities.cloud import _get_project
from lightning_app.utilities.enum import CacheCallsKeys
from lightning_app.utilities.imports import _is_playwright_available, requires
@@ -32,6 +36,9 @@
from playwright.sync_api import HttpCredentials, sync_playwright
+_logger = logging.getLogger(__name__)
+
+
class LightningTestApp(LightningApp):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@@ -282,20 +289,6 @@ def run_app_in_cloud(app_folder: str, app_name: str = "app.py", extra_args: [str
var scrollingElement = (document.scrollingElement || document.body);
scrollingElement.scrollTop = scrollingElement.scrollHeight;
}, 200);
-
- if (!window._logs) {
- window._logs = [];
- }
-
- if (window.logTerminals) {
- Object.entries(window.logTerminals).forEach(
- ([key, value]) => {
- window.logTerminals[key]._onLightningWritelnHandler = function (data) {
- window._logs = window._logs.concat([data]);
- }
- }
- );
- }
"""
)
@@ -309,8 +302,46 @@ def run_app_in_cloud(app_folder: str, app_name: str = "app.py", extra_args: [str
except (playwright._impl._api_types.Error, playwright._impl._api_types.TimeoutError):
pass
- def fetch_logs() -> str:
- return admin_page.evaluate("window._logs;")
+ client = LightningClient()
+ project = _get_project(client)
+ identifiers = []
+ rich_colors = list(ANSI_COLOR_NAMES)
+
+ def fetch_logs(component_names: Optional[List[str]] = None) -> Generator:
+ """This methods creates websockets connection in threads and returns the logs to the main thread."""
+ app_id = admin_page.url.split("/")[-1]
+
+ if not component_names:
+ works = client.lightningwork_service_list_lightningwork(
+ project_id=project.project_id,
+ app_id=app_id,
+ ).lightningworks
+ component_names = ["flow"] + [w.name for w in works]
+
+ def on_error_callback(ws_app, *_):
+ traceback.print_exc()
+ ws_app.close()
+
+ colors = {c: rich_colors[i + 1] for i, c in enumerate(component_names)}
+ gen = _app_logs_reader(
+ client=client,
+ project_id=project.project_id,
+ app_id=app_id,
+ component_names=component_names,
+ follow=False,
+ on_error_callback=on_error_callback,
+ )
+ max_length = max(len(c.replace("root.", "")) for c in component_names)
+ for log_event in gen:
+ message = log_event.message
+ identifier = f"{log_event.timestamp}{log_event.message}"
+ if identifier not in identifiers:
+ date = log_event.timestamp.strftime("%m/%d/%Y %H:%M:%S")
+ identifiers.append(identifier)
+ color = colors[log_event.component_name]
+ padding = (max_length - len(log_event.component_name)) * " "
+ print(f"[{color}]{log_event.component_name}{padding}[/{color}] {date} {message}")
+ yield message
# 5. Print your application ID
print(
@@ -323,11 +354,6 @@ def fetch_logs() -> str:
pass
finally:
print("##################################################")
- printed_logs = []
- for log in fetch_logs():
- if log not in printed_logs:
- printed_logs.append(log)
- print(log.split("[0m")[-1])
button = admin_page.locator('[data-cy="stop"]')
try:
button.wait_for(timeout=3 * 1000)
@@ -337,8 +363,6 @@ def fetch_logs() -> str:
context.close()
browser.close()
- client = LightningClient()
- project = _get_project(client)
list_lightningapps = client.lightningapp_instance_service_list_lightningapp_instances(project.project_id)
for lightningapp in list_lightningapps.lightningapps:
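The new `fetch_logs` deduplicates the streamed events by a `timestamp + message` identifier so that repeated polling passes do not reprint old lines. A stand-alone sketch of that dedup loop (the events below are fabricated, and a set replaces the patch's `identifiers` list for O(1) lookups):

```python
from datetime import datetime

# Fabricated (timestamp, component_name, message) events; the duplicate
# simulates the same log line arriving on a second polling pass.
events = [
    (datetime(2022, 8, 10, 12, 0, 0), "flow", "starting"),
    (datetime(2022, 8, 10, 12, 0, 1), "work-0", "running"),
    (datetime(2022, 8, 10, 12, 0, 0), "flow", "starting"),  # duplicate
]

seen: set = set()
for timestamp, component, message in events:
    identifier = f"{timestamp}{message}"
    if identifier in seen:
        continue
    seen.add(identifier)
    print(f"{component:>7} {timestamp:%m/%d/%Y %H:%M:%S} {message}")
```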
diff --git a/src/lightning_app/utilities/app_logs.py b/src/lightning_app/utilities/app_logs.py
index 4a7af9b5c5143..536fbaae05093 100644
--- a/src/lightning_app/utilities/app_logs.py
+++ b/src/lightning_app/utilities/app_logs.py
@@ -5,7 +5,7 @@
from datetime import datetime, timedelta
from json import JSONDecodeError
from threading import Thread
-from typing import Iterator, List, Optional, Tuple
+from typing import Callable, Iterator, List, Optional
import dateutil.parser
from websocket import WebSocketApp
@@ -30,10 +30,17 @@ class _LogEventLabels:
class _LogEvent:
message: str
timestamp: datetime
+ component_name: str
labels: _LogEventLabels
+ def __ge__(self, other: "_LogEvent") -> bool:
+ return self.timestamp >= other.timestamp
-def _push_logevents_to_read_queue_callback(component_name: str, read_queue: queue.PriorityQueue):
+ def __gt__(self, other: "_LogEvent") -> bool:
+ return self.timestamp > other.timestamp
+
+
+def _push_log_events_to_read_queue_callback(component_name: str, read_queue: queue.PriorityQueue):
"""Pushes _LogEvents from websocket to read_queue.
Returns callback function used with `on_message_callback` of websocket.WebSocketApp.
@@ -43,13 +50,17 @@ def callback(ws_app: WebSocketApp, msg: str):
# We strongly trust that the contract on API will hold atm :D
event_dict = json.loads(msg)
labels = _LogEventLabels(**event_dict["labels"])
+
if "message" in event_dict:
+ message = event_dict["message"]
+ timestamp = dateutil.parser.isoparse(event_dict["timestamp"])
event = _LogEvent(
- message=event_dict["message"],
- timestamp=dateutil.parser.isoparse(event_dict["timestamp"]),
+ message=message,
+ timestamp=timestamp,
+ component_name=component_name,
labels=labels,
)
- read_queue.put((event.timestamp, component_name, event))
+ read_queue.put(event)
return callback
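With `__ge__` and `__gt__` defined on the timestamp, `_LogEvent` instances can sit directly in a `queue.PriorityQueue` instead of being wrapped in `(timestamp, component_name, event)` tuples. A minimal self-contained sketch of the same ordering trick (the `Event` class and sample data are illustrative, not the library's types):

```python
import queue
from dataclasses import dataclass
from datetime import datetime, timedelta


@dataclass
class Event:
    message: str
    timestamp: datetime
    component_name: str

    # PriorityQueue orders items with `<`, which Python resolves through
    # the reflected `__gt__` when `__lt__` is not defined.
    def __gt__(self, other: "Event") -> bool:
        return self.timestamp > other.timestamp

    def __ge__(self, other: "Event") -> bool:
        return self.timestamp >= other.timestamp


q: "queue.PriorityQueue[Event]" = queue.PriorityQueue()
now = datetime.now()
q.put(Event("third", now + timedelta(seconds=2), "work-1"))
q.put(Event("first", now, "flow"))
q.put(Event("second", now + timedelta(seconds=1), "work-0"))

while not q.empty():
    print(q.get().message)  # prints: first, second, third
```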
@@ -66,8 +77,13 @@ def _error_callback(ws_app: WebSocketApp, error: Exception):
def _app_logs_reader(
- client: LightningClient, project_id: str, app_id: str, component_names: List[str], follow: bool
-) -> Iterator[Tuple[str, _LogEvent]]:
+ client: LightningClient,
+ project_id: str,
+ app_id: str,
+ component_names: List[str],
+ follow: bool,
+ on_error_callback: Optional[Callable] = None,
+) -> Iterator[_LogEvent]:
read_queue = queue.PriorityQueue()
logs_api_client = _LightningLogsSocketAPI(client.api_client)
@@ -78,8 +94,8 @@ def _app_logs_reader(
project_id=project_id,
app_id=app_id,
component=component_name,
- on_message_callback=_push_logevents_to_read_queue_callback(component_name, read_queue),
- on_error_callback=_error_callback,
+ on_message_callback=_push_log_events_to_read_queue_callback(component_name, read_queue),
+ on_error_callback=on_error_callback or _error_callback,
)
for component_name in component_names
]
@@ -92,20 +108,19 @@ def _app_logs_reader(
for th in log_threads:
th.start()
user_log_start = "<<< BEGIN USER_RUN_FLOW SECTION >>>"
start_timestamp = None
# Print logs from queue when log event is available
try:
while True:
- _, component_name, log_event = read_queue.get(timeout=None if follow else 1.0)
- log_event: _LogEvent
-
+ log_event = read_queue.get(timeout=None if follow else 1.0)
if user_log_start in log_event.message:
start_timestamp = log_event.timestamp + timedelta(seconds=0.5)
if start_timestamp and log_event.timestamp > start_timestamp:
- yield component_name, log_event
+ yield log_event
except queue.Empty:
# Empty is raised by queue.get if timeout is reached. Follow = False case.
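The `follow=False` branch relies on `queue.get(timeout=1.0)` raising `queue.Empty` once the websocket threads stop producing. A small stand-alone sketch of that drain pattern, with plain threads standing in for the websocket readers (the names and timings are made up):

```python
import queue
import threading
import time

read_queue: "queue.Queue[str]" = queue.Queue()


def producer(name: str) -> None:
    # Stand-in for a websocket callback pushing log events.
    for i in range(3):
        time.sleep(0.1)
        read_queue.put(f"{name}: line {i}")


threads = [threading.Thread(target=producer, args=(c,), daemon=True) for c in ("flow", "work-0")]
for th in threads:
    th.start()

try:
    while True:
        # A 1-second timeout mirrors the follow=False case: once no
        # producer has emitted anything for that long, stop reading.
        print(read_queue.get(timeout=1.0))
except queue.Empty:
    pass
```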
diff --git a/tests/tests_app/utilities/test_app_logs.py b/tests/tests_app/utilities/test_app_logs.py
new file mode 100644
index 0000000000000..e7384dd72d6e2
--- /dev/null
+++ b/tests/tests_app/utilities/test_app_logs.py
@@ -0,0 +1,11 @@
+from datetime import datetime
+from unittest.mock import MagicMock
+
+from lightning_app.utilities.app_logs import _LogEvent
+
+
+def test_log_event():
+ event_1 = _LogEvent("", datetime.now(), MagicMock(), MagicMock())
+ event_2 = _LogEvent("", datetime.now(), MagicMock(), MagicMock())
+ assert event_1 < event_2
+ assert event_1 <= event_2
diff --git a/tests/tests_app_examples/test_commands.py b/tests/tests_app_examples/test_commands.py
index 266f0305c7604..236e587e23101 100644
--- a/tests/tests_app_examples/test_commands.py
+++ b/tests/tests_app_examples/test_commands.py
@@ -26,7 +26,7 @@ def test_commands_example_cloud() -> None:
has_logs = False
while not has_logs:
- for log in fetch_logs():
+ for log in fetch_logs(["flow"]):
if "['something', 'else']" in log:
has_logs = True
sleep(1)
diff --git a/tests/tests_app_examples/test_custom_work_dependencies.py b/tests/tests_app_examples/test_custom_work_dependencies.py
index d7c9db5ef610a..b8971e0ef2148 100644
--- a/tests/tests_app_examples/test_custom_work_dependencies.py
+++ b/tests/tests_app_examples/test_custom_work_dependencies.py
@@ -16,7 +16,7 @@ def test_custom_work_dependencies_example_cloud() -> None:
) as (_, _, fetch_logs, _):
has_logs = False
while not has_logs:
- for log in fetch_logs():
+ for log in fetch_logs(["flow"]):
if "Custom Work Dependency checker End" in log:
has_logs = True
sleep(1)
diff --git a/tests/tests_app_examples/test_drive.py b/tests/tests_app_examples/test_drive.py
index 14efc3458716e..630e76b550e9e 100644
--- a/tests/tests_app_examples/test_drive.py
+++ b/tests/tests_app_examples/test_drive.py
@@ -11,14 +11,14 @@
def test_drive_example_cloud() -> None:
with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "examples/app_drive")) as (
_,
- view_page,
+ _,
fetch_logs,
_,
):
has_logs = False
while not has_logs:
- for log in fetch_logs():
+ for log in fetch_logs(["flow"]):
if "Application End!" in log:
has_logs = True
sleep(1)
diff --git a/tests/tests_app_examples/test_idle_timeout.py b/tests/tests_app_examples/test_idle_timeout.py
index a39ae3f693f7a..f06181ce86ed3 100644
--- a/tests/tests_app_examples/test_idle_timeout.py
+++ b/tests/tests_app_examples/test_idle_timeout.py
@@ -17,7 +17,7 @@ def test_idle_timeout_example_cloud() -> None:
):
has_logs = False
while not has_logs:
- for log in fetch_logs():
+ for log in fetch_logs(["flow"]):
if "Application End" in log:
has_logs = True
sleep(1)
diff --git a/tests/tests_app_examples/test_payload.py b/tests/tests_app_examples/test_payload.py
index 58fc28a4a8d3c..b40b8ca52defd 100644
--- a/tests/tests_app_examples/test_payload.py
+++ b/tests/tests_app_examples/test_payload.py
@@ -13,7 +13,7 @@ def test_payload_example_cloud() -> None:
has_logs = False
while not has_logs:
- for log in fetch_logs():
+ for log in fetch_logs(["flow"]):
if "Application End!" in log:
has_logs = True
sleep(1)
diff --git a/tests/tests_app_examples/test_v0_app.py b/tests/tests_app_examples/test_v0_app.py
index acc9e285c4d79..026c45a4e1ba1 100644
--- a/tests/tests_app_examples/test_v0_app.py
+++ b/tests/tests_app_examples/test_v0_app.py
@@ -45,7 +45,7 @@ def check_content(button_name, text_content):
wait_for(view_page, check_content, "TAB_2", "Hello from component B")
has_logs = False
while not has_logs:
- for log in fetch_logs():
+ for log in fetch_logs(["flow"]):
if "'a': 'a', 'b': 'b'" in log:
has_logs = True
sleep(1)
From b8b8f033fd55db6c03e28ced1ddc2b49f6c8b770 Mon Sep 17 00:00:00 2001
From: thomas chaton
Date: Wed, 10 Aug 2022 14:56:41 +0200
Subject: [PATCH 32/59] (app) Run the flow only if the state has updated 1/2
(#14076)
---
src/lightning_app/CHANGELOG.md | 2 +
src/lightning_app/core/app.py | 31 +++++++++++-----
src/lightning_app/utilities/app_helpers.py | 7 ++--
src/lightning_app/utilities/commands/base.py | 1 +
src/lightning_app/utilities/scheduler.py | 2 +-
tests/tests_app/core/test_lightning_app.py | 39 +++++++++++++++++++-
tests/tests_app/core/test_lightning_flow.py | 21 +++++------
tests/tests_app/utilities/test_commands.py | 4 +-
8 files changed, 78 insertions(+), 29 deletions(-)
diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md
index ba8cdd796c5bb..f32d07697f376 100644
--- a/src/lightning_app/CHANGELOG.md
+++ b/src/lightning_app/CHANGELOG.md
@@ -24,6 +24,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Added `LIGHTNING_` prefix to Platform AWS credentials ([#13703](https://github.com/Lightning-AI/lightning/pull/13703))
+- Run the flow only if the state has changed from the previous execution ([#14076](https://github.com/Lightning-AI/lightning/pull/14076))
+
### Deprecated
### Fixed
diff --git a/src/lightning_app/core/app.py b/src/lightning_app/core/app.py
index 584f94285c219..3f9e2521eb21d 100644
--- a/src/lightning_app/core/app.py
+++ b/src/lightning_app/core/app.py
@@ -15,7 +15,7 @@
from lightning_app.core.queues import BaseQueue, SingleProcessQueue
from lightning_app.frontend import Frontend
from lightning_app.storage.path import storage_root_dir
-from lightning_app.utilities.app_helpers import _delta_to_appstate_delta, _LightningAppRef
+from lightning_app.utilities.app_helpers import _delta_to_app_state_delta, _LightningAppRef
from lightning_app.utilities.commands.base import _populate_commands_endpoint, _process_command_requests
from lightning_app.utilities.component import _convert_paths_after_init
from lightning_app.utilities.enum import AppStage, CacheCallsKeys
@@ -94,7 +94,7 @@ def __init__(
self.processes: t.Dict[str, WorkManager] = {}
self.frontends: t.Dict[str, Frontend] = {}
self.stage = AppStage.RUNNING
- self._has_updated: bool = False
+ self._has_updated: bool = True
self._schedules: t.Dict[str, t.Dict] = {}
self.threads: t.List[threading.Thread] = []
@@ -278,7 +278,7 @@ def _collect_deltas_from_ui_and_work_queues(self) -> t.List[Delta]:
if component_output:
logger.debug(f"Received from {component_output.id} : {component_output.delta.to_dict()}")
work = self.get_component_by_name(component_output.id)
- new_work_delta = _delta_to_appstate_delta(self.root, work, deepcopy(component_output.delta))
+ new_work_delta = _delta_to_app_state_delta(self.root, work, deepcopy(component_output.delta))
deltas.append(new_work_delta)
else:
should_get_component_output = False
@@ -307,9 +307,11 @@ def maybe_apply_changes(self) -> bool:
if not deltas:
# When no deltas are received from the Rest API or work queues,
# we need to check if the flow modified the state and populate changes.
- if Delta(DeepDiff(self.last_state, self.state, verbose_level=2)).to_dict():
+ deep_diff = DeepDiff(self.last_state, self.state, verbose_level=2)
+ if deep_diff:
+ # TODO: Resolve changes with ``CacheMissException``.
# new_state = self.populate_changes(self.last_state, self.state)
- self.set_state(self.state)
+ self.set_last_state(self.state)
self._has_updated = True
return False
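The key change is that `maybe_apply_changes` now tests the truthiness of the raw `DeepDiff` result rather than round-tripping it through a `Delta`. A minimal sketch of the same change-detection idea, assuming `deepdiff` is installed (the state dicts are invented):

```python
from copy import deepcopy

from deepdiff import DeepDiff

last_state = {"vars": {"counter": 0}, "works": {}}
state = deepcopy(last_state)
state["vars"]["counter"] = 1

# An empty DeepDiff is falsy, so the flow is re-run only when
# something actually changed between the two snapshots.
deep_diff = DeepDiff(last_state, state, verbose_level=2)
if deep_diff:
    print("state changed:", deep_diff.to_dict())
```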
@@ -329,7 +331,6 @@ def maybe_apply_changes(self) -> bool:
def run_once(self):
"""Method used to collect changes and run the root Flow once."""
done = False
- self._has_updated = False
self._last_run_time = 0.0
if self.backend is not None:
@@ -352,17 +353,23 @@ def run_once(self):
_process_command_requests(self)
+ t0 = time()
+
try:
self.check_error_queue()
- t0 = time()
- self.root.run()
- self._last_run_time = time() - t0
+ # Execute the flow only if:
+ # - the state has changed since the previous execution, or
+ # - this is the first execution of the flow
+ if self._has_updated:
+ self.root.run()
except CacheMissException:
self._on_cache_miss_exception()
except (ExitAppException, KeyboardInterrupt):
done = True
self.stage = AppStage.STOPPING
+ self._last_run_time = time() - t0
+
self.on_run_once_end()
return done
@@ -414,6 +421,8 @@ def _run(self) -> bool:
if self._has_updated and self.should_publish_changes_to_api and self.api_publish_state_queue:
self.api_publish_state_queue.put(self.state_vars)
+ self._has_updated = False
+
return True
def _update_layout(self) -> None:
@@ -430,8 +439,10 @@ def _apply_restarting(self) -> bool:
self.stage = AppStage.BLOCKING
return False
- def _has_work_finished(self, work):
+ def _has_work_finished(self, work) -> bool:
latest_call_hash = work._calls[CacheCallsKeys.LATEST_CALL_HASH]
+ if latest_call_hash is None:
+ return False
return "ret" in work._calls[latest_call_hash]
def _collect_work_finish_status(self) -> dict:
diff --git a/src/lightning_app/utilities/app_helpers.py b/src/lightning_app/utilities/app_helpers.py
index 4144c6de3ba12..faa612bba1998 100644
--- a/src/lightning_app/utilities/app_helpers.py
+++ b/src/lightning_app/utilities/app_helpers.py
@@ -299,7 +299,7 @@ def _set_child_name(component: "Component", child: "Component", new_name: str) -
return child_name
-def _delta_to_appstate_delta(root: "LightningFlow", component: "Component", delta: Delta) -> Delta:
+def _delta_to_app_state_delta(root: "LightningFlow", component: "Component", delta: Delta) -> Delta:
delta_dict = delta.to_dict()
for changed in delta_dict.values():
for delta_key in changed.copy().keys():
@@ -322,8 +322,9 @@ def _delta_to_appstate_delta(root: "LightningFlow", component: "Component", delt
delta_key_without_root = delta_key[4:] # the first 4 chars are the word 'root', strip it
new_key = new_prefix + delta_key_without_root
- changed[new_key] = val
- del changed[delta_key]
+ if new_key != delta_key:
+ changed[new_key] = val
+ del changed[delta_key]
return Delta(delta_dict)
diff --git a/src/lightning_app/utilities/commands/base.py b/src/lightning_app/utilities/commands/base.py
index 11661e51ca26a..b87b41b05df42 100644
--- a/src/lightning_app/utilities/commands/base.py
+++ b/src/lightning_app/utilities/commands/base.py
@@ -243,3 +243,4 @@ def _process_command_requests(app):
# Validation is done on the CLI side.
response = method(**command_query["command_arguments"])
app.commands_responses_queue.put({"response": response, "id": command_query["id"]})
+ app._has_updated = True
diff --git a/src/lightning_app/utilities/scheduler.py b/src/lightning_app/utilities/scheduler.py
index 012930f017f20..e45b0879246b9 100644
--- a/src/lightning_app/utilities/scheduler.py
+++ b/src/lightning_app/utilities/scheduler.py
@@ -15,7 +15,7 @@ class SchedulerThread(threading.Thread):
def __init__(self, app) -> None:
super().__init__(daemon=True)
self._exit_event = threading.Event()
- self._sleep_time = 0.5
+ self._sleep_time = 1.0
self._app = app
def run(self) -> None:
diff --git a/tests/tests_app/core/test_lightning_app.py b/tests/tests_app/core/test_lightning_app.py
index e6c715f87ef03..3776481965be3 100644
--- a/tests/tests_app/core/test_lightning_app.py
+++ b/tests/tests_app/core/test_lightning_app.py
@@ -1,3 +1,4 @@
+import logging
import os
import pickle
from time import sleep
@@ -27,6 +28,8 @@
from lightning_app.utilities.redis import check_if_redis_running
from lightning_app.utilities.warnings import LightningFlowWarning
+logger = logging.getLogger()
+
class B1(LightningFlow):
def __init__(self):
@@ -439,19 +442,25 @@ def __init__(self):
self.counter = 0
def run(self):
- self.counter = 1
+ if self.counter < 2:
+ self.counter += 1
def test_maybe_apply_changes_from_flow():
"""This test validates the app `_updated` is set to True only if the state was changed in the flow."""
app = LightningApp(SimpleFlow())
- assert not app._has_updated
+ assert app._has_updated
app.maybe_apply_changes()
app.root.run()
app.maybe_apply_changes()
assert app._has_updated
app._has_updated = False
+ app.root.run()
+ app.maybe_apply_changes()
+ assert app._has_updated
+ app._has_updated = False
+ app.root.run()
app.maybe_apply_changes()
assert not app._has_updated
@@ -920,3 +929,29 @@ def test_state_size_constant_growth():
MultiProcessRuntime(app, start_server=False).dispatch()
assert app.root._state_sizes[0] <= 5904
assert app.root._state_sizes[20] <= 23736
+
+
+class FlowUpdated(LightningFlow):
+ def run(self):
+ logger.info("Hello World")
+
+
+class NonUpdatedLightningTestApp(LightningTestApp):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.counter = 0
+
+ def on_after_run_once(self):
+ self.counter += 1
+ if not self._has_updated and self.counter > 2:
+ return True
+ return super().on_after_run_once()
+
+
+def test_non_updated_flow(caplog):
+ """This tests validate the app can run 3 times and call the flow only once."""
+ with caplog.at_level(logging.INFO):
+ app = NonUpdatedLightningTestApp(FlowUpdated())
+ MultiProcessRuntime(app, start_server=False).dispatch()
+ assert caplog.messages == ["Hello World"]
+ assert app.counter == 3
diff --git a/tests/tests_app/core/test_lightning_flow.py b/tests/tests_app/core/test_lightning_flow.py
index e8ce1222a3186..4c0eb23ea014c 100644
--- a/tests/tests_app/core/test_lightning_flow.py
+++ b/tests/tests_app/core/test_lightning_flow.py
@@ -16,7 +16,7 @@
from lightning_app.storage import Path
from lightning_app.storage.path import storage_root_dir
from lightning_app.testing.helpers import EmptyFlow, EmptyWork
-from lightning_app.utilities.app_helpers import _delta_to_appstate_delta, _LightningAppRef
+from lightning_app.utilities.app_helpers import _delta_to_app_state_delta, _LightningAppRef
from lightning_app.utilities.enum import CacheCallsKeys
from lightning_app.utilities.exceptions import ExitAppException
@@ -416,7 +416,7 @@ def run(self):
flow_a.work.counter = 1
work_state_2 = flow_a.work.state
delta = Delta(DeepDiff(work_state, work_state_2, verbose_level=2))
- delta = _delta_to_appstate_delta(flow_a, flow_a.work, delta)
+ delta = _delta_to_app_state_delta(flow_a, flow_a.work, delta)
new_flow_state = LightningApp.populate_changes(flow_state, flow_state + delta)
flow_a.set_state(new_flow_state)
assert flow_a.work.counter == 1
@@ -592,24 +592,23 @@ def run(self):
class FlowSchedule(LightningFlow):
def __init__(self):
super().__init__()
- self._last_time = None
+ self._last_times = []
+ self.target = 3
+ self.seconds = ",".join([str(v) for v in range(0, 60, self.target)])
def run(self):
- if self.schedule("* * * * * 0,5,10,15,20,25,30,35,40,45,50,55"):
- if self._last_time is None:
- self._last_time = False
- elif not self._last_time:
- self._last_time = time()
+ if self.schedule(f"* * * * * {self.seconds}"):
+ if len(self._last_times) < 3:
+ self._last_times.append(time())
else:
- # TODO (tchaton) Optimize flow execution.
- assert 4.0 < abs(time() - self._last_time) < 6.0
+ assert abs((time() - self._last_times[-1]) - self.target) < 3
self._exit()
def test_scheduling_api():
app = LightningApp(FlowSchedule())
- MultiProcessRuntime(app).dispatch()
+ MultiProcessRuntime(app, start_server=True).dispatch()
def test_lightning_flow():
diff --git a/tests/tests_app/utilities/test_commands.py b/tests/tests_app/utilities/test_commands.py
index 1e8e36ed09545..ed7f386395282 100644
--- a/tests/tests_app/utilities/test_commands.py
+++ b/tests/tests_app/utilities/test_commands.py
@@ -44,7 +44,7 @@ def __init__(self):
def run(self):
if self.has_sweep and len(self.names) == 1:
- sleep(2)
+ sleep(1)
self._exit()
def trigger_method(self, name: str):
@@ -156,7 +156,7 @@ def test_configure_commands(monkeypatch):
monkeypatch.setattr(sys, "argv", ["lightning", "sweep", "--sweep_name", "my_name", "--num_trials", "1"])
app_command()
time_left = 15
- while time_left > 0 or process.exitcode is None:
+ while time_left > 0 and process.exitcode != 0:
sleep(0.1)
time_left -= 0.1
assert process.exitcode == 0
From cda381a626719d965d85f9034993cae1f4227f29 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 10 Aug 2022 15:03:53 +0200
Subject: [PATCH 33/59] Update changelog after 1.7.1 release (#14127)
---
src/pytorch_lightning/CHANGELOG.md | 26 ++++++++------------------
1 file changed, 8 insertions(+), 18 deletions(-)
diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
index b405665b9df88..baf98d81a7733 100644
--- a/src/pytorch_lightning/CHANGELOG.md
+++ b/src/pytorch_lightning/CHANGELOG.md
@@ -52,39 +52,29 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
### Fixed
-- Casted only floating point tensors to fp16 with IPUs ([#13983](https://github.com/Lightning-AI/lightning/pull/13983))
+- Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061))
-- Casted tensors to fp16 before moving them to device with `DeepSpeedStrategy` ([#14000](https://github.com/Lightning-AI/lightning/pull/14000))
+- Fixed resuming from a checkpoint when using Stochastic Weight Averaging (SWA) ([#9938](https://github.com/Lightning-AI/lightning/pull/9938))
-- Fixed the `NeptuneLogger` dependency being unrecognized ([#13988](https://github.com/Lightning-AI/lightning/pull/13988))
+- Fixed the device placement when `LightningModule.cuda()` gets called without specifying a device index and the current cuda device was not 0 ([#14128](https://github.com/Lightning-AI/lightning/pull/14128))
-- Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061))
+## [1.7.1] - 2022-08-09
+### Fixed
+- Casted only floating point tensors to fp16 with IPUs ([#13983](https://github.com/Lightning-AI/lightning/pull/13983))
+- Casted tensors to fp16 before moving them to device with `DeepSpeedStrategy` ([#14000](https://github.com/Lightning-AI/lightning/pull/14000))
+- Fixed the `NeptuneLogger` dependency being unrecognized ([#13988](https://github.com/Lightning-AI/lightning/pull/13988))
- Fixed an issue where users would be warned about unset `max_epochs` even when `fast_dev_run` was set ([#13262](https://github.com/Lightning-AI/lightning/pull/13262))
-
-
- Fixed MPS device being unrecognized ([#13992](https://github.com/Lightning-AI/lightning/pull/13992))
-
-
- Fixed incorrect `precision="mixed"` being used with `DeepSpeedStrategy` and `IPUStrategy` ([#14041](https://github.com/Lightning-AI/lightning/pull/14041))
-
-
-- Fixed resuming from a checkpoint when using Stochastic Weight Averaging (SWA) ([#9938](https://github.com/Lightning-AI/lightning/pull/9938))
-
-
- Fixed dtype inference during gradient norm computation ([#14051](https://github.com/Lightning-AI/lightning/pull/14051))
-
-
- Fixed a bug that caused `ddp_find_unused_parameters` to be set `False`, whereas the intended default is `True` ([#14095](https://github.com/Lightning-AI/lightning/pull/14095))
-- Fixed the device placement when `LightningModule.cuda()` gets called without specifying a device index and the current cuda device was not 0 ([#14128](https://github.com/Lightning-AI/lightning/pull/14128))
-
-
## [1.7.0] - 2022-08-02
### Added
From 58014846ee0fb54b92e4bfb4c0965b72bc0a9641 Mon Sep 17 00:00:00 2001
From: Krishna Kalyan
Date: Wed, 10 Aug 2022 14:32:12 +0100
Subject: [PATCH 34/59] Update Grid links to Lightning AI (#14081)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* initial changes for lightning
* Update .github/BECOMING_A_CORE_CONTRIBUTOR.md
Co-authored-by: Adrian Wälchli
Co-authored-by: Adrian Wälchli
---
.github/BECOMING_A_CORE_CONTRIBUTOR.md | 2 +-
SECURITY.md | 2 +-
src/pytorch_lightning/README.md | 4 ++--
3 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/.github/BECOMING_A_CORE_CONTRIBUTOR.md b/.github/BECOMING_A_CORE_CONTRIBUTOR.md
index a179161f687a1..fd40e29e1ebf1 100644
--- a/.github/BECOMING_A_CORE_CONTRIBUTOR.md
+++ b/.github/BECOMING_A_CORE_CONTRIBUTOR.md
@@ -62,4 +62,4 @@ We are on the lookout for new people to join, however, if you feel like you meet
## Employment
-You can also become a [Grid.ai](https://www.grid.ai) employee or intern and work on Lightning. To get started, you can email `careers@grid.ai` with your resume or check out our [open job postings](https://boards.greenhouse.io/gridai).
+You can also become a [Lightning AI](https://lightning.ai/) employee or intern and work on Lightning. To get started, you can email `careers@lightning.ai` with your resume or check out our [open job postings](https://boards.greenhouse.io/lightningai).
diff --git a/SECURITY.md b/SECURITY.md
index 8f265f26be452..862563f84e2fe 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -1,2 +1,2 @@
-developer@grid.ai
+developer@lightning.ai
developer@pytorchlightning.ai
diff --git a/src/pytorch_lightning/README.md b/src/pytorch_lightning/README.md
index b57aea6fae147..914596c0a9d2f 100644
--- a/src/pytorch_lightning/README.md
+++ b/src/pytorch_lightning/README.md
@@ -14,8 +14,8 @@ ______________________________________________________________________
Docs •
Examples •
Community •
- Grid AI •
- License
+ Lightning AI •
+ License
From 4e87a44002a91c869f43c0929d29fa8600f14f15 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?=
Date: Wed, 10 Aug 2022 17:15:35 +0200
Subject: [PATCH 35/59] Avoid entry_points deprecation warning (#14052)
Co-authored-by: Adam J. Stewart
Co-authored-by: Akihiro Nitta
---
src/pytorch_lightning/CHANGELOG.md | 6 ++++++
.../trainer/connectors/callback_connector.py | 11 ++++++++---
src/pytorch_lightning/utilities/imports.py | 1 +
3 files changed, 15 insertions(+), 3 deletions(-)
diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
index baf98d81a7733..90285b55c8037 100644
--- a/src/pytorch_lightning/CHANGELOG.md
+++ b/src/pytorch_lightning/CHANGELOG.md
@@ -61,6 +61,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed the device placement when `LightningModule.cuda()` gets called without specifying a device index and the current cuda device was not 0 ([#14128](https://github.com/Lightning-AI/lightning/pull/14128))
+- Avoid `metadata.entry_points` deprecation warning on Python 3.10 ([#14052](https://github.com/Lightning-AI/lightning/pull/14052))
+
+
+- Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061))
+
+
## [1.7.1] - 2022-08-09
### Fixed
diff --git a/src/pytorch_lightning/trainer/connectors/callback_connector.py b/src/pytorch_lightning/trainer/connectors/callback_connector.py
index bb7f912420256..32d67d44ad44c 100644
--- a/src/pytorch_lightning/trainer/connectors/callback_connector.py
+++ b/src/pytorch_lightning/trainer/connectors/callback_connector.py
@@ -31,7 +31,7 @@
from pytorch_lightning.callbacks.rich_model_summary import RichModelSummary
from pytorch_lightning.callbacks.timer import Timer
from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from pytorch_lightning.utilities.imports import _PYTHON_GREATER_EQUAL_3_8_0
+from pytorch_lightning.utilities.imports import _PYTHON_GREATER_EQUAL_3_8_0, _PYTHON_GREATER_EQUAL_3_10_0
from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_info
_log = logging.getLogger(__name__)
@@ -260,14 +260,19 @@ def _configure_external_callbacks() -> List[Callback]:
Return:
A list of all callbacks collected from external factories.
"""
+ group = "pytorch_lightning.callbacks_factory"
+
if _PYTHON_GREATER_EQUAL_3_8_0:
from importlib.metadata import entry_points
- factories = entry_points().get("pytorch_lightning.callbacks_factory", ())
+ if _PYTHON_GREATER_EQUAL_3_10_0:
+ factories = entry_points(group=group) # type: ignore[call-arg]
+ else:
+ factories = entry_points().get(group, {}) # type: ignore[assignment]
else:
from pkg_resources import iter_entry_points
- factories = iter_entry_points("pytorch_lightning.callbacks_factory") # type: ignore[assignment]
+ factories = iter_entry_points(group) # type: ignore[assignment]
external_callbacks: List[Callback] = []
for factory in factories:
diff --git a/src/pytorch_lightning/utilities/imports.py b/src/pytorch_lightning/utilities/imports.py
index 67bf75be3c4d3..ba437ad332dfa 100644
--- a/src/pytorch_lightning/utilities/imports.py
+++ b/src/pytorch_lightning/utilities/imports.py
@@ -124,6 +124,7 @@ def __repr__(self) -> str:
_IS_WINDOWS = platform.system() == "Windows"
_IS_INTERACTIVE = hasattr(sys, "ps1") # https://stackoverflow.com/a/64523765
_PYTHON_GREATER_EQUAL_3_8_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 8)
+_PYTHON_GREATER_EQUAL_3_10_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 10)
_TORCH_GREATER_EQUAL_1_9_1 = _compare_version("torch", operator.ge, "1.9.1")
_TORCH_GREATER_EQUAL_1_10 = _compare_version("torch", operator.ge, "1.10.0")
_TORCH_LESSER_EQUAL_1_10_2 = _compare_version("torch", operator.le, "1.10.2")
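On Python 3.10, `importlib.metadata.entry_points()` deprecated dict-style access in favor of selecting by `group=`, which is exactly what the new `_PYTHON_GREATER_EQUAL_3_10_0` gate handles. A condensed sketch of the same dispatch (the helper name is hypothetical; the group string is the one from the patch):

```python
import sys

GROUP = "pytorch_lightning.callbacks_factory"


def _discover_factories(group: str = GROUP):
    if sys.version_info >= (3, 8):
        from importlib.metadata import entry_points

        if sys.version_info >= (3, 10):
            # Python 3.10+: select by group, no deprecation warning.
            return list(entry_points(group=group))
        # Python 3.8/3.9: entry_points() returns a dict keyed by group.
        return list(entry_points().get(group, ()))
    # Python < 3.8 falls back to pkg_resources.
    from pkg_resources import iter_entry_points

    return list(iter_entry_points(group))


for factory in _discover_factories():
    print(factory.name)
```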
From 9b61b1c482cb8be569e664647a577730e55680c2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?=
Date: Wed, 10 Aug 2022 17:21:05 +0200
Subject: [PATCH 36/59] Remove duplicated test classes (#14122)
Remove duplicated classes
---
.../progress/test_rich_progress_bar.py | 3 +-
.../callbacks/test_stochastic_weight_avg.py | 3 +-
tests/tests_pytorch/helpers/datasets.py | 39 +------------------
.../strategies/test_deepspeed_strategy.py | 3 +-
.../trainer/flags/test_val_check_interval.py | 3 +-
.../logging_/test_train_loop_logging.py | 3 +-
.../test_estimated_stepping_batches.py | 3 +-
.../tests_pytorch/trainer/test_dataloaders.py | 8 +++-
tests/tests_pytorch/trainer/test_trainer.py | 8 +++-
tests/tests_pytorch/utilities/test_data.py | 3 +-
10 files changed, 20 insertions(+), 56 deletions(-)
diff --git a/tests/tests_pytorch/callbacks/progress/test_rich_progress_bar.py b/tests/tests_pytorch/callbacks/progress/test_rich_progress_bar.py
index e9374f8ea4be1..f1ccf2a2726a2 100644
--- a/tests/tests_pytorch/callbacks/progress/test_rich_progress_bar.py
+++ b/tests/tests_pytorch/callbacks/progress/test_rich_progress_bar.py
@@ -21,8 +21,7 @@
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ProgressBarBase, RichProgressBar
from pytorch_lightning.callbacks.progress.rich_progress import RichProgressBarTheme
-from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset
-from tests_pytorch.helpers.datasets import RandomIterableDataset
+from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset, RandomIterableDataset
from tests_pytorch.helpers.runif import RunIf
diff --git a/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py b/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py
index 65a0fea2fb4a5..7f1692e30a3f2 100644
--- a/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py
+++ b/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py
@@ -26,10 +26,9 @@
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import StochasticWeightAveraging
-from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset
+from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset, RandomIterableDataset
from pytorch_lightning.strategies import DDPSpawnStrategy, Strategy
from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from tests_pytorch.helpers.datasets import RandomIterableDataset
from tests_pytorch.helpers.runif import RunIf
diff --git a/tests/tests_pytorch/helpers/datasets.py b/tests/tests_pytorch/helpers/datasets.py
index 3443020d4528f..c9d185313e85e 100644
--- a/tests/tests_pytorch/helpers/datasets.py
+++ b/tests/tests_pytorch/helpers/datasets.py
@@ -19,7 +19,7 @@
from typing import Optional, Sequence, Tuple
import torch
-from torch.utils.data import Dataset, IterableDataset
+from torch.utils.data import Dataset
class MNIST(Dataset):
@@ -212,40 +212,3 @@ def __getitem__(self, idx):
def __len__(self):
return len(self.y)
-
-
-class RandomDictDataset(Dataset):
- def __init__(self, size: int, length: int):
- self.len = length
- self.data = torch.randn(length, size)
-
- def __getitem__(self, index):
- a = self.data[index]
- b = a + 2
- return {"a": a, "b": b}
-
- def __len__(self):
- return self.len
-
-
-class RandomIterableDataset(IterableDataset):
- def __init__(self, size: int, count: int):
- self.count = count
- self.size = size
-
- def __iter__(self):
- for _ in range(self.count):
- yield torch.randn(self.size)
-
-
-class RandomIterableDatasetWithLen(IterableDataset):
- def __init__(self, size: int, count: int):
- self.count = count
- self.size = size
-
- def __iter__(self):
- for _ in range(len(self)):
- yield torch.randn(self.size)
-
- def __len__(self):
- return self.count
diff --git a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py
index 272b03a846688..e3c6f95f3ff47 100644
--- a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py
+++ b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py
@@ -28,13 +28,12 @@
from pytorch_lightning import LightningDataModule, LightningModule, Trainer
from pytorch_lightning.callbacks import Callback, LearningRateMonitor, ModelCheckpoint
-from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset
+from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset, RandomIterableDataset
from pytorch_lightning.plugins import DeepSpeedPrecisionPlugin
from pytorch_lightning.strategies import DeepSpeedStrategy
from pytorch_lightning.strategies.deepspeed import _DEEPSPEED_AVAILABLE, LightningDeepSpeedModule
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from tests_pytorch.helpers.datamodules import ClassifDataModule
-from tests_pytorch.helpers.datasets import RandomIterableDataset
from tests_pytorch.helpers.runif import RunIf
if _DEEPSPEED_AVAILABLE:
diff --git a/tests/tests_pytorch/trainer/flags/test_val_check_interval.py b/tests/tests_pytorch/trainer/flags/test_val_check_interval.py
index 9414fd1c5096f..e5fd9b5dd2706 100644
--- a/tests/tests_pytorch/trainer/flags/test_val_check_interval.py
+++ b/tests/tests_pytorch/trainer/flags/test_val_check_interval.py
@@ -16,10 +16,9 @@
import pytest
from torch.utils.data import DataLoader
-from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset
+from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset, RandomIterableDataset
from pytorch_lightning.trainer.trainer import Trainer
from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from tests_pytorch.helpers.datasets import RandomIterableDataset
@pytest.mark.parametrize("max_epochs", [1, 2, 3])
diff --git a/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py b/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py
index d16be306b9365..85ed3d8e3471d 100644
--- a/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py
+++ b/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py
@@ -28,9 +28,8 @@
from pytorch_lightning import callbacks, Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, TQDMProgressBar
from pytorch_lightning.core.module import LightningModule
-from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset
+from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset, RandomDictDataset
from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from tests_pytorch.helpers.datasets import RandomDictDataset
from tests_pytorch.helpers.runif import RunIf
diff --git a/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py b/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py
index 92a1126294dfc..846a39a748a60 100644
--- a/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py
+++ b/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py
@@ -22,11 +22,10 @@
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks.gradient_accumulation_scheduler import GradientAccumulationScheduler
-from pytorch_lightning.demos.boring_classes import BoringModel
+from pytorch_lightning.demos.boring_classes import BoringModel, RandomIterableDataset
from pytorch_lightning.strategies.ipu import IPUStrategy
from pytorch_lightning.utilities import device_parser
from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from tests_pytorch.helpers.datasets import RandomIterableDataset
from tests_pytorch.helpers.runif import RunIf
diff --git a/tests/tests_pytorch/trainer/test_dataloaders.py b/tests/tests_pytorch/trainer/test_dataloaders.py
index 5bea5a4cbbe1c..34504392dc0c1 100644
--- a/tests/tests_pytorch/trainer/test_dataloaders.py
+++ b/tests/tests_pytorch/trainer/test_dataloaders.py
@@ -25,12 +25,16 @@
from pytorch_lightning import Callback, seed_everything, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
-from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset
+from pytorch_lightning.demos.boring_classes import (
+ BoringModel,
+ RandomDataset,
+ RandomIterableDataset,
+ RandomIterableDatasetWithLen,
+)
from pytorch_lightning.trainer.states import RunningStage
from pytorch_lightning.utilities.data import _auto_add_worker_init_fn, has_iterable_dataset, has_len_all_ranks
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from tests_pytorch.helpers.dataloaders import CustomInfDataloader, CustomNotImplementedErrorDataloader
-from tests_pytorch.helpers.datasets import RandomIterableDataset, RandomIterableDatasetWithLen
from tests_pytorch.helpers.runif import RunIf
diff --git a/tests/tests_pytorch/trainer/test_trainer.py b/tests/tests_pytorch/trainer/test_trainer.py
index e4be8929f9c7e..9506acee425d0 100644
--- a/tests/tests_pytorch/trainer/test_trainer.py
+++ b/tests/tests_pytorch/trainer/test_trainer.py
@@ -41,7 +41,12 @@
from pytorch_lightning.callbacks.fault_tolerance import _FaultToleranceCheckpoint
from pytorch_lightning.callbacks.prediction_writer import BasePredictionWriter
from pytorch_lightning.core.saving import load_hparams_from_tags_csv, load_hparams_from_yaml, save_hparams_to_tags_csv
-from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset
+from pytorch_lightning.demos.boring_classes import (
+ BoringModel,
+ RandomDataset,
+ RandomIterableDataset,
+ RandomIterableDatasetWithLen,
+)
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper, UnrepeatedDistributedSampler
from pytorch_lightning.strategies import (
@@ -60,7 +65,6 @@
from pytorch_lightning.utilities.imports import _OMEGACONF_AVAILABLE, _TORCH_GREATER_EQUAL_1_12
from pytorch_lightning.utilities.seed import seed_everything
from tests_pytorch.helpers.datamodules import ClassifDataModule
-from tests_pytorch.helpers.datasets import RandomIterableDataset, RandomIterableDatasetWithLen
from tests_pytorch.helpers.runif import RunIf
from tests_pytorch.helpers.simple_models import ClassificationModel
diff --git a/tests/tests_pytorch/utilities/test_data.py b/tests/tests_pytorch/utilities/test_data.py
index ffb898efaa815..3700feaba9992 100644
--- a/tests/tests_pytorch/utilities/test_data.py
+++ b/tests/tests_pytorch/utilities/test_data.py
@@ -6,7 +6,7 @@
from torch.utils.data import BatchSampler, DataLoader, RandomSampler, SequentialSampler
from pytorch_lightning import Trainer
-from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset
+from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset, RandomIterableDataset
from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper
from pytorch_lightning.trainer.states import RunningStage
from pytorch_lightning.utilities.data import (
@@ -23,7 +23,6 @@
warning_cache,
)
from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from tests_pytorch.helpers.datasets import RandomIterableDataset
from tests_pytorch.helpers.utils import no_warning_call
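For reference, after this consolidation the helper datasets ship with the public demos module, so they can be imported directly alongside `BoringModel`. A minimal sketch (the `(size, count)` signature is inferred from the test usage above):

    from torch.utils.data import DataLoader

    from pytorch_lightning.demos.boring_classes import BoringModel, RandomIterableDataset

    # build a loader over an iterable dataset with no __len__, as the tests above do
    loader = DataLoader(RandomIterableDataset(32, 64))
    model = BoringModel()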
From 2abed91c5386ee9434b4e45e859e91d06bef3080 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?=
Date: Wed, 10 Aug 2022 17:25:44 +0200
Subject: [PATCH 37/59] Update CODEOWNERS (#14119)
* Update CODEOWNERS
* Cleanup and remove old sections
* pl focus
Co-authored-by: Jirka Borovec
---
.github/CODEOWNERS | 40 ++++++++++++++----------------
src/pytorch_lightning/__about__.py | 1 -
2 files changed, 18 insertions(+), 23 deletions(-)
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index f83924b9566ce..0b4692731bff9 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -5,7 +5,7 @@
# the repo. Unless a later match takes precedence,
# @global-owner1 and @global-owner2 will be requested for
# review when someone opens a pull request.
-* @williamfalcon @borda @tchaton @carmocca @awaelchli @justusschock @kaushikb11 @rohitgr7
+* @williamfalcon @borda @tchaton @awaelchli @kaushikb11 @rohitgr7
# CI/CD and configs
/.github/ @borda @carmocca @akihironitta @tchaton
@@ -26,13 +26,14 @@
/docs/source-app/expertise_levels @williamfalcon @Felonious-Spellfire @RobertLaurella
# Packages
+/src/pytorch_lightning @carmocca @justusschock
/src/pytorch_lightning/accelerators @williamfalcon @tchaton @SeanNaren @awaelchli @justusschock @kaushikb11
/src/pytorch_lightning/callbacks @williamfalcon @tchaton @carmocca @borda @kaushikb11
/src/pytorch_lightning/core @tchaton @borda @carmocca @justusschock @kaushikb11
/src/pytorch_lightning/distributed @williamfalcon @tchaton @awaelchli @kaushikb11
/src/pytorch_lightning/lite @tchaton @awaelchli @carmocca
/src/pytorch_lightning/loggers @tchaton @awaelchli @borda
-/src/pytorch_lightning/loggers/wandb.py @borisdayma
+/src/pytorch_lightning/loggers/wandb.py @borisdayma @borda
/src/pytorch_lightning/loggers/neptune.py @shnela @HubertJaworski @pkasprzyk @pitercl @Raalsky @aniezurawski @kamil-kaczmarek
/src/pytorch_lightning/loops @tchaton @awaelchli @justusschock @carmocca
/src/pytorch_lightning/overrides @tchaton @borda
@@ -46,7 +47,7 @@
/src/pytorch_lightning/utilities @borda @tchaton @carmocca
/src/lightning_app @tchaton @manskx
-/src/lightning_app/cli/pl-app-template @awaelchli @tchaton @Borda
+/src/lightning_app/cli/pl-app-template @tchaton @awaelchli @Borda
/src/lightning_app/core @tchaton @awaelchli @manskx
/src/lightning_app/core/queues.py @tchaton @hhsecond @manskx
/src/lightning_app/runners/cloud.py @tchaton @hhsecond
@@ -54,28 +55,23 @@
/src/lightning_app/__about__.py @nohalon @edenlightning @lantiga
# Examples
-/examples/app_* @tchaton @awaelchli @manskx @hhsecond
+/examples/app_* @tchaton @awaelchli @manskx @hhsecond
# App tests
-/tests/tests_app @tchaton @awaelchli @manskx @hhsecond
-/tests/tests_app_examples @tchaton @awaelchli @manskx @hhsecond
+/tests/tests_app @tchaton @awaelchli @manskx @hhsecond
+/tests/tests_app_examples @tchaton @awaelchli @manskx @hhsecond
# Specifics
-/src/pytorch_lightning/trainer/connectors/logger_connector @tchaton @carmocca
-/src/pytorch_lightning/trainer/progress.py @tchaton @awaelchli @carmocca
-
+/src/pytorch_lightning/trainer/connectors/logger_connector @tchaton @carmocca
+/src/pytorch_lightning/trainer/progress.py @tchaton @awaelchli @carmocca
# API
-/src/pytorch_lightning/callbacks/base.py @williamfalcon @awaelchli @ananthsub @carmocca
-/src/pytorch_lightning/core/datamodule.py @williamFalcon @awaelchli @ananthsub @carmocca
-/src/pytorch_lightning/trainer/trainer.py @williamfalcon @tchaton @awaelchli
-/src/pytorch_lightning/core/hooks.py @williamfalcon @tchaton @awaelchli @ananthsub @carmocca
-/src/pytorch_lightning/core/lightning.py @williamfalcon @tchaton @awaelchli
-
-# Testing
-/tests/helpers/boring_model.py @williamfalcon @tchaton @borda
+/src/pytorch_lightning/callbacks/callback.py @williamfalcon @awaelchli @ananthsub @carmocca
+/src/pytorch_lightning/core/datamodule.py @williamFalcon @awaelchli @ananthsub @carmocca
+/src/pytorch_lightning/trainer/trainer.py @williamfalcon @tchaton @awaelchli
+/src/pytorch_lightning/core/hooks.py @williamfalcon @tchaton @awaelchli @ananthsub @carmocca
+/src/pytorch_lightning/core/module.py @williamfalcon @tchaton @awaelchli
-/.github/CODEOWNERS @williamfalcon
-/.github/approve_config.yml @williamfalcon
-/SECURITY.md @williamfalcon
-/README.md @williamfalcon @edenlightning @borda
-/setup.py @williamfalcon @borda @carmocca
+/.github/CODEOWNERS @williamfalcon
+/SECURITY.md @williamfalcon
+/README.md @williamfalcon @edenlightning @borda
+/setup.py @williamfalcon @borda @carmocca
/src/pytorch_lightning/__about__.py @williamfalcon @borda @carmocca
diff --git a/src/pytorch_lightning/__about__.py b/src/pytorch_lightning/__about__.py
index 6d09c5264e1ab..e2fdbd9ee3016 100644
--- a/src/pytorch_lightning/__about__.py
+++ b/src/pytorch_lightning/__about__.py
@@ -13,7 +13,6 @@
# limitations under the License.
import time
-# __version__ = "1.7.0"
__author__ = "Lightning AI et al."
__author_email__ = "pytorch@lightning.ai"
__license__ = "Apache-2.0"
From 527b28ed974c326f9e86c334b0c5bd477b635f89 Mon Sep 17 00:00:00 2001
From: Krishna Kalyan
Date: Wed, 10 Aug 2022 16:26:44 +0100
Subject: [PATCH 38/59] Fix mypy errors attributed to
`pytorch_lightning.profilers.simple` (#14103)
---
pyproject.toml | 1 -
src/pytorch_lightning/profilers/simple.py | 19 +++++++++++--------
2 files changed, 11 insertions(+), 9 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml
index 8db782df357d8..b5e806bc69900 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -57,7 +57,6 @@ module = [
"pytorch_lightning.demos.mnist_datamodule",
"pytorch_lightning.profilers.base",
"pytorch_lightning.profilers.pytorch",
- "pytorch_lightning.profilers.simple",
"pytorch_lightning.strategies.sharded",
"pytorch_lightning.strategies.sharded_spawn",
"pytorch_lightning.trainer.callback_hook",
diff --git a/src/pytorch_lightning/profilers/simple.py b/src/pytorch_lightning/profilers/simple.py
index 20d76f9b2d378..0fb9497ff17fb 100644
--- a/src/pytorch_lightning/profilers/simple.py
+++ b/src/pytorch_lightning/profilers/simple.py
@@ -60,7 +60,7 @@ def __init__(
"""
super().__init__(dirpath=dirpath, filename=filename)
self.current_actions: Dict[str, float] = {}
- self.recorded_durations = defaultdict(list)
+ self.recorded_durations: Dict = defaultdict(list)
self.extended = extended
self.start_time = time.monotonic()
@@ -104,20 +104,23 @@ def summary(self) -> str:
if len(self.recorded_durations) > 0:
max_key = max(len(k) for k in self.recorded_durations.keys())
- def log_row(action, mean, num_calls, total, per):
+ def log_row_extended(action: str, mean: str, num_calls: str, total: str, per: str) -> str:
row = f"{sep}| {action:<{max_key}s}\t| {mean:<15}\t|"
row += f" {num_calls:<15}\t| {total:<15}\t| {per:<15}\t|"
return row
- header_string = log_row("Action", "Mean duration (s)", "Num calls", "Total time (s)", "Percentage %")
+ header_string = log_row_extended(
+ "Action", "Mean duration (s)", "Num calls", "Total time (s)", "Percentage %"
+ )
output_string_len = len(header_string.expandtabs())
sep_lines = f"{sep}{'-' * output_string_len}"
output_string += sep_lines + header_string + sep_lines
- report, total_calls, total_duration = self._make_report_extended()
- output_string += log_row("Total", "-", f"{total_calls:}", f"{total_duration:.5}", "100 %")
+ report_extended: _TABLE_DATA_EXTENDED
+ report_extended, total_calls, total_duration = self._make_report_extended()
+ output_string += log_row_extended("Total", "-", f"{total_calls:}", f"{total_duration:.5}", "100 %")
output_string += sep_lines
- for action, mean_duration, num_calls, total_duration, duration_per in report:
- output_string += log_row(
+ for action, mean_duration, num_calls, total_duration, duration_per in report_extended:
+ output_string += log_row_extended(
action,
f"{mean_duration:.5}",
f"{num_calls}",
@@ -128,7 +131,7 @@ def log_row(action, mean, num_calls, total, per):
else:
max_key = max(len(k) for k in self.recorded_durations)
- def log_row(action, mean, total):
+ def log_row(action: str, mean: str, total: str) -> str:
return f"{sep}| {action:<{max_key}s}\t| {mean:<15}\t| {total:<15}\t|"
header_string = log_row("Action", "Mean duration (s)", "Total time (s)")
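The essence of this mypy fix is giving previously-inferred-`Any` values explicit annotations. A tighter sketch of the `recorded_durations` annotation than the bare `Dict` used in the patch, assuming durations are stored as floats per action name:

    from collections import defaultdict
    from typing import Dict, List

    # maps an action name to the list of its recorded durations (seconds)
    recorded_durations: Dict[str, List[float]] = defaultdict(list)
    recorded_durations["training_step"].append(0.0123)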
From 6f4edd721f9852d8f4afaa49edd1f80c5fc6dc72 Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 10 Aug 2022 09:03:51 -0700
Subject: [PATCH 39/59] Update README.md
---
README.md | 2 ++
1 file changed, 2 insertions(+)
diff --git a/README.md b/README.md
index 9c03e3707ec24..2d32094f6595f 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,5 @@
+### ** NEWS: PyTorch Lightning has been renamed Lightning! In addition to building models, you can now build research workflows and production pipelines**
+
From f132d44821f9fe7ad83d74edbb13dc6ee7769a3d Mon Sep 17 00:00:00 2001
From: otaj <6065855+otaj@users.noreply.github.com>
Date: Wed, 10 Aug 2022 18:09:50 +0200
Subject: [PATCH 40/59] Fix a bug that caused spurious `AttributeError` when
multiple `DataLoader` classes are imported (#14117)
---
src/pytorch_lightning/CHANGELOG.md | 3 +++
src/pytorch_lightning/utilities/data.py | 10 +++++----
tests/tests_pytorch/utilities/test_data.py | 25 ++++++++++++++++++++++
3 files changed, 34 insertions(+), 4 deletions(-)
diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
index 90285b55c8037..97bb317b02a14 100644
--- a/src/pytorch_lightning/CHANGELOG.md
+++ b/src/pytorch_lightning/CHANGELOG.md
@@ -52,6 +52,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
### Fixed
+- Fixed a bug that caused spurious `AttributeError` when multiple `DataLoader` classes are imported ([#14117](https://github.com/Lightning-AI/lightning/pull/14117))
+
+
- Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061))
diff --git a/src/pytorch_lightning/utilities/data.py b/src/pytorch_lightning/utilities/data.py
index 00a7cb8486709..b625a046f6122 100644
--- a/src/pytorch_lightning/utilities/data.py
+++ b/src/pytorch_lightning/utilities/data.py
@@ -501,15 +501,17 @@ def _replace_init_method(base_cls: Type, store_explicit_arg: Optional[str] = Non
It patches the ``__init__`` method.
"""
classes = _get_all_subclasses(base_cls) | {base_cls}
- wrapped = set()
for cls in classes:
- if cls.__init__ not in wrapped:
+ # Check that __init__ belongs to the class
+ # https://stackoverflow.com/a/5253424
+ if "__init__" in cls.__dict__:
cls._old_init = cls.__init__
cls.__init__ = _wrap_init_method(cls.__init__, store_explicit_arg)
- wrapped.add(cls.__init__)
yield
for cls in classes:
- if hasattr(cls, "_old_init"):
+ # Check that _old_init belongs to the class
+ # https://stackoverflow.com/a/5253424
+ if "_old_init" in cls.__dict__:
cls.__init__ = cls._old_init
del cls._old_init
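The distinction the fix relies on: `hasattr` walks the MRO, while membership in `cls.__dict__` reports only attributes the class itself owns. A standalone sketch:

    class Base:
        def __init__(self):
            pass

    class Child(Base):  # defines no __init__ of its own
        pass

    assert hasattr(Child, "__init__")        # True -- inherited from Base
    assert "__init__" not in Child.__dict__  # Child does not own it
    assert "__init__" in Base.__dict__       # Base does
    # deleting the inherited attribute on the subclass fails, which is
    # exactly what the old hasattr-based bookkeeping tripped over:
    # del Child.__init__  -> AttributeError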
diff --git a/tests/tests_pytorch/utilities/test_data.py b/tests/tests_pytorch/utilities/test_data.py
index 3700feaba9992..cc70417988616 100644
--- a/tests/tests_pytorch/utilities/test_data.py
+++ b/tests/tests_pytorch/utilities/test_data.py
@@ -1,3 +1,4 @@
+import random
from dataclasses import dataclass
import pytest
@@ -172,6 +173,30 @@ def __init__(self, randomize, *args, **kwargs):
assert isinstance(new_dataloader, GoodImpl)
+def test_replace_init_method_multiple_loaders_without_init():
+    """For a class that inherits from a class we are patching but does not define its own `__init__` method (the
+    one we wrap), `hasattr(cls, "_old_init")` can be True because of the parent class, yet the attribute cannot be
+    deleted on the subclass because it is owned by the parent. Moreover, the error occurred only sometimes, since
+    it depends on the order in which we iterate over the set of classes being patched.
+
+    This test simulates the behavior by generating a sufficient number of dummy classes that do not define
+    `__init__` and are children of `DataLoader`. It checks that a) the `_replace_init_method` context manager
+    exits cleanly, and b) the mechanism checking for the presence of `_old_init` works as expected.
+ """
+ classes = [DataLoader]
+ for i in range(100):
+ classes.append(type(f"DataLoader_{i}", (random.choice(classes),), {}))
+
+ with _replace_init_method(DataLoader, "dataset"):
+ for cls in classes[1:]: # First one is `DataLoader`
+ assert "_old_init" not in cls.__dict__
+ assert hasattr(cls, "_old_init")
+
+ assert "_old_init" in DataLoader.__dict__
+ assert hasattr(DataLoader, "_old_init")
+
+
class DataLoaderSubclass1(DataLoader):
def __init__(self, attribute1, *args, **kwargs):
self.at1 = attribute1
From 45a10a137cbbc7bd07bf3bf4b7c4b8b8a9439516 Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Wed, 10 Aug 2022 18:22:44 +0200
Subject: [PATCH 41/59] update chlog after 0.5.5 (#14133)
---
src/lightning_app/CHANGELOG.md | 65 ++++++++++++++++++++++++++++++++--
1 file changed, 62 insertions(+), 3 deletions(-)
diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md
index f32d07697f376..ea28c57611311 100644
--- a/src/lightning_app/CHANGELOG.md
+++ b/src/lightning_app/CHANGELOG.md
@@ -9,27 +9,86 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
### Added
- Add support for `Lightning App Commands` through the `configure_commands` hook on the Lightning Flow and the `ClientCommand` ([#13602](https://github.com/Lightning-AI/lightning/pull/13602))
+
+
- Add support for Lightning AI BYOC cluster management ([#13835](https://github.com/Lightning-AI/lightning/pull/13835))
+
+
- Add support to run Lightning apps on Lightning AI BYOC clusters ([#13894](https://github.com/Lightning-AI/lightning/pull/13894))
+
+
- Add support for listing Lightning AI apps ([#13987](https://github.com/Lightning-AI/lightning/pull/13987))
+
+
- Adds `LightningTrainingComponent`. `LightningTrainingComponent` orchestrates multi-node training in the cloud ([#13830](https://github.com/Lightning-AI/lightning/pull/13830))
- Add support for printing application logs using CLI `lightning show logs [components]` ([#13634](https://github.com/Lightning-AI/lightning/pull/13634))
+
### Changed
-- Update the Lightning App docs ([#13537](https://github.com/Lightning-AI/lightning/pull/13537))
+-
+
### Changed
-- Added `LIGHTNING_` prefix to Platform AWS credentials ([#13703](https://github.com/Lightning-AI/lightning/pull/13703))
+-
+
- Run the flow only if the state has changed from the previous execution ([#14076](https://github.com/Lightning-AI/lightning/pull/14076))
### Deprecated
+-
+
+
### Fixed
-- Resolved a bug where the work statuses will grow quickly and be duplicated ([#13970](https://github.com/Lightning-AI/lightning/pull/13970))
+-
+
+
+## [0.5.5] - 2022-08-09
+### Deprecated
+
+- Deprecate sheety API ([#14004](https://github.com/Lightning-AI/lightning/pull/14004))
+
+### Fixed
+
+- Resolved a bug where the work statuses will grow quickly and be duplicated ([#13970](https://github.com/Lightning-AI/lightning/pull/13970))
- Resolved a bug about a race condition when sending the work state through the caller_queue ([#14074](https://github.com/Lightning-AI/lightning/pull/14074))
+- Fixed Start Lightning App on Cloud if Repo Begins With Name "Lightning" ([#14025](https://github.com/Lightning-AI/lightning/pull/14025))
+
+
+## [0.5.4] - 2022-08-01
+
+### Changed
+
+- Wrapped imports for traceability ([#13924](https://github.com/Lightning-AI/lightning/pull/13924))
+- Set version as today ([#13906](https://github.com/Lightning-AI/lightning/pull/13906))
+
+### Fixed
+
+- Included app templates to the lightning and app packages ([#13731](https://github.com/Lightning-AI/lightning/pull/13731))
+- Added UI for install all ([#13732](https://github.com/Lightning-AI/lightning/pull/13732))
+- Fixed build meta pkg flow ([#13926](https://github.com/Lightning-AI/lightning/pull/13926))
+
+## [0.5.3] - 2022-07-25
+
+### Changed
+
+- Pruned requirements duplicity ([#13739](https://github.com/Lightning-AI/lightning/pull/13739))
+
+### Fixed
+
+- Use correct python version in lightning component template ([#13790](https://github.com/Lightning-AI/lightning/pull/13790))
+
+## [0.5.2] - 2022-07-18
+
+### Added
+
+- Update the Lightning App docs ([#13537](https://github.com/Lightning-AI/lightning/pull/13537))
+
+### Changed
+
+- Added `LIGHTNING_` prefix to Platform AWS credentials ([#13703](https://github.com/Lightning-AI/lightning/pull/13703))
From e226180527b065813bb1ba5e83f4990c3b81d444 Mon Sep 17 00:00:00 2001
From: thomas chaton
Date: Wed, 10 Aug 2022 19:26:01 +0200
Subject: [PATCH 42/59] (app) Remove ClickRunner (#14147)
---
README.md | 2 +-
tests/tests_app_examples/test_boring_app.py | 7 +------
2 files changed, 2 insertions(+), 7 deletions(-)
diff --git a/README.md b/README.md
index 2d32094f6595f..f9d5a9a57f5e2 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-### ** NEWS: PyTorch Lightning has been renamed Lightning! In addition to building models, you can now build research workflows and production pipelines**
+### \*\* NEWS: PyTorch Lightning has been renamed Lightning! In addition to building models, you can now build research workflows and production pipelines\*\*
diff --git a/tests/tests_app_examples/test_boring_app.py b/tests/tests_app_examples/test_boring_app.py
index f8143b1db1a88..0ca1b823b4706 100644
--- a/tests/tests_app_examples/test_boring_app.py
+++ b/tests/tests_app_examples/test_boring_app.py
@@ -13,7 +13,6 @@ def test_boring_app_example_cloud() -> None:
with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "examples/app_boring/"), app_name="app_dynamic.py") as (
_,
view_page,
- _,
name,
):
@@ -31,8 +30,4 @@ def check_hello_there(*_, **__):
assert result.exit_code == 0
assert result.exception is None
- assert len(lines) > 1, result.output
- # We know that at some point we need to intstall lightning, so we check for that
- assert any(
- "Successfully built lightning" in line for line in lines
- ), f"Did not find logs with lightning installation: {result.output}"
+ assert any("http://0.0.0.0:8080" in line for line in lines)
From 3966f959aab2682df26f9712c37e468704304792 Mon Sep 17 00:00:00 2001
From: Jirka Borovec
Date: Wed, 10 Aug 2022 19:38:39 +0200
Subject: [PATCH 43/59] relax `docker` requirement (#14009)
---
requirements/app/cloud.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/requirements/app/cloud.txt b/requirements/app/cloud.txt
index ff18d47b44565..6644a56a2894b 100644
--- a/requirements/app/cloud.txt
+++ b/requirements/app/cloud.txt
@@ -1,4 +1,4 @@
starsessions
redis>=4.0.0, <=4.2.4
-docker==5.0.3
+docker>=5.0.0, <=5.0.3
# setuptools==59.5.0
From f11f1e2bb470a57f4043a41b1cdf194071c4be1e Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 10 Aug 2022 19:40:34 +0200
Subject: [PATCH 44/59] Update gcsfs requirement from <2022.6.0,>=2021.5.0 to
>=2021.5.0,<2022.8.0 in /requirements (#14079)
Update gcsfs requirement in /requirements
Updates the requirements on [gcsfs](https://github.com/fsspec/gcsfs) to permit the latest version.
- [Release notes](https://github.com/fsspec/gcsfs/releases)
- [Commits](https://github.com/fsspec/gcsfs/compare/2021.05.0...2022.7.1)
---
updated-dependencies:
- dependency-name: gcsfs
dependency-type: direct:production
...
Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
requirements/pytorch/extra.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/requirements/pytorch/extra.txt b/requirements/pytorch/extra.txt
index c386c5581cc42..20b6c1b8dbc12 100644
--- a/requirements/pytorch/extra.txt
+++ b/requirements/pytorch/extra.txt
@@ -7,5 +7,5 @@ torchtext>=0.10.*, <0.14.0
omegaconf>=2.0.5, <2.3.0
hydra-core>=1.0.5, <1.3.0
jsonargparse[signatures]>=4.12.0, <=4.12.0
-gcsfs>=2021.5.0, <2022.6.0
+gcsfs>=2021.5.0, <2022.8.0
rich>=10.14.0, !=10.15.0.a, <13.0.0
From 7e7736778bfc1f3864d878458b9de87de7ded52c Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Wed, 10 Aug 2022 14:27:35 -0400
Subject: [PATCH 45/59] Update onnxruntime requirement from <=1.12.0 to <1.13.0
in /requirements (#14083)
Updates the requirements on [onnxruntime](https://github.com/microsoft/onnxruntime) to permit the latest version.
- [Release notes](https://github.com/microsoft/onnxruntime/releases)
- [Changelog](https://github.com/microsoft/onnxruntime/blob/master/docs/ReleaseManagement.md)
- [Commits](https://github.com/microsoft/onnxruntime/compare/v0.1.4...v1.12.1)
---
updated-dependencies:
- dependency-name: onnxruntime
dependency-type: direct:production
...
Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
requirements/pytorch/test.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/requirements/pytorch/test.txt b/requirements/pytorch/test.txt
index c155400a3d35f..f8bd5793a0af6 100644
--- a/requirements/pytorch/test.txt
+++ b/requirements/pytorch/test.txt
@@ -10,7 +10,7 @@ mypy==0.971
# needed in tests
cloudpickle>=1.3, <=2.1.0
scikit-learn>0.22.1, <=1.1.1
-onnxruntime<=1.12.0
+onnxruntime<1.13.0
psutil<=5.9.1 # for `DeviceStatsMonitor`
pandas>1.0, <=1.4.3 # needed in benchmarks
fastapi<=0.79.0
From 784b60412c1dec73c5f7c90ced343d2bbd394c25 Mon Sep 17 00:00:00 2001
From: panos-is <102533125+panos-is@users.noreply.github.com>
Date: Wed, 10 Aug 2022 23:07:23 +0300
Subject: [PATCH 46/59] (app) Add s3 drive type (1/2) (#14002)
* Add S3 protocol and optimization field to the drive object
* Add a list of drives to the work specification
* [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
* add only protocol for s3 drives, no optimization arguments, and add tests
* added trailing slash criteria
* allow slash in s3 drives
* fix
* fixed test issues
Co-authored-by: Panos Lantavos-Stratigakis
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Rick Izzo
Co-authored-by: Jirka Borovec
Co-authored-by: Rick Izzo
---
src/lightning_app/storage/drive.py | 33 ++++++++++-
tests/tests_app/storage/test_drive.py | 81 +++++++++++++++++++--------
2 files changed, 90 insertions(+), 24 deletions(-)
diff --git a/src/lightning_app/storage/drive.py b/src/lightning_app/storage/drive.py
index 3bcdf72780653..b69d2581851b8 100644
--- a/src/lightning_app/storage/drive.py
+++ b/src/lightning_app/storage/drive.py
@@ -13,7 +13,7 @@
class Drive:
__IDENTIFIER__ = "__drive__"
- __PROTOCOLS__ = ["lit://"]
+ __PROTOCOLS__ = ["lit://", "s3://"]
def __init__(
self,
@@ -35,15 +35,28 @@ def __init__(
root_folder: This is the folder from where the Drive perceives the data (e.g this acts as a mount dir).
"""
self.id = None
+ self.protocol = None
for protocol in self.__PROTOCOLS__:
if id.startswith(protocol):
self.protocol = protocol
self.id = id.replace(protocol, "")
+ break
+ else: # N.B. for-else loop
+ raise ValueError(
+            f"Unknown protocol for the drive 'id' argument '{id}'. The 'id' string "
+ f"must start with one of the following prefixes {self.__PROTOCOLS__}"
+ )
+
+ if self.protocol == "s3://" and not self.id.endswith("/"):
+ raise ValueError(
+ "S3 drives must end in a trailing slash (`/`) to indicate a folder is being mounted. "
+            f"Received: '{id}'. Mounting a single file is not currently supported."
+ )
if not self.id:
raise Exception(f"The Drive id needs to start with one of the following protocols: {self.__PROTOCOLS__}")
- if "/" in self.id:
+ if self.protocol != "s3://" and "/" in self.id:
raise Exception(f"The id should be unique to identify your drive. Found `{self.id}`.")
self.root_folder = pathlib.Path(root_folder).resolve() if root_folder else os.getcwd()
@@ -75,6 +88,10 @@ def put(self, path: str) -> None:
raise Exception("The component name needs to be known to put a path to the Drive.")
if _is_flow_context():
raise Exception("The flow isn't allowed to put files into a Drive.")
+ if self.protocol == "s3://":
+ raise PermissionError(
+ "S3 based drives cannot currently add files via this API. Did you mean to use `lit://` drives?"
+ )
self._validate_path(path)
@@ -98,6 +115,10 @@ def list(self, path: Optional[str] = ".", component_name: Optional[str] = None)
"""
if _is_flow_context():
raise Exception("The flow isn't allowed to list files from a Drive.")
+ if self.protocol == "s3://":
+ raise PermissionError(
+ "S3 based drives cannot currently list files via this API. Did you mean to use `lit://` drives?"
+ )
if component_name:
paths = [
@@ -142,6 +163,10 @@ def get(
"""
if _is_flow_context():
raise Exception("The flow isn't allowed to get files from a Drive.")
+ if self.protocol == "s3://":
+ raise PermissionError(
+ "S3 based drives cannot currently get files via this API. Did you mean to use `lit://` drives?"
+ )
if component_name:
shared_path = self._to_shared_path(
@@ -189,6 +214,10 @@ def delete(self, path: str) -> None:
"""
if not self.component_name:
raise Exception("The component name needs to be known to delete a path to the Drive.")
+ if self.protocol == "s3://":
+ raise PermissionError(
+ "S3 based drives cannot currently delete files via this API. Did you mean to use `lit://` drives?"
+ )
shared_path = self._to_shared_path(
path,
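A standalone sketch of the parsing logic introduced above; the for-else runs its `else` branch only when the loop finishes without a `break`:

    PROTOCOLS = ["lit://", "s3://"]

    def parse_drive_id(drive_id: str):
        for protocol in PROTOCOLS:
            if drive_id.startswith(protocol):
                break
        else:  # no protocol matched
            raise ValueError(f"Unknown protocol for the drive 'id' argument '{drive_id}'.")
        rest = drive_id[len(protocol):]
        if protocol == "s3://" and not rest.endswith("/"):
            raise ValueError("S3 drives must end in a trailing slash (`/`).")
        return protocol, rest

    assert parse_drive_id("s3://foo/") == ("s3://", "foo/")
    assert parse_drive_id("lit://drive_1") == ("lit://", "drive_1")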
diff --git a/tests/tests_app/storage/test_drive.py b/tests/tests_app/storage/test_drive.py
index 3d9db44c10e13..0d452571d9f43 100644
--- a/tests/tests_app/storage/test_drive.py
+++ b/tests/tests_app/storage/test_drive.py
@@ -11,7 +11,7 @@
from lightning_app.utilities.component import _set_flow_context
-class SyncWorkA(LightningWork):
+class SyncWorkLITDriveA(LightningWork):
def __init__(self, tmpdir):
super().__init__()
self.tmpdir = tmpdir
@@ -25,19 +25,19 @@ def run(self, drive: Drive):
os.remove(f"{self.tmpdir}/a.txt")
-class SyncWorkB(LightningWork):
+class SyncWorkLITDriveB(LightningWork):
def run(self, drive: Drive):
assert not os.path.exists("a.txt")
drive.get("a.txt")
assert os.path.exists("a.txt")
-class SyncFlow(LightningFlow):
+class SyncFlowLITDrives(LightningFlow):
def __init__(self, tmpdir):
super().__init__()
self.log_dir = Drive("lit://log_dir")
- self.work_a = SyncWorkA(str(tmpdir))
- self.work_b = SyncWorkB()
+ self.work_a = SyncWorkLITDriveA(str(tmpdir))
+ self.work_b = SyncWorkLITDriveB()
def run(self):
self.work_a.run(self.log_dir)
@@ -45,15 +45,15 @@ def run(self):
self._exit()
-def test_synchronization_drive(tmpdir):
+def test_synchronization_lit_drive(tmpdir):
if os.path.exists("a.txt"):
os.remove("a.txt")
- app = LightningApp(SyncFlow(tmpdir))
+ app = LightningApp(SyncFlowLITDrives(tmpdir))
MultiProcessRuntime(app, start_server=False).dispatch()
os.remove("a.txt")
-class Work(LightningWork):
+class LITDriveWork(LightningWork):
def __init__(self):
super().__init__(parallel=True)
self.drive = None
@@ -75,7 +75,7 @@ def run(self, *args, **kwargs):
self.counter += 1
-class Work2(LightningWork):
+class LITDriveWork2(LightningWork):
def __init__(self):
super().__init__(parallel=True)
@@ -86,11 +86,11 @@ def run(self, drive: Drive, **kwargs):
assert drive.list(".", component_name=self.name) == []
-class Flow(LightningFlow):
+class LITDriveFlow(LightningFlow):
def __init__(self):
super().__init__()
- self.work = Work()
- self.work2 = Work2()
+ self.work = LITDriveWork()
+ self.work2 = LITDriveWork2()
def run(self):
self.work.run("0")
@@ -102,15 +102,15 @@ def run(self):
self._exit()
-def test_drive_transferring_files():
- app = LightningApp(Flow())
+def test_lit_drive_transferring_files():
+ app = LightningApp(LITDriveFlow())
MultiProcessRuntime(app, start_server=False).dispatch()
os.remove("a.txt")
-def test_drive():
- with pytest.raises(Exception, match="The Drive id needs to start with one of the following protocols"):
- Drive("this_drive_id")
+def test_lit_drive():
+ with pytest.raises(Exception, match="Unknown protocol for the drive 'id' argument"):
+ Drive("invalid_drive_id")
with pytest.raises(
Exception, match="The id should be unique to identify your drive. Found `this_drive_id/something_else`."
@@ -213,9 +213,46 @@ def test_drive():
os.remove("a.txt")
-def test_maybe_create_drive():
+def test_s3_drives():
+ drive = Drive("s3://foo/", allow_duplicates=True)
+ drive.component_name = "root.work"
- drive = Drive("lit://drive_3", allow_duplicates=False)
+ with pytest.raises(
+ Exception, match="S3 based drives cannot currently add files via this API. Did you mean to use `lit://` drives?"
+ ):
+ drive.put("a.txt")
+ with pytest.raises(
+ Exception,
+ match="S3 based drives cannot currently list files via this API. Did you mean to use `lit://` drives?",
+ ):
+ drive.list("a.txt")
+ with pytest.raises(
+ Exception, match="S3 based drives cannot currently get files via this API. Did you mean to use `lit://` drives?"
+ ):
+ drive.get("a.txt")
+ with pytest.raises(
+ Exception,
+ match="S3 based drives cannot currently delete files via this API. Did you mean to use `lit://` drives?",
+ ):
+ drive.delete("a.txt")
+
+ _set_flow_context()
+ with pytest.raises(Exception, match="The flow isn't allowed to put files into a Drive."):
+ drive.put("a.txt")
+ with pytest.raises(Exception, match="The flow isn't allowed to list files from a Drive."):
+ drive.list("a.txt")
+ with pytest.raises(Exception, match="The flow isn't allowed to get files from a Drive."):
+ drive.get("a.txt")
+
+
+def test_create_s3_drive_without_trailing_slash_fails():
+ with pytest.raises(ValueError, match="S3 drives must end in a trailing slash"):
+ Drive("s3://foo")
+
+
+@pytest.mark.parametrize("drive_id", ["lit://drive", "s3://drive/"])
+def test_maybe_create_drive(drive_id):
+ drive = Drive(drive_id, allow_duplicates=False)
drive.component_name = "root.work1"
new_drive = _maybe_create_drive(drive.component_name, drive.to_dict())
assert new_drive.protocol == drive.protocol
@@ -223,9 +260,9 @@ def test_maybe_create_drive():
assert new_drive.component_name == drive.component_name
-def test_drive_deepcopy():
-
- drive = Drive("lit://drive", allow_duplicates=True)
+@pytest.mark.parametrize("drive_id", ["lit://drive", "s3://drive/"])
+def test_drive_deepcopy(drive_id):
+ drive = Drive(drive_id, allow_duplicates=True)
drive.component_name = "root.work1"
new_drive = deepcopy(drive)
assert new_drive.id == drive.id
From 5396b1899fa2ed3de1a369a1551fa155a80c4321 Mon Sep 17 00:00:00 2001
From: thomas chaton
Date: Wed, 10 Aug 2022 22:34:23 +0200
Subject: [PATCH 47/59] Resolve e2es V3 (#14153)
update
---
tests/tests_app_examples/test_boring_app.py | 4 ++++
tests/tests_app_examples/test_drive.py | 2 +-
2 files changed, 5 insertions(+), 1 deletion(-)
diff --git a/tests/tests_app_examples/test_boring_app.py b/tests/tests_app_examples/test_boring_app.py
index 0ca1b823b4706..afb958571d16b 100644
--- a/tests/tests_app_examples/test_boring_app.py
+++ b/tests/tests_app_examples/test_boring_app.py
@@ -13,6 +13,7 @@ def test_boring_app_example_cloud() -> None:
with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "examples/app_boring/"), app_name="app_dynamic.py") as (
_,
view_page,
+ fetch_logs,
name,
):
@@ -24,6 +25,9 @@ def check_hello_there(*_, **__):
wait_for(view_page, check_hello_there)
+ for _ in fetch_logs():
+ pass
+
runner = CliRunner()
result = runner.invoke(logs, [name])
lines = result.output.splitlines()
diff --git a/tests/tests_app_examples/test_drive.py b/tests/tests_app_examples/test_drive.py
index 630e76b550e9e..dde68d1a85113 100644
--- a/tests/tests_app_examples/test_drive.py
+++ b/tests/tests_app_examples/test_drive.py
@@ -18,7 +18,7 @@ def test_drive_example_cloud() -> None:
has_logs = False
while not has_logs:
- for log in fetch_logs(["flow"]):
+ for log in fetch_logs():
if "Application End!" in log:
has_logs = True
sleep(1)
From 4008f9cd414db2b0319b62ab4cb5d2193c6e97ed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Wed, 10 Aug 2022 23:15:12 +0200
Subject: [PATCH 48/59] Convert subprocess test to standalone test (#14101)
---
tests/tests_pytorch/run_standalone_tasks.sh | 10 ++-
tests/tests_pytorch/serve/__init__.py | 0
tests/tests_pytorch/strategies/ddp_model.py | 58 ----------------
.../strategies/scripts/__init__.py | 0
.../strategies/scripts/cli_script.py | 24 +++++++
tests/tests_pytorch/strategies/test_ddp.py | 67 +++++++------------
tests/tests_pytorch/utilities/distributed.py | 45 -------------
7 files changed, 55 insertions(+), 149 deletions(-)
create mode 100644 tests/tests_pytorch/serve/__init__.py
delete mode 100644 tests/tests_pytorch/strategies/ddp_model.py
create mode 100644 tests/tests_pytorch/strategies/scripts/__init__.py
create mode 100644 tests/tests_pytorch/strategies/scripts/cli_script.py
delete mode 100644 tests/tests_pytorch/utilities/distributed.py
diff --git a/tests/tests_pytorch/run_standalone_tasks.sh b/tests/tests_pytorch/run_standalone_tasks.sh
index 960bd867ceaa4..698ed7863ab96 100644
--- a/tests/tests_pytorch/run_standalone_tasks.sh
+++ b/tests/tests_pytorch/run_standalone_tasks.sh
@@ -34,6 +34,10 @@ fi
# test that a user can manually launch individual processes
echo "Running manual ddp launch test"
export PYTHONPATH="${PYTHONPATH}:$(pwd)"
-args="--trainer.accelerator gpu --trainer.devices 2 --trainer.strategy ddp --trainer.max_epochs=1 --trainer.limit_train_batches=1 --trainer.limit_val_batches=1 --trainer.limit_test_batches=1"
-MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=1 python ../../examples/convert_from_pt_to_pl/image_classifier_5_lightning_datamodule.py ${args} &
-MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=0 python ../../examples/convert_from_pt_to_pl/image_classifier_5_lightning_datamodule.py ${args}
+args="fit --trainer.accelerator gpu --trainer.devices 2 --trainer.strategy ddp --trainer.max_epochs=1 --trainer.limit_train_batches=1 --trainer.limit_val_batches=1 --trainer.limit_test_batches=1"
+MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=1 python strategies/scripts/cli_script.py ${args} &
+MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=0 python strategies/scripts/cli_script.py ${args}
+
+# test that ddp can be launched as a module (-m option)
+echo "Running ddp example as module"
+python -m strategies.scripts.cli_script ${args}
diff --git a/tests/tests_pytorch/serve/__init__.py b/tests/tests_pytorch/serve/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/tests_pytorch/strategies/ddp_model.py b/tests/tests_pytorch/strategies/ddp_model.py
deleted file mode 100644
index 76d1f3f2f6866..0000000000000
--- a/tests/tests_pytorch/strategies/ddp_model.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# Copyright The PyTorch Lightning team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Runs either `.fit()` or `.test()` on a single node across multiple gpus."""
-import os
-from argparse import ArgumentParser
-
-import torch
-
-from pytorch_lightning import seed_everything, Trainer
-from tests_pytorch.helpers.datamodules import ClassifDataModule
-from tests_pytorch.helpers.simple_models import ClassificationModel
-
-
-def main():
- seed_everything(4321)
-
- parser = ArgumentParser(add_help=False)
- parser = Trainer.add_argparse_args(parser)
- parser.add_argument("--trainer_method", default="fit")
- parser.add_argument("--tmpdir")
- parser.add_argument("--workdir")
- parser.set_defaults(accelerator="gpu", devices=2)
- parser.set_defaults(strategy="ddp")
- args = parser.parse_args()
-
- dm = ClassifDataModule()
- model = ClassificationModel()
- trainer = Trainer.from_argparse_args(args)
-
- if args.trainer_method == "fit":
- trainer.fit(model, datamodule=dm)
- result = None
- elif args.trainer_method == "test":
- result = trainer.test(model, datamodule=dm)
- elif args.trainer_method == "fit_test":
- trainer.fit(model, datamodule=dm)
- result = trainer.test(model, datamodule=dm)
- else:
- raise ValueError(f"Unsupported: {args.trainer_method}")
-
- result_ext = {"status": "complete", "method": args.trainer_method, "result": result}
- file_path = os.path.join(args.tmpdir, "ddp.result")
- torch.save(result_ext, file_path)
-
-
-if __name__ == "__main__":
- main()
diff --git a/tests/tests_pytorch/strategies/scripts/__init__.py b/tests/tests_pytorch/strategies/scripts/__init__.py
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/tests/tests_pytorch/strategies/scripts/cli_script.py b/tests/tests_pytorch/strategies/scripts/cli_script.py
new file mode 100644
index 0000000000000..17f0d29392eb9
--- /dev/null
+++ b/tests/tests_pytorch/strategies/scripts/cli_script.py
@@ -0,0 +1,24 @@
+# Copyright The PyTorch Lightning team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""A trivial script that wraps a LightningCLI around the BoringModel and BoringDataModule."""
+from pytorch_lightning.cli import LightningCLI
+from pytorch_lightning.demos.boring_classes import BoringDataModule, BoringModel
+
+if __name__ == "__main__":
+ LightningCLI(
+ BoringModel,
+ BoringDataModule,
+ seed_everything_default=42,
+ save_config_overwrite=True,
+ )
diff --git a/tests/tests_pytorch/strategies/test_ddp.py b/tests/tests_pytorch/strategies/test_ddp.py
index 1a2a0475e7ed6..9b196f3e2a97f 100644
--- a/tests/tests_pytorch/strategies/test_ddp.py
+++ b/tests/tests_pytorch/strategies/test_ddp.py
@@ -21,60 +21,41 @@
from torch.nn.parallel.distributed import DistributedDataParallel
import pytorch_lightning as pl
-from pytorch_lightning import Trainer
+from pytorch_lightning import seed_everything, Trainer
from pytorch_lightning.callbacks import Callback
from pytorch_lightning.demos.boring_classes import BoringModel
from pytorch_lightning.strategies import DDPStrategy
+from tests_pytorch.helpers.datamodules import ClassifDataModule
from tests_pytorch.helpers.runif import RunIf
-from tests_pytorch.strategies import ddp_model
-from tests_pytorch.utilities.distributed import call_training_script
+from tests_pytorch.helpers.simple_models import ClassificationModel
-CLI_ARGS = "--max_epochs 1 --accelerator gpu --devices 2 --strategy ddp"
+@RunIf(min_cuda_gpus=2, standalone=True)
+def test_multi_gpu_model_ddp_fit_only(tmpdir):
+ dm = ClassifDataModule()
+ model = ClassificationModel()
+ trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, accelerator="gpu", devices=2, strategy="ddp")
+ trainer.fit(model, datamodule=dm)
-@RunIf(min_cuda_gpus=2)
-@pytest.mark.parametrize("as_module", [True, False])
-def test_multi_gpu_model_ddp_fit_only(tmpdir, as_module):
- # call the script
- call_training_script(ddp_model, CLI_ARGS, "fit", tmpdir, timeout=120, as_module=as_module)
- # load the results of the script
- result_path = os.path.join(tmpdir, "ddp.result")
- result = torch.load(result_path)
+@RunIf(min_cuda_gpus=2, standalone=True)
+def test_multi_gpu_model_ddp_test_only(tmpdir):
+ dm = ClassifDataModule()
+ model = ClassificationModel()
+ trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, accelerator="gpu", devices=2, strategy="ddp")
+ trainer.test(model, datamodule=dm)
- # verify the file wrote the expected outputs
- assert result["status"] == "complete"
+@RunIf(min_cuda_gpus=2, standalone=True)
+def test_multi_gpu_model_ddp_fit_test(tmpdir):
+ seed_everything(4321)
+ dm = ClassifDataModule()
+ model = ClassificationModel()
+ trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, accelerator="gpu", devices=2, strategy="ddp")
+ trainer.fit(model, datamodule=dm)
+ result = trainer.test(model, datamodule=dm)
-@RunIf(min_cuda_gpus=2)
-@pytest.mark.parametrize("as_module", [True, False])
-def test_multi_gpu_model_ddp_test_only(tmpdir, as_module):
- # call the script
- call_training_script(ddp_model, CLI_ARGS, "test", tmpdir, as_module=as_module)
-
- # load the results of the script
- result_path = os.path.join(tmpdir, "ddp.result")
- result = torch.load(result_path)
-
- # verify the file wrote the expected outputs
- assert result["status"] == "complete"
-
-
-@RunIf(min_cuda_gpus=2)
-@pytest.mark.parametrize("as_module", [True, False])
-def test_multi_gpu_model_ddp_fit_test(tmpdir, as_module):
- # call the script
- call_training_script(ddp_model, CLI_ARGS, "fit_test", tmpdir, timeout=20, as_module=as_module)
-
- # load the results of the script
- result_path = os.path.join(tmpdir, "ddp.result")
- result = torch.load(result_path)
-
- # verify the file wrote the expected outputs
- assert result["status"] == "complete"
-
- model_outs = result["result"]
- for out in model_outs:
+ for out in result:
assert out["test_acc"] > 0.7
diff --git a/tests/tests_pytorch/utilities/distributed.py b/tests/tests_pytorch/utilities/distributed.py
deleted file mode 100644
index 38a50edcc7177..0000000000000
--- a/tests/tests_pytorch/utilities/distributed.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright The PyTorch Lightning team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import subprocess
-import sys
-from pathlib import Path
-from subprocess import TimeoutExpired
-
-import pytorch_lightning
-
-
-def call_training_script(module_file, cli_args, method, tmpdir, timeout=60, as_module=False):
- file = Path(module_file.__file__).absolute()
- cli_args = cli_args.split(" ") if cli_args else []
- cli_args += ["--tmpdir", str(tmpdir)]
- cli_args += ["--trainer_method", method]
- file_args = ["-m", module_file.__spec__.name] if as_module else [str(file)]
- command = [sys.executable] + file_args + cli_args
-
- # need to set the PYTHONPATH in case pytorch_lightning was not installed into the environment
- env = os.environ.copy()
- env["PYTHONPATH"] = env.get("PYTHONPATH", "") + f"{pytorch_lightning.__file__}:"
-
- # for running in ddp mode, we need to launch it's own process or pytest will get stuck
- p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env)
- try:
- std, err = p.communicate(timeout=timeout)
- err = str(err.decode("utf-8"))
- if "Exception" in err:
- raise Exception(err)
- except TimeoutExpired:
- p.kill()
- std, err = p.communicate()
- return std, err
From 2a10a36b9211fbecdfc79dc0bdae9b972ec8f91d Mon Sep 17 00:00:00 2001
From: William Falcon
Date: Wed, 10 Aug 2022 18:30:01 -0400
Subject: [PATCH 49/59] Update README.md
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index f9d5a9a57f5e2..6f075f5fd42b6 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
-**Build high-performance PyTorch models and deploy them with Lightning Apps (scalable end-to-end ML systems).**
+**Build high-performance (PyTorch) models, research workflows, ML production pipelines.**
______________________________________________________________________
From a7cebf24169dbe80c5e718946cb5de931082f814 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Thu, 11 Aug 2022 01:32:32 +0200
Subject: [PATCH 50/59] Fix entry point test for Python 3.10 (#14154)
---
.../trainer/connectors/test_callback_connector.py | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/tests/tests_pytorch/trainer/connectors/test_callback_connector.py b/tests/tests_pytorch/trainer/connectors/test_callback_connector.py
index d6d5018aa1dd0..02e846425a2a0 100644
--- a/tests/tests_pytorch/trainer/connectors/test_callback_connector.py
+++ b/tests/tests_pytorch/trainer/connectors/test_callback_connector.py
@@ -30,7 +30,7 @@
)
from pytorch_lightning.demos.boring_classes import BoringModel
from pytorch_lightning.trainer.connectors.callback_connector import CallbackConnector
-from pytorch_lightning.utilities.imports import _PYTHON_GREATER_EQUAL_3_8_0
+from pytorch_lightning.utilities.imports import _PYTHON_GREATER_EQUAL_3_8_0, _PYTHON_GREATER_EQUAL_3_10_0
def test_checkpoint_callbacks_are_last(tmpdir):
@@ -265,7 +265,10 @@ def _make_entry_point_query_mock(callback_factory):
entry_point = Mock()
entry_point.name = "mocked"
entry_point.load.return_value = callback_factory
- if _PYTHON_GREATER_EQUAL_3_8_0:
+ if _PYTHON_GREATER_EQUAL_3_10_0:
+ query_mock.return_value = [entry_point]
+ import_path = "importlib.metadata.entry_points"
+ elif _PYTHON_GREATER_EQUAL_3_8_0:
query_mock().get.return_value = [entry_point]
import_path = "importlib.metadata.entry_points"
else:
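Background for the new branch: Python 3.10 made `importlib.metadata.entry_points` selectable by group, while 3.8/3.9 return a dict of groups. A sketch of the difference (the group name is illustrative):

    import sys
    from importlib.metadata import entry_points

    GROUP = "pytorch_lightning.callbacks_factory"  # assumed group name

    if sys.version_info >= (3, 10):
        factories = entry_points(group=GROUP)      # selectable EntryPoints
    else:
        factories = entry_points().get(GROUP, [])  # dict of group -> entry points

    for ep in factories:
        callback_factory = ep.load()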
From 3dc08b1ef565774853467a7e56842becfa381dd6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?=
Date: Thu, 11 Aug 2022 09:33:19 +0200
Subject: [PATCH 51/59] Fix flaky test caused by weak reference (#14157)
---
tests/tests_pytorch/trainer/connectors/test_data_connector.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/tests/tests_pytorch/trainer/connectors/test_data_connector.py b/tests/tests_pytorch/trainer/connectors/test_data_connector.py
index 52ef4c4db6d8d..2650e46b7fa60 100644
--- a/tests/tests_pytorch/trainer/connectors/test_data_connector.py
+++ b/tests/tests_pytorch/trainer/connectors/test_data_connector.py
@@ -445,7 +445,8 @@ def test_dataloader_source_direct_access():
def test_dataloader_source_request_from_module():
"""Test requesting a dataloader from a module works."""
module = BoringModel()
- module.trainer = Trainer()
+ trainer = Trainer()
+ module.trainer = trainer
module.foo = Mock(return_value=module.train_dataloader())
source = _DataLoaderSource(module, "foo")
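The flakiness plausibly comes from the module holding its trainer weakly, so a `Trainer()` created inline can be garbage-collected before the test finishes; binding it to a local name keeps it alive. An illustrative sketch (the weak-reference storage is an assumption based on the commit title):

    import weakref

    class Trainer:  # stand-in, not pytorch_lightning.Trainer
        pass

    class Module:
        @property
        def trainer(self):
            return self._trainer_ref()

        @trainer.setter
        def trainer(self, trainer):
            self._trainer_ref = weakref.ref(trainer)

    m = Module()
    m.trainer = Trainer()      # no strong reference survives this statement (CPython)
    assert m.trainer is None   # the Trainer was already collected

    t = Trainer()
    m.trainer = t              # the local name `t` keeps the object alive
    assert m.trainer is t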
From 6eed72b621921856a846e39e4dd6bc9fd764348b Mon Sep 17 00:00:00 2001
From: thomas chaton
Date: Thu, 11 Aug 2022 12:35:00 +0200
Subject: [PATCH 52/59] (app) Introduce configure_api and Post, Get, Delete,
Put HttpMethods (#13945)
---
.github/workflows/ci-app-cloud-e2e-test.yml | 2 +-
.../.lightning | 0
.../app.py | 18 +-
.../command.py | 0
src/lightning_app/CHANGELOG.md | 1 +
src/lightning_app/api/__init__.py | 3 +
src/lightning_app/api/http_methods.py | 107 +++++++++++
src/lightning_app/api/request_types.py | 36 ++++
src/lightning_app/cli/lightning_cli.py | 50 ++---
src/lightning_app/core/api.py | 126 +++++--------
src/lightning_app/core/app.py | 32 ++--
src/lightning_app/core/flow.py | 33 ++++
src/lightning_app/core/queues.py | 22 +--
src/lightning_app/runners/backends/backend.py | 5 +-
src/lightning_app/runners/multiprocess.py | 19 +-
src/lightning_app/utilities/cli_helpers.py | 38 +++-
src/lightning_app/utilities/commands/base.py | 175 ++++++++----------
src/lightning_app/utilities/enum.py | 6 +
src/lightning_app/utilities/network.py | 3 +-
tests/tests_app/core/test_lightning_api.py | 107 ++++++++---
tests/tests_app/utilities/test_app_logs.py | 2 +
tests/tests_app/utilities/test_commands.py | 33 ++--
tests/tests_app_examples/test_commands.py | 32 ----
.../test_commands_and_api.py | 42 +++++
24 files changed, 568 insertions(+), 324 deletions(-)
rename examples/{app_commands => app_commands_and_api}/.lightning (100%)
rename examples/{app_commands => app_commands_and_api}/app.py (56%)
rename examples/{app_commands => app_commands_and_api}/command.py (100%)
create mode 100644 src/lightning_app/api/__init__.py
create mode 100644 src/lightning_app/api/http_methods.py
create mode 100644 src/lightning_app/api/request_types.py
delete mode 100644 tests/tests_app_examples/test_commands.py
create mode 100644 tests/tests_app_examples/test_commands_and_api.py
diff --git a/.github/workflows/ci-app-cloud-e2e-test.yml b/.github/workflows/ci-app-cloud-e2e-test.yml
index 3ad455650a117..9a5a10a95cd33 100644
--- a/.github/workflows/ci-app-cloud-e2e-test.yml
+++ b/.github/workflows/ci-app-cloud-e2e-test.yml
@@ -54,7 +54,7 @@ jobs:
- custom_work_dependencies
- drive
- payload
- - commands
+ - commands_and_api
timeout-minutes: 35
steps:
- uses: actions/checkout@v2
diff --git a/examples/app_commands/.lightning b/examples/app_commands_and_api/.lightning
similarity index 100%
rename from examples/app_commands/.lightning
rename to examples/app_commands_and_api/.lightning
diff --git a/examples/app_commands/app.py b/examples/app_commands_and_api/app.py
similarity index 56%
rename from examples/app_commands/app.py
rename to examples/app_commands_and_api/app.py
index 99eb15c75c709..0d15bc531bb38 100644
--- a/examples/app_commands/app.py
+++ b/examples/app_commands_and_api/app.py
@@ -1,15 +1,16 @@
from command import CustomCommand, CustomConfig
from lightning import LightningFlow
+from lightning_app.api import Post
from lightning_app.core.app import LightningApp
class ChildFlow(LightningFlow):
- def trigger_method(self, name: str):
+ def nested_command(self, name: str):
print(f"Hello {name}")
def configure_commands(self):
- return [{"nested_trigger_command": self.trigger_method}]
+ return [{"nested_command": self.nested_command}]
class FlowCommands(LightningFlow):
@@ -19,21 +20,24 @@ def __init__(self):
self.child_flow = ChildFlow()
def run(self):
- if len(self.names):
+ if self.names:
print(self.names)
- def trigger_without_client_command(self, name: str):
+ def command_without_client(self, name: str):
self.names.append(name)
- def trigger_with_client_command(self, config: CustomConfig):
+ def command_with_client(self, config: CustomConfig):
self.names.append(config.name)
def configure_commands(self):
commands = [
- {"trigger_without_client_command": self.trigger_without_client_command},
- {"trigger_with_client_command": CustomCommand(self.trigger_with_client_command)},
+ {"command_without_client": self.command_without_client},
+ {"command_with_client": CustomCommand(self.command_with_client)},
]
return commands + self.child_flow.configure_commands()
+ def configure_api(self):
+ return [Post("/user/command_without_client", self.command_without_client)]
+
app = LightningApp(FlowCommands())
diff --git a/examples/app_commands/command.py b/examples/app_commands_and_api/command.py
similarity index 100%
rename from examples/app_commands/command.py
rename to examples/app_commands_and_api/command.py
diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md
index ea28c57611311..7158d1ff7a2da 100644
--- a/src/lightning_app/CHANGELOG.md
+++ b/src/lightning_app/CHANGELOG.md
@@ -25,6 +25,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
+- Add support for `Lightning API` through the `configure_api` hook on the Lightning Flow and the `Post`, `Get`, `Delete`, `Put` HttpMethods ([#13945](https://github.com/Lightning-AI/lightning/pull/13945))
### Changed
-
diff --git a/src/lightning_app/api/__init__.py b/src/lightning_app/api/__init__.py
new file mode 100644
index 0000000000000..25ec5c4708761
--- /dev/null
+++ b/src/lightning_app/api/__init__.py
@@ -0,0 +1,3 @@
+from lightning_app.api.http_methods import Delete, Get, Post, Put
+
+__all__ = ["Delete", "Get", "Post", "Put"]
diff --git a/src/lightning_app/api/http_methods.py b/src/lightning_app/api/http_methods.py
new file mode 100644
index 0000000000000..02b6ec87f17d2
--- /dev/null
+++ b/src/lightning_app/api/http_methods.py
@@ -0,0 +1,107 @@
+import asyncio
+import inspect
+import time
+from copy import deepcopy
+from functools import wraps
+from multiprocessing import Queue
+from typing import Any, Callable, Dict, List, Optional
+from uuid import uuid4
+
+from fastapi import FastAPI
+
+from lightning_app.api.request_types import APIRequest, CommandRequest
+
+
+def _signature_proxy_function():
+ pass
+
+
+class HttpMethod:
+    def __init__(self, route: str, method: Callable, method_name: Optional[str] = None, timeout: int = 30, **kwargs):
+        """This class is used to inject user-defined methods into the App REST API.
+
+        Arguments:
+            route: The path used to route the requests
+            method: The associated flow method
+            method_name: An optional name for the route, overriding ``method.__name__``
+            timeout: The time in seconds to wait for a response before raising a timeout exception.
+ """
+ self.route = route
+ self.component_name = method.__self__.name
+ self.method_name = method_name or method.__name__
+ self.method_annotations = method.__annotations__
+ # TODO: Validate the signature contains only pydantic models.
+ self.method_signature = inspect.signature(method)
+ self.timeout = timeout
+ self.kwargs = kwargs
+
+ def add_route(self, app: FastAPI, request_queue: Queue, responses_store: Dict[str, Any]) -> None:
+ # 1: Create a proxy function with the signature of the wrapped method.
+ fn = deepcopy(_signature_proxy_function)
+ fn.__annotations__ = self.method_annotations
+ fn.__name__ = self.method_name
+ setattr(fn, "__signature__", self.method_signature)
+
+ # 2: Get the route associated with the http method.
+ route = getattr(app, self.__class__.__name__.lower())
+
+ request_cls = CommandRequest if self.route.startswith("/command/") else APIRequest
+
+ # 3: Define the request handler.
+ @wraps(_signature_proxy_function)
+ async def _handle_request(*args, **kwargs):
+ async def fn(*args, **kwargs):
+ request_id = str(uuid4()).split("-")[0]
+ request_queue.put(
+ request_cls(
+ name=self.component_name,
+ method_name=self.method_name,
+ args=args,
+ kwargs=kwargs,
+ id=request_id,
+ )
+ )
+
+ t0 = time.time()
+ while request_id not in responses_store:
+ await asyncio.sleep(0.1)
+ if (time.time() - t0) > self.timeout:
+ raise Exception("The response was never received.")
+
+ return responses_store.pop(request_id)
+
+ return await asyncio.create_task(fn(*args, **kwargs))
+
+ # 4: Register the user provided route to the Rest API.
+ route(self.route, **self.kwargs)(_handle_request)
+
+
+class Post(HttpMethod):
+ pass
+
+
+class Get(HttpMethod):
+    pass
+
+
+class Put(HttpMethod):
+    pass
+
+
+class Delete(HttpMethod):
+ pass
+
+
+def _add_tags_to_api(apis: List[HttpMethod], tags: List[str]) -> None:
+ for api in apis:
+        if not api.kwargs.get("tags"):
+ api.kwargs["tags"] = tags
+
+
+def _validate_api(apis: List[HttpMethod]) -> None:
+ for api in apis:
+ if not isinstance(api, HttpMethod):
+            raise Exception(f"The provided api should be one of [{Delete}, {Get}, {Post}, {Put}]")
+        if api.route.startswith("/command"):
+            raise Exception("The route `/command` is reserved for commands. Please use something else.")
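
The handler registered by `add_route` is worth spelling out: the proxy function carries the wrapped flow method's signature so FastAPI can validate and document the route, while the actual work is an enqueue-and-poll cycle against the app. A minimal sketch of that cycle, with `request_queue` and `responses_store` standing in for the real multiprocessing queue and shared store:

    import asyncio
    import time
    from uuid import uuid4

    async def call_and_wait(request_queue, responses_store, payload: dict, timeout: int = 30):
        # Enqueue the request under a short unique id ...
        request_id = str(uuid4()).split("-")[0]
        request_queue.put({"id": request_id, **payload})
        # ... then poll until the app's event loop publishes the matching response.
        t0 = time.time()
        while request_id not in responses_store:
            await asyncio.sleep(0.1)
            if (time.time() - t0) > timeout:
                raise Exception("The response was never received.")
        return responses_store.pop(request_id)
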
diff --git a/src/lightning_app/api/request_types.py b/src/lightning_app/api/request_types.py
new file mode 100644
index 0000000000000..53a6df25820a3
--- /dev/null
+++ b/src/lightning_app/api/request_types.py
@@ -0,0 +1,36 @@
+from dataclasses import asdict, dataclass
+from typing import Any
+
+from deepdiff import Delta
+
+
+@dataclass
+class BaseRequest:
+ def to_dict(self):
+ return asdict(self)
+
+
+@dataclass
+class DeltaRequest(BaseRequest):
+ delta: Delta
+
+ def to_dict(self):
+ return self.delta.to_dict()
+
+
+@dataclass
+class CommandRequest(BaseRequest):
+ id: str
+ name: str
+ method_name: str
+ args: Any
+ kwargs: Any
+
+
+@dataclass
+class APIRequest(BaseRequest):
+ id: str
+ name: str
+ method_name: str
+ args: Any
+ kwargs: Any
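
These dataclasses are plain payload carriers; only `DeltaRequest` overrides `to_dict`, delegating to the wrapped deepdiff `Delta`. A quick usage sketch (field values are illustrative):

    from lightning_app.api.request_types import APIRequest

    request = APIRequest(id="abc123", name="root", method_name="handler", args=(), kwargs={"name": "foo"})
    assert request.to_dict()["method_name"] == "handler"
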
diff --git a/src/lightning_app/cli/lightning_cli.py b/src/lightning_app/cli/lightning_cli.py
index babe0aa2b2abc..6a6e41df57026 100644
--- a/src/lightning_app/cli/lightning_cli.py
+++ b/src/lightning_app/cli/lightning_cli.py
@@ -4,7 +4,6 @@
from argparse import ArgumentParser
from pathlib import Path
from typing import List, Tuple, Union
-from uuid import uuid4
import click
import requests
@@ -26,10 +25,10 @@
_retrieve_application_url_and_available_commands,
)
from lightning_app.utilities.cloud import _get_project
+from lightning_app.utilities.enum import OpenAPITags
from lightning_app.utilities.install_components import register_all_external_components
from lightning_app.utilities.login import Auth
from lightning_app.utilities.network import LightningClient
-from lightning_app.utilities.state import headers_for
logger = logging.getLogger(__name__)
@@ -263,41 +262,42 @@ def app_command():
hparams, argv = parser.parse_known_args()
    # 1: Collect the URL and commands from the running application
- url, commands = _retrieve_application_url_and_available_commands(hparams.app_id)
- if url is None or commands is None:
+ url, api_commands = _retrieve_application_url_and_available_commands(hparams.app_id)
+ if url is None or api_commands is None:
raise Exception("We couldn't find any matching running app.")
- if not commands:
+ if not api_commands:
raise Exception("This application doesn't expose any commands yet.")
command = argv[0]
- command_names = [c["command"] for c in commands]
- if command not in command_names:
- raise Exception(f"The provided command {command} isn't available in {command_names}")
+ if command not in api_commands:
+ raise Exception(f"The provided command {command} isn't available in {list(api_commands)}")
# 2: Send the command from the user
- command_metadata = [c for c in commands if c["command"] == command][0]
- params = command_metadata["params"]
+ metadata = api_commands[command]
# 3: Execute the command
- if not command_metadata["is_client_command"]:
- # TODO: Improve what is supported there.
- kwargs = {k.split("=")[0].replace("--", ""): k.split("=")[1] for k in argv[1:]}
- for param in params:
- if param not in kwargs:
- raise Exception(f"The argument --{param}=X hasn't been provided.")
- json = {
- "command_name": command,
- "command_arguments": kwargs,
- "affiliation": command_metadata["affiliation"],
- "id": str(uuid4()),
- }
- resp = requests.post(url + "/api/v1/commands", json=json, headers=headers_for({}))
+ if metadata["tag"] == OpenAPITags.APP_COMMAND:
+        # TODO: Improve what is currently supported.
+ kwargs = [v.replace("--", "") for v in argv[1:]]
+
+ for p in kwargs:
+ if p.split("=")[0] not in metadata["parameters"]:
+                raise Exception(f"The provided argument {p} isn't supported. The available arguments are {list(metadata['parameters'])}.")
+ # TODO: Encode the parameters and validate their type.
+ query_parameters = "&".join(kwargs)
+ resp = requests.post(url + f"/command/{command}?{query_parameters}")
assert resp.status_code == 200, resp.json()
else:
- client_command, models = _download_command(command_metadata, hparams.app_id, debug_mode=debug_mode)
- client_command._setup(metadata=command_metadata, models=models, app_url=url)
+ client_command = _download_command(
+ command,
+ metadata["cls_path"],
+ metadata["cls_name"],
+ hparams.app_id,
+ debug_mode=debug_mode,
+ )
+ client_command._setup(command_name=command, app_url=url)
sys.argv = argv
client_command.run()
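
Net effect on the wire: a command that runs server-side is now a plain POST with query parameters against `/command/<name>` rather than a JSON body against the removed `/api/v1/commands` endpoint. Roughly what the CLI sends for the sweep example (URL and command name are hypothetical):

    import requests

    url = "http://localhost:7501"
    resp = requests.post(url + "/command/sweep?sweep_name=my_name&num_trials=1")
    assert resp.status_code == 200, resp.json()
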
diff --git a/src/lightning_app/core/api.py b/src/lightning_app/core/api.py
index f19ada5340d57..8b625713e0c2c 100644
--- a/src/lightning_app/core/api.py
+++ b/src/lightning_app/core/api.py
@@ -3,7 +3,6 @@
import os
import queue
import sys
-import time
import traceback
from copy import deepcopy
from multiprocessing import Queue
@@ -21,9 +20,12 @@
from pydantic import BaseModel
from websockets.exceptions import ConnectionClosed
+from lightning_app.api.http_methods import HttpMethod
+from lightning_app.api.request_types import DeltaRequest
from lightning_app.core.constants import FRONTEND_DIR
from lightning_app.core.queues import RedisQueue
from lightning_app.utilities.app_helpers import InMemoryStateStore, StateStore
+from lightning_app.utilities.enum import OpenAPITags
from lightning_app.utilities.imports import _is_redis_available, _is_starsessions_available
if _is_starsessions_available():
@@ -42,9 +44,6 @@ class SessionMiddleware:
frontend_static_dir = os.path.join(FRONTEND_DIR, "static")
api_app_delta_queue: Queue = None
-api_commands_requests_queue: Queue = None
-api_commands_metadata_queue: Queue = None
-api_commands_responses_queue: Queue = None
template = {"ui": {}, "app": {}}
templates = Jinja2Templates(directory=FRONTEND_DIR)
@@ -56,8 +55,8 @@ class SessionMiddleware:
lock = Lock()
app_spec: Optional[List] = None
-app_commands_metadata: Optional[Dict] = None
-commands_response_store = {}
+# In the future, this would be abstracted to support horizontal scaling.
+responses_store = {}
logger = logging.getLogger(__name__)
@@ -67,11 +66,10 @@ class SessionMiddleware:
class UIRefresher(Thread):
- def __init__(self, api_publish_state_queue, api_commands_metadata_queue, api_commands_responses_queue) -> None:
+ def __init__(self, api_publish_state_queue, api_response_queue) -> None:
super().__init__(daemon=True)
self.api_publish_state_queue = api_publish_state_queue
- self.api_commands_metadata_queue = api_commands_metadata_queue
- self.api_commands_responses_queue = api_commands_responses_queue
+ self.api_response_queue = api_response_queue
self._exit_event = Event()
def run(self):
@@ -93,18 +91,11 @@ def run_once(self):
pass
try:
- metadata = self.api_commands_metadata_queue.get(timeout=0)
+ response = self.api_response_queue.get(timeout=0)
with lock:
- global app_commands_metadata
- app_commands_metadata = metadata
- except queue.Empty:
- pass
-
- try:
- response = self.api_commands_responses_queue.get(timeout=0)
- with lock:
- global commands_response_store
- commands_response_store[response["id"]] = response["response"]
+ # TODO: Abstract the responses store to support horizontal scaling.
+ global responses_store
+ responses_store[response["id"]] = response["response"]
except queue.Empty:
pass
@@ -117,6 +108,23 @@ class StateUpdate(BaseModel):
state: dict = {}
+openapi_tags = [
+ {
+ "name": OpenAPITags.APP_CLIENT_COMMAND,
+ "description": "The App Endpoints to be triggered exclusively from the CLI",
+ },
+ {
+ "name": OpenAPITags.APP_COMMAND,
+        "description": "The App Endpoints that can be triggered equally from the CLI or from an HTTP request",
+ },
+ {
+ "name": OpenAPITags.APP_API,
+        "description": "The App Endpoints that can be triggered exclusively from an HTTP request",
+ },
+]
+
-fastapi_service = FastAPI()
+fastapi_service = FastAPI(openapi_tags=openapi_tags)
fastapi_service.add_middleware(
@@ -176,50 +184,13 @@ async def get_spec(
return app_spec or []
-@fastapi_service.post("/api/v1/commands", response_class=JSONResponse)
-async def run_remote_command(
- request: Request,
-) -> None:
- data = await request.json()
- command_name = data.get("command_name", None)
- if not command_name:
- raise Exception("The provided command name is empty.")
- command_arguments = data.get("command_arguments", None)
- if not command_arguments:
- raise Exception("The provided command metadata is empty.")
- affiliation = data.get("affiliation", None)
- if not affiliation:
- raise Exception("The provided affiliation is empty.")
-
- async def fn(data):
- request_id = data["id"]
- api_commands_requests_queue.put(data)
-
- t0 = time.time()
- while request_id not in commands_response_store:
- await asyncio.sleep(0.1)
- if (time.time() - t0) > 15:
- raise Exception("The response was never received.")
-
- return commands_response_store[request_id]
-
- return await asyncio.create_task(fn(data))
-
-
-@fastapi_service.get("/api/v1/commands", response_class=JSONResponse)
-async def get_commands() -> Optional[Dict]:
- global app_commands_metadata
- with lock:
- return app_commands_metadata
-
-
@fastapi_service.post("/api/v1/delta")
async def post_delta(
request: Request,
x_lightning_type: Optional[str] = Header(None),
x_lightning_session_uuid: Optional[str] = Header(None),
x_lightning_session_id: Optional[str] = Header(None),
-) -> Mapping:
+) -> None:
"""This endpoint is used to make an update to the app state using delta diff, mainly used by streamlit to
update the state."""
@@ -229,9 +200,7 @@ async def post_delta(
raise Exception("Missing X-Lightning-Session-ID header")
body: Dict = await request.json()
- delta = body["delta"]
- update_delta = Delta(delta)
- api_app_delta_queue.put(update_delta)
+ api_app_delta_queue.put(DeltaRequest(delta=Delta(body["delta"])))
@fastapi_service.post("/api/v1/state")
@@ -240,7 +209,7 @@ async def post_state(
x_lightning_type: Optional[str] = Header(None),
x_lightning_session_uuid: Optional[str] = Header(None),
x_lightning_session_id: Optional[str] = Header(None),
-) -> Mapping:
+) -> None:
if x_lightning_session_uuid is None:
raise Exception("Missing X-Lightning-Session-UUID header")
if x_lightning_session_id is None:
@@ -263,8 +232,7 @@ async def post_state(
state = body["state"]
last_state = global_app_state_store.get_served_state(x_lightning_session_uuid)
deep_diff = DeepDiff(last_state, state, verbose_level=2)
- update_delta = Delta(deep_diff)
- api_app_delta_queue.put(update_delta)
+ api_app_delta_queue.put(DeltaRequest(delta=Delta(deep_diff)))
@fastapi_service.get("/healthz", status_code=200)
@@ -307,8 +275,6 @@ async def websocket_endpoint(websocket: WebSocket):
await websocket.close()
-# Catch-all for nonexistent API routes (since we define a catch-all for client-side routing)
-@fastapi_service.get("/api{full_path:path}", response_class=JSONResponse)
async def api_catch_all(request: Request, full_path: str):
raise HTTPException(status_code=404, detail="Not found")
@@ -317,14 +283,18 @@ async def api_catch_all(request: Request, full_path: str):
fastapi_service.mount("/static", StaticFiles(directory=frontend_static_dir, check_dir=False), name="static")
-# Catch-all for frontend routes, must be defined after all other routes
-@fastapi_service.get("/{full_path:path}", response_class=HTMLResponse)
async def frontend_route(request: Request, full_path: str):
if "pytest" in sys.modules:
return ""
return templates.TemplateResponse("index.html", {"request": request})
+def register_global_routes():
+ # Catch-all for nonexistent API routes (since we define a catch-all for client-side routing)
+ fastapi_service.get("/api{full_path:path}", response_class=JSONResponse)(api_catch_all)
+ fastapi_service.get("/{full_path:path}", response_class=HTMLResponse)(frontend_route)
+
+
class LightningUvicornServer(uvicorn.Server):
has_started_queue = None
@@ -346,34 +316,28 @@ async def check_is_started(self, queue):
def start_server(
api_publish_state_queue,
api_delta_queue,
- commands_requests_queue,
- commands_responses_queue,
- commands_metadata_queue,
+ api_response_queue,
has_started_queue: Optional[Queue] = None,
host="127.0.0.1",
port=8000,
uvicorn_run: bool = True,
spec: Optional[List] = None,
+ apis: Optional[List[HttpMethod]] = None,
app_state_store: Optional[StateStore] = None,
):
global api_app_delta_queue
global global_app_state_store
- global api_commands_requests_queue
- global api_commands_responses_queue
global app_spec
app_spec = spec
api_app_delta_queue = api_delta_queue
- api_commands_requests_queue = commands_requests_queue
- api_commands_responses_queue = commands_responses_queue
- api_commands_metadata_queue = commands_metadata_queue
if app_state_store is not None:
global_app_state_store = app_state_store
global_app_state_store.add(TEST_SESSION_UUID)
- refresher = UIRefresher(api_publish_state_queue, api_commands_metadata_queue, commands_responses_queue)
+ refresher = UIRefresher(api_publish_state_queue, api_response_queue)
refresher.setDaemon(True)
refresher.start()
@@ -384,6 +348,14 @@ def start_server(
LightningUvicornServer.has_started_queue = has_started_queue
        # uvicorn does something ugly by replacing uvicorn.main with a click command.
sys.modules["uvicorn.main"].Server = LightningUvicornServer
+
+ # Register the user API.
+ if apis:
+ for api in apis:
+ api.add_route(fastapi_service, api_app_delta_queue, responses_store)
+
+ register_global_routes()
+
uvicorn.run(app=fastapi_service, host=host, port=port, log_level="error")
return refresher
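
For callers, the three command queues collapse into a single `api_response_queue` and user routes arrive through the new optional `apis` argument. A sketch of a direct call with the reworked signature, mirroring how the unit tests drive it (`uvicorn_run=False` skips binding a real server):

    import multiprocessing as mp

    from lightning_app.core.api import start_server

    refresher = start_server(
        api_publish_state_queue=mp.Queue(),
        api_delta_queue=mp.Queue(),
        api_response_queue=mp.Queue(),
        has_started_queue=mp.Queue(),
        port=1111,
        uvicorn_run=False,
    )
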
diff --git a/src/lightning_app/core/app.py b/src/lightning_app/core/app.py
index 3f9e2521eb21d..65242a1ae0a2a 100644
--- a/src/lightning_app/core/app.py
+++ b/src/lightning_app/core/app.py
@@ -11,12 +11,13 @@
from deepdiff import DeepDiff, Delta
import lightning_app
+from lightning_app.api.request_types import APIRequest, CommandRequest, DeltaRequest
from lightning_app.core.constants import FLOW_DURATION_SAMPLES, FLOW_DURATION_THRESHOLD, STATE_ACCUMULATE_WAIT
from lightning_app.core.queues import BaseQueue, SingleProcessQueue
from lightning_app.frontend import Frontend
from lightning_app.storage.path import storage_root_dir
from lightning_app.utilities.app_helpers import _delta_to_app_state_delta, _LightningAppRef
-from lightning_app.utilities.commands.base import _populate_commands_endpoint, _process_command_requests
+from lightning_app.utilities.commands.base import _process_requests
from lightning_app.utilities.component import _convert_paths_after_init
from lightning_app.utilities.enum import AppStage, CacheCallsKeys
from lightning_app.utilities.exceptions import CacheMissException, ExitAppException
@@ -73,9 +74,7 @@ def __init__(
# queues definition.
self.delta_queue: t.Optional[BaseQueue] = None
self.readiness_queue: t.Optional[BaseQueue] = None
- self.commands_requests_queue: t.Optional[BaseQueue] = None
- self.commands_responses_queue: t.Optional[BaseQueue] = None
- self.commands_metadata_queue: t.Optional[BaseQueue] = None
+ self.api_response_queue: t.Optional[BaseQueue] = None
self.api_publish_state_queue: t.Optional[BaseQueue] = None
self.api_delta_queue: t.Optional[BaseQueue] = None
self.error_queue: t.Optional[BaseQueue] = None
@@ -253,7 +252,7 @@ def named_works(self) -> t.List[t.Tuple[str, "lightning_app.LightningWork"]]:
"""Returns all the works defined within this application with their names."""
return self.root.named_works(recurse=True)
- def _collect_deltas_from_ui_and_work_queues(self) -> t.List[Delta]:
+ def _collect_deltas_from_ui_and_work_queues(self) -> t.List[t.Union[Delta, APIRequest, CommandRequest]]:
# The aggregation would try to get as many deltas as possible
# from both the `api_delta_queue` and `delta_queue`
# during the `state_accumulate_wait` time.
@@ -267,8 +266,12 @@ def _collect_deltas_from_ui_and_work_queues(self) -> t.List[Delta]:
while (time() - t0) < self.state_accumulate_wait:
if self.api_delta_queue and should_get_delta_from_api:
- delta_from_api: Delta = self.get_state_changed_from_queue(self.api_delta_queue) # TODO: rename
+ delta_from_api: t.Union[DeltaRequest, APIRequest, CommandRequest] = self.get_state_changed_from_queue(
+ self.api_delta_queue
+ ) # TODO: rename
if delta_from_api:
+ if isinstance(delta_from_api, DeltaRequest):
+ delta_from_api = delta_from_api.delta
deltas.append(delta_from_api)
else:
should_get_delta_from_api = False
@@ -317,8 +320,19 @@ def maybe_apply_changes(self) -> bool:
logger.debug(f"Received {[d.to_dict() for d in deltas]}")
- state = self.state
+ # 1: Process the API / Command Requests first as they might affect the state.
+ state_deltas = []
for delta in deltas:
+ if isinstance(delta, (APIRequest, CommandRequest)):
+ _process_requests(self, delta)
+ else:
+ state_deltas.append(delta)
+
+ # 2: Collect the state
+ state = self.state
+
+ # 3: Apply the state delta
+ for delta in state_deltas:
try:
state += delta
except Exception as e:
@@ -351,8 +365,6 @@ def run_once(self):
elif self.stage == AppStage.RESTARTING:
return self._apply_restarting()
- _process_command_requests(self)
-
t0 = time()
try:
@@ -411,8 +423,6 @@ def _run(self) -> bool:
self._reset_run_time_monitor()
- _populate_commands_endpoint(self)
-
while not done:
done = self.run_once()
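
The ordering here matters: requests are processed before any state delta is applied, because handling a request can itself mutate the state. The collection and dispatch steps, condensed into one sketch:

    from lightning_app.api.request_types import APIRequest, CommandRequest, DeltaRequest

    def split(items):
        requests, deltas = [], []
        for item in items:
            if isinstance(item, (APIRequest, CommandRequest)):
                requests.append(item)  # handled first; may mutate the state
            elif isinstance(item, DeltaRequest):
                deltas.append(item.delta)  # unwrap to the underlying deepdiff Delta
            else:
                deltas.append(item)
        return requests, deltas
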
diff --git a/src/lightning_app/core/flow.py b/src/lightning_app/core/flow.py
index f6b6e34e81538..41c46cd868307 100644
--- a/src/lightning_app/core/flow.py
+++ b/src/lightning_app/core/flow.py
@@ -634,3 +634,36 @@ def my_remote_method(self, name):
lightning my_command_name --args name=my_own_name
"""
raise NotImplementedError
+
+ def configure_api(self):
+ """Configure the API routes of the LightningFlow.
+
+        Returns a list of HttpMethod objects such as Post or Get.
+
+ .. code-block:: python
+
+ from lightning_app import LightningFlow
+ from lightning_app.api import Post
+
+ from pydantic import BaseModel
+
+
+ class HandlerModel(BaseModel):
+ name: str
+
+
+            class Flow(LightningFlow):
+ def __init__(self):
+ super().__init__()
+ self.names = []
+
+ def handler(self, config: HandlerModel) -> None:
+ self.names.append(config.name)
+
+ def configure_api(self):
+ return [Post("/v1/api/request", self.handler)]
+
+ Once the app is running, you can access the Swagger UI of the app
+ under the ``/docs`` route.
+ """
+ raise NotImplementedError
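
Once such a flow is running, the route accepts a JSON body matching the pydantic model. A hypothetical call against the example above (the port depends on the app server configuration):

    import requests

    resp = requests.post("http://localhost:7501/v1/api/request", json={"name": "my_own_name"})
    assert resp.status_code == 200, resp.json()
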
diff --git a/src/lightning_app/core/queues.py b/src/lightning_app/core/queues.py
index efac8230047e0..2b7295d7f327f 100644
--- a/src/lightning_app/core/queues.py
+++ b/src/lightning_app/core/queues.py
@@ -36,9 +36,7 @@
ORCHESTRATOR_COPY_REQUEST_CONSTANT = "ORCHESTRATOR_COPY_REQUEST"
ORCHESTRATOR_COPY_RESPONSE_CONSTANT = "ORCHESTRATOR_COPY_RESPONSE"
WORK_QUEUE_CONSTANT = "WORK_QUEUE"
-COMMANDS_REQUESTS_QUEUE_CONSTANT = "COMMANDS_REQUESTS_QUEUE"
-COMMANDS_RESPONSES_QUEUE_CONSTANT = "COMMANDS_RESPONSES_QUEUE"
-COMMANDS_METADATA_QUEUE_CONSTANT = "COMMANDS_METADATA_QUEUE"
+API_RESPONSE_QUEUE_CONSTANT = "API_RESPONSE_QUEUE"
class QueuingSystem(Enum):
@@ -54,18 +52,8 @@ def _get_queue(self, queue_name: str) -> "BaseQueue":
else:
return SingleProcessQueue(queue_name, default_timeout=STATE_UPDATE_TIMEOUT)
- def get_commands_requests_queue(self, queue_id: Optional[str] = None) -> "BaseQueue":
- queue_name = f"{queue_id}_{COMMANDS_REQUESTS_QUEUE_CONSTANT}" if queue_id else COMMANDS_REQUESTS_QUEUE_CONSTANT
- return self._get_queue(queue_name)
-
- def get_commands_responses_queue(self, queue_id: Optional[str] = None) -> "BaseQueue":
- queue_name = (
- f"{queue_id}_{COMMANDS_RESPONSES_QUEUE_CONSTANT}" if queue_id else COMMANDS_RESPONSES_QUEUE_CONSTANT
- )
- return self._get_queue(queue_name)
-
- def get_commands_metadata_queue(self, queue_id: Optional[str] = None) -> "BaseQueue":
- queue_name = f"{queue_id}_{COMMANDS_METADATA_QUEUE_CONSTANT}" if queue_id else COMMANDS_METADATA_QUEUE_CONSTANT
+ def get_api_response_queue(self, queue_id: Optional[str] = None) -> "BaseQueue":
+ queue_name = f"{queue_id}_{API_RESPONSE_QUEUE_CONSTANT}" if queue_id else API_RESPONSE_QUEUE_CONSTANT
return self._get_queue(queue_name)
def get_readiness_queue(self, queue_id: Optional[str] = None) -> "BaseQueue":
@@ -98,10 +86,6 @@ def get_api_delta_queue(self, queue_id: Optional[str] = None) -> "BaseQueue":
queue_name = f"{queue_id}_{API_DELTA_QUEUE_CONSTANT}" if queue_id else API_DELTA_QUEUE_CONSTANT
return self._get_queue(queue_name)
- def get_api_refresh_queue(self, queue_id: Optional[str] = None) -> "BaseQueue":
- queue_name = f"{queue_id}_{API_REFRESH_QUEUE_CONSTANT}" if queue_id else API_REFRESH_QUEUE_CONSTANT
- return self._get_queue(queue_name)
-
def get_orchestrator_request_queue(self, work_name: str, queue_id: Optional[str] = None) -> "BaseQueue":
queue_name = (
f"{queue_id}_{ORCHESTRATOR_REQUEST_CONSTANT}_{work_name}"
diff --git a/src/lightning_app/runners/backends/backend.py b/src/lightning_app/runners/backends/backend.py
index 87bb103823fd2..a944cd4aa9093 100644
--- a/src/lightning_app/runners/backends/backend.py
+++ b/src/lightning_app/runners/backends/backend.py
@@ -82,11 +82,8 @@ def _prepare_queues(self, app):
kw = dict(queue_id=self.queue_id)
app.delta_queue = self.queues.get_delta_queue(**kw)
app.readiness_queue = self.queues.get_readiness_queue(**kw)
- app.commands_requests_queue = self.queues.get_commands_requests_queue(**kw)
- app.commands_responses_queue = self.queues.get_commands_responses_queue(**kw)
- app.commands_metadata_queue = self.queues.get_commands_metadata_queue(**kw)
+ app.api_response_queue = self.queues.get_api_response_queue(**kw)
app.error_queue = self.queues.get_error_queue(**kw)
- app.delta_queue = self.queues.get_delta_queue(**kw)
app.api_publish_state_queue = self.queues.get_api_state_publish_queue(**kw)
app.api_delta_queue = self.queues.get_api_delta_queue(**kw)
app.request_queues = {}
diff --git a/src/lightning_app/runners/multiprocess.py b/src/lightning_app/runners/multiprocess.py
index 92ec900d89c65..16e373b0a37a2 100644
--- a/src/lightning_app/runners/multiprocess.py
+++ b/src/lightning_app/runners/multiprocess.py
@@ -3,10 +3,13 @@
from dataclasses import dataclass
from typing import Any, Callable, Optional, Union
+from lightning_app.api.http_methods import _add_tags_to_api, _validate_api
from lightning_app.core.api import start_server
from lightning_app.runners.backends import Backend
from lightning_app.runners.runtime import Runtime
from lightning_app.storage.orchestrator import StorageOrchestrator
+from lightning_app.utilities.app_helpers import is_overridden
+from lightning_app.utilities.commands.base import _commands_to_api, _prepare_commands
from lightning_app.utilities.component import _set_flow_context, _set_frontend_context
from lightning_app.utilities.load_app import extract_metadata_from_app
from lightning_app.utilities.network import find_free_network_port
@@ -60,15 +63,25 @@ def dispatch(self, *args: Any, on_before_run: Optional[Callable] = None, **kwarg
if self.start_server:
self.app.should_publish_changes_to_api = True
has_started_queue = self.backend.queues.get_has_server_started_queue()
+
+ apis = []
+ if is_overridden("configure_api", self.app.root):
+ apis = self.app.root.configure_api()
+ _validate_api(apis)
+ _add_tags_to_api(apis, ["app_api"])
+
+ if is_overridden("configure_commands", self.app.root):
+ commands = _prepare_commands(self.app)
+ apis += _commands_to_api(commands)
+
kwargs = dict(
+ apis=apis,
host=self.host,
port=self.port,
+ api_response_queue=self.app.api_response_queue,
api_publish_state_queue=self.app.api_publish_state_queue,
api_delta_queue=self.app.api_delta_queue,
has_started_queue=has_started_queue,
- commands_requests_queue=self.app.commands_requests_queue,
- commands_responses_queue=self.app.commands_responses_queue,
- commands_metadata_queue=self.app.commands_metadata_queue,
spec=extract_metadata_from_app(self.app),
)
server_proc = multiprocessing.Process(target=start_server, kwargs=kwargs)
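
Dispatch now assembles the route list up front: user routes from `configure_api` are validated and tagged, then command-derived routes are appended. The collection step, condensed (`app` is the `LightningApp` being dispatched):

    from lightning_app.api.http_methods import _add_tags_to_api, _validate_api
    from lightning_app.utilities.app_helpers import is_overridden
    from lightning_app.utilities.commands.base import _commands_to_api, _prepare_commands

    def collect_apis(app):
        apis = []
        if is_overridden("configure_api", app.root):
            apis = app.root.configure_api()
            _validate_api(apis)
            _add_tags_to_api(apis, ["app_api"])
        if is_overridden("configure_commands", app.root):
            apis += _commands_to_api(_prepare_commands(app))
        return apis
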
diff --git a/src/lightning_app/utilities/cli_helpers.py b/src/lightning_app/utilities/cli_helpers.py
index fcce96ec64407..6000114c3d4d6 100644
--- a/src/lightning_app/utilities/cli_helpers.py
+++ b/src/lightning_app/utilities/cli_helpers.py
@@ -49,16 +49,42 @@ def _is_url(id: Optional[str]) -> bool:
return False
+def _get_metadata_from_openapi(paths: Dict, path: str):
+ parameters = paths[path]["post"].get("parameters", {})
+ tag = paths[path]["post"].get("tags", [None])[0]
+ cls_path = paths[path]["post"].get("cls_path", None)
+ cls_name = paths[path]["post"].get("cls_name", None)
+
+ metadata = {"tag": tag, "parameters": {}}
+
+ if cls_path:
+ metadata["cls_path"] = cls_path
+
+ if cls_name:
+ metadata["cls_name"] = cls_name
+
+ if not parameters:
+ return metadata
+
+ metadata["parameters"].update({d["name"]: d["schema"]["type"] for d in parameters})
+ return metadata
+
+
+def _extract_command_from_openapi(openapi_resp: Dict) -> Dict[str, Dict[str, str]]:
+ command_paths = [p for p in openapi_resp["paths"] if p.startswith("/command/")]
+ return {p.replace("/command/", ""): _get_metadata_from_openapi(openapi_resp["paths"], p) for p in command_paths}
+
+
def _retrieve_application_url_and_available_commands(app_id_or_name_or_url: Optional[str]):
"""This function is used to retrieve the current url associated with an id."""
if _is_url(app_id_or_name_or_url):
url = app_id_or_name_or_url
assert url
- resp = requests.get(url + "/api/v1/commands")
+ resp = requests.get(url + "/openapi.json")
if resp.status_code != 200:
raise Exception(f"The server didn't process the request properly. Found {resp.json()}")
- return url, resp.json()
+ return url, _extract_command_from_openapi(resp.json())
# 2: If no identifier has been provided, evaluate the local application
failed_locally = False
@@ -66,10 +92,10 @@ def _retrieve_application_url_and_available_commands(app_id_or_name_or_url: Opti
if app_id_or_name_or_url is None:
try:
url = f"http://localhost:{APP_SERVER_PORT}"
- resp = requests.get(f"{url}/api/v1/commands")
+ resp = requests.get(f"{url}/openapi.json")
if resp.status_code != 200:
raise Exception(f"The server didn't process the request properly. Found {resp.json()}")
- return url, resp.json()
+ return url, _extract_command_from_openapi(resp.json())
except requests.exceptions.ConnectionError:
failed_locally = True
@@ -88,8 +114,8 @@ def _retrieve_application_url_and_available_commands(app_id_or_name_or_url: Opti
if lightningapp.id == app_id_or_name_or_url or lightningapp.name == app_id_or_name_or_url:
if lightningapp.status.url == "":
raise Exception("The application is starting. Try in a few moments.")
- resp = requests.get(lightningapp.status.url + "/api/v1/commands")
+ resp = requests.get(lightningapp.status.url + "/openapi.json")
if resp.status_code != 200:
raise Exception(f"The server didn't process the request properly. Found {resp.json()}")
- return lightningapp.status.url, resp.json()
+ return lightningapp.status.url, _extract_command_from_openapi(resp.json())
return None, None
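
Command discovery now piggybacks on the app's standard `/openapi.json`: every path under `/command/` becomes an entry keyed by command name. The extracted metadata looks roughly like this (values are illustrative; `cls_path` and `cls_name` only appear for client commands, via `openapi_extra`):

    api_commands = {
        "command_with_client": {
            "tag": "app_client_command",
            "parameters": {},
            "cls_path": "/path/to/command.py",
            "cls_name": "CustomCommand",
        },
        "command_without_client": {
            "tag": "app_command",
            "parameters": {"name": "string"},
        },
    }
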
diff --git a/src/lightning_app/utilities/commands/base.py b/src/lightning_app/utilities/commands/base.py
index b87b41b05df42..c74926f542744 100644
--- a/src/lightning_app/utilities/commands/base.py
+++ b/src/lightning_app/utilities/commands/base.py
@@ -1,6 +1,5 @@
import errno
import inspect
-import logging
import os
import os.path as osp
import shutil
@@ -8,19 +7,18 @@
from getpass import getuser
from importlib.util import module_from_spec, spec_from_file_location
from tempfile import gettempdir
-from typing import Any, Callable, Dict, List, Optional, Tuple
-from uuid import uuid4
+from typing import Any, Callable, Dict, List, Optional, Union
import requests
from pydantic import BaseModel
+from lightning_app.api.http_methods import Post
+from lightning_app.api.request_types import APIRequest, CommandRequest
from lightning_app.utilities.app_helpers import is_overridden
from lightning_app.utilities.cloud import _get_project
from lightning_app.utilities.network import LightningClient
from lightning_app.utilities.state import AppState
-_logger = logging.getLogger(__name__)
-
def makedirs(path: str):
r"""Recursive directory creation function."""
@@ -31,31 +29,18 @@ def makedirs(path: str):
raise e
-class _ClientCommandConfig(BaseModel):
- command: str
- affiliation: str
- params: Dict[str, str]
- is_client_command: bool
- cls_path: str
- cls_name: str
- owner: str
- requirements: Optional[List[str]]
-
-
class ClientCommand:
def __init__(self, method: Callable, requirements: Optional[List[str]] = None) -> None:
self.method = method
flow = getattr(method, "__self__", None)
self.owner = flow.name if flow else None
self.requirements = requirements
- self.metadata = None
self.models: Optional[Dict[str, BaseModel]] = None
self.app_url = None
self._state = None
- def _setup(self, metadata: Dict[str, Any], models: Dict[str, BaseModel], app_url: str) -> None:
- self.metadata = metadata
- self.models = models
+ def _setup(self, command_name: str, app_url: str) -> None:
+ self.command_name = command_name
self.app_url = app_url
@property
@@ -72,67 +57,50 @@ def state(self):
def run(self, **cli_kwargs) -> None:
"""Overrides with the logic to execute on the client side."""
- def invoke_handler(self, **kwargs: Any) -> Dict[str, Any]:
- from lightning.app.utilities.state import headers_for
-
- assert kwargs.keys() == self.models.keys()
- for k, v in kwargs.items():
- assert isinstance(v, self.models[k])
- json = {
- "command_name": self.metadata["command"],
- "command_arguments": {k: v.json() for k, v in kwargs.items()},
- "affiliation": self.metadata["affiliation"],
- "id": str(uuid4()),
- }
- resp = requests.post(self.app_url + "/api/v1/commands", json=json, headers=headers_for({}))
+ def invoke_handler(self, config: BaseModel) -> Dict[str, Any]:
+ resp = requests.post(self.app_url + f"/command/{self.command_name}", data=config.json())
assert resp.status_code == 200, resp.json()
return resp.json()
def _to_dict(self):
return {"owner": self.owner, "requirements": self.requirements}
- def __call__(self, **kwargs: Any) -> Any:
- assert self.models
- input = {}
- for k, v in kwargs.items():
- input[k] = self.models[k].parse_raw(v)
- return self.method(**input)
+ def __call__(self, **kwargs):
+ return self.method(**kwargs)
def _download_command(
- command_metadata: Dict[str, Any],
- app_id: Optional[str],
+ command_name: str,
+ cls_path: str,
+ cls_name: str,
+ app_id: Optional[str] = None,
debug_mode: bool = False,
-) -> Tuple[ClientCommand, Dict[str, BaseModel]]:
+) -> ClientCommand:
# TODO: This is a skateboard implementation and the final version will rely on versioned
# immutable commands for security concerns
- config = _ClientCommandConfig(**command_metadata)
tmpdir = osp.join(gettempdir(), f"{getuser()}_commands")
makedirs(tmpdir)
- target_file = osp.join(tmpdir, f"{config.command}.py")
+ target_file = osp.join(tmpdir, f"{command_name}.py")
if app_id:
client = LightningClient()
project_id = _get_project(client).project_id
response = client.lightningapp_instance_service_list_lightningapp_instance_artifacts(project_id, app_id)
for artifact in response.artifacts:
- if f"commands/{config.command}.py" == artifact.filename:
+ if f"commands/{command_name}.py" == artifact.filename:
r = requests.get(artifact.url, allow_redirects=True)
with open(target_file, "wb") as f:
f.write(r.content)
else:
if not debug_mode:
- shutil.copy(config.cls_path, target_file)
+ shutil.copy(cls_path, target_file)
- cls_name = config.cls_name
- spec = spec_from_file_location(config.cls_name, config.cls_path if debug_mode else target_file)
+ spec = spec_from_file_location(cls_name, cls_path if debug_mode else target_file)
mod = module_from_spec(spec)
sys.modules[cls_name] = mod
spec.loader.exec_module(mod)
- command = getattr(mod, cls_name)(method=None, requirements=config.requirements)
- models = {k: getattr(mod, v) for k, v in config.params.items()}
- if debug_mode:
- shutil.rmtree(tmpdir)
- return command, models
+ command = getattr(mod, cls_name)(method=None, requirements=[])
+ shutil.rmtree(tmpdir)
+ return command
def _to_annotation(anno: str) -> str:
@@ -142,7 +110,7 @@ def _to_annotation(anno: str) -> str:
return anno
-def _command_to_method_and_metadata(command: ClientCommand) -> Tuple[Callable, Dict[str, Any]]:
+def _validate_client_command(command: ClientCommand):
"""Extract method and its metadata from a ClientCommand."""
params = inspect.signature(command.method).parameters
command_metadata = {
@@ -170,8 +138,6 @@ def _command_to_method_and_metadata(command: ClientCommand) -> Tuple[Callable, D
raise Exception(
f"The provided annotation for the argument {k} shouldn't an instance of pydantic BaseModel."
)
- command.models[k] = config
- return method, command_metadata
def _upload_command(command_name: str, command: ClientCommand) -> Optional[str]:
@@ -192,55 +158,68 @@ def _upload_command(command_name: str, command: ClientCommand) -> Optional[str]:
return filepath
-def _populate_commands_endpoint(app):
+def _prepare_commands(app) -> List:
if not is_overridden("configure_commands", app.root):
- return
+ return []
- # 1: Populate commands metadata
+ # 1: Upload the command to s3.
commands = app.root.configure_commands()
- commands_metadata = []
- command_names = set()
for command_mapping in commands:
for command_name, command in command_mapping.items():
- is_client_command = isinstance(command, ClientCommand)
- extras = {}
- if is_client_command:
+ if isinstance(command, ClientCommand):
_upload_command(command_name, command)
- command, extras = _command_to_method_and_metadata(command)
- if command_name in command_names:
- raise Exception(f"The component name {command_name} has already been used. They need to be unique.")
- command_names.add(command_name)
- params = inspect.signature(command).parameters
- commands_metadata.append(
- {
- "command": command_name,
- "affiliation": command.__self__.name,
- "params": list(params.keys()),
- "is_client_command": is_client_command,
- **extras,
- }
- )
- # 1.2: Pass the collected commands through the queue to the Rest API.
- app.commands_metadata_queue.put(commands_metadata)
+ # 2: Cache the commands on the app.
app.commands = commands
+ return commands
-def _process_command_requests(app):
- if not is_overridden("configure_commands", app.root):
- return
-
- # 1: Populate commands metadata
- commands = app.commands
-
- # 2: Collect requests metadata
- command_query = app.get_state_changed_from_queue(app.commands_requests_queue)
- if command_query:
- for command in commands:
- for command_name, method in command.items():
- if command_query["command_name"] == command_name:
- # 2.1: Evaluate the method associated to a specific command.
- # Validation is done on the CLI side.
- response = method(**command_query["command_arguments"])
- app.commands_responses_queue.put({"response": response, "id": command_query["id"]})
- app._has_updated = True
+def _process_api_request(app, request: APIRequest) -> None:
+ flow = app.get_component_by_name(request.name)
+ method = getattr(flow, request.method_name)
+ response = method(*request.args, **request.kwargs)
+ app.api_response_queue.put({"response": response, "id": request.id})
+
+
+def _process_command_requests(app, request: CommandRequest) -> None:
+ for command in app.commands:
+ for command_name, method in command.items():
+ if request.method_name == command_name:
+ # 2.1: Evaluate the method associated to a specific command.
+ # Validation is done on the CLI side.
+ response = method(*request.args, **request.kwargs)
+ app.api_response_queue.put({"response": response, "id": request.id})
+
+
+def _process_requests(app, request: Union[APIRequest, CommandRequest]) -> None:
+    """Route a request to the API or command handler."""
+ if isinstance(request, APIRequest):
+ _process_api_request(app, request)
+ else:
+ _process_command_requests(app, request)
+
+
+def _collect_open_api_extras(command) -> Dict:
+ if not isinstance(command, ClientCommand):
+ return {}
+ return {
+ "cls_path": inspect.getfile(command.__class__),
+ "cls_name": command.__class__.__name__,
+ }
+
+
+def _commands_to_api(commands: List[Dict[str, Union[Callable, ClientCommand]]]) -> List:
+    """Convert user commands to API endpoints."""
+ api = []
+ for command in commands:
+ for k, v in command.items():
+ api.append(
+ Post(
+ f"/command/{k}",
+ v.method if isinstance(v, ClientCommand) else v,
+ method_name=k,
+ tags=["app_client_command"] if isinstance(v, ClientCommand) else ["app_command"],
+ openapi_extra=_collect_open_api_extras(v),
+ )
+ )
+ return api
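
The tag attached here is what the CLI later reads from the OpenAPI spec to decide between downloading a `ClientCommand` and issuing a plain HTTP call. The rule, isolated as a sketch:

    from lightning_app.utilities.commands.base import ClientCommand

    def tags_for(command) -> list:
        # Commands backed by a ClientCommand run their client locally; plain
        # callables can also be hit directly over HTTP.
        return ["app_client_command"] if isinstance(command, ClientCommand) else ["app_command"]
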
diff --git a/src/lightning_app/utilities/enum.py b/src/lightning_app/utilities/enum.py
index dbf20413aa9d9..2b88d93169930 100644
--- a/src/lightning_app/utilities/enum.py
+++ b/src/lightning_app/utilities/enum.py
@@ -72,3 +72,9 @@ def make_status(stage: str, message: Optional[str] = None, reason: Optional[str]
class CacheCallsKeys:
LATEST_CALL_HASH = "latest_call_hash"
+
+
+class OpenAPITags:
+ APP_CLIENT_COMMAND = "app_client_command"
+ APP_COMMAND = "app_command"
+ APP_API = "app_api"
diff --git a/src/lightning_app/utilities/network.py b/src/lightning_app/utilities/network.py
index 7fd03750a515d..050734723acc1 100644
--- a/src/lightning_app/utilities/network.py
+++ b/src/lightning_app/utilities/network.py
@@ -48,11 +48,12 @@ def _configure_session() -> Session:
return http
-def _check_service_url_is_ready(url: str, timeout: float = 100) -> bool:
+def _check_service_url_is_ready(url: str, timeout: float = 5) -> bool:
try:
response = requests.get(url, timeout=timeout)
return response.status_code in (200, 404)
except (ConnectionError, ConnectTimeout, ReadTimeout):
+ logger.debug(f"The url {url} is not ready.")
return False
diff --git a/tests/tests_app/core/test_lightning_api.py b/tests/tests_app/core/test_lightning_api.py
index edd2896d1951d..1b2bf2fb52fd9 100644
--- a/tests/tests_app/core/test_lightning_api.py
+++ b/tests/tests_app/core/test_lightning_api.py
@@ -2,15 +2,27 @@
import multiprocessing as mp
import os
from copy import deepcopy
+from multiprocessing import Process
+from time import sleep
from unittest import mock
import pytest
+import requests
from deepdiff import DeepDiff, Delta
from httpx import AsyncClient
+from pydantic import BaseModel
from lightning_app import LightningApp, LightningFlow, LightningWork
+from lightning_app.api.http_methods import Post
from lightning_app.core import api
-from lightning_app.core.api import fastapi_service, global_app_state_store, start_server, UIRefresher
+from lightning_app.core.api import (
+ fastapi_service,
+ global_app_state_store,
+ register_global_routes,
+ start_server,
+ UIRefresher,
+)
+from lightning_app.core.constants import APP_SERVER_PORT
from lightning_app.runners import MultiProcessRuntime, SingleProcessRuntime
from lightning_app.storage.drive import Drive
from lightning_app.testing.helpers import MockQueue
@@ -20,6 +32,8 @@
from lightning_app.utilities.redis import check_if_redis_running
from lightning_app.utilities.state import AppState, headers_for
+register_global_routes()
+
class WorkA(LightningWork):
def __init__(self):
@@ -161,12 +175,11 @@ def test_update_publish_state_and_maybe_refresh_ui():
app = AppStageTestingApp(FlowA(), debug=True)
publish_state_queue = MockQueue("publish_state_queue")
- commands_metadata_queue = MockQueue("commands_metadata_queue")
- commands_responses_queue = MockQueue("commands_metadata_queue")
+ api_response_queue = MockQueue("api_response_queue")
publish_state_queue.put(app.state_with_changes)
- thread = UIRefresher(publish_state_queue, commands_metadata_queue, commands_responses_queue)
+ thread = UIRefresher(publish_state_queue, api_response_queue)
thread.run_once()
assert global_app_state_store.get_app_state("1234") == app.state_with_changes
@@ -192,18 +205,14 @@ def get(self, timeout: int = 0):
publish_state_queue = InfiniteQueue("publish_state_queue")
change_state_queue = MockQueue("change_state_queue")
has_started_queue = MockQueue("has_started_queue")
- commands_requests_queue = MockQueue("commands_requests_queue")
- commands_responses_queue = MockQueue("commands_responses_queue")
- commands_metadata_queue = MockQueue("commands_metadata_queue")
+ api_response_queue = MockQueue("api_response_queue")
state = app.state_with_changes
publish_state_queue.put(state)
spec = extract_metadata_from_app(app)
ui_refresher = start_server(
publish_state_queue,
change_state_queue,
- commands_requests_queue,
- commands_responses_queue,
- commands_metadata_queue,
+ api_response_queue,
has_started_queue=has_started_queue,
uvicorn_run=False,
spec=spec,
@@ -343,16 +352,12 @@ def test_start_server_started():
api_publish_state_queue = mp.Queue()
api_delta_queue = mp.Queue()
has_started_queue = mp.Queue()
- commands_requests_queue = mp.Queue()
- commands_responses_queue = mp.Queue()
- commands_metadata_queue = mp.Queue()
+ api_response_queue = mp.Queue()
kwargs = dict(
api_publish_state_queue=api_publish_state_queue,
api_delta_queue=api_delta_queue,
has_started_queue=has_started_queue,
- commands_requests_queue=commands_requests_queue,
- commands_responses_queue=commands_responses_queue,
- commands_metadata_queue=commands_metadata_queue,
+ api_response_queue=api_response_queue,
port=1111,
)
@@ -372,18 +377,14 @@ def test_start_server_info_message(ui_refresher, uvicorn_run, caplog, monkeypatc
api_publish_state_queue = MockQueue()
api_delta_queue = MockQueue()
has_started_queue = MockQueue()
- commands_requests_queue = MockQueue()
- commands_responses_queue = MockQueue()
- commands_metadata_queue = MockQueue()
+ api_response_queue = MockQueue()
kwargs = dict(
host=host,
port=1111,
api_publish_state_queue=api_publish_state_queue,
api_delta_queue=api_delta_queue,
has_started_queue=has_started_queue,
- commands_requests_queue=commands_requests_queue,
- commands_responses_queue=commands_responses_queue,
- commands_metadata_queue=commands_metadata_queue,
+ api_response_queue=api_response_queue,
)
monkeypatch.setattr(api, "logger", logging.getLogger())
@@ -395,3 +396,65 @@ def test_start_server_info_message(ui_refresher, uvicorn_run, caplog, monkeypatc
ui_refresher.assert_called_once()
uvicorn_run.assert_called_once_with(host="0.0.0.1", port=1111, log_level="error", app=mock.ANY)
+
+
+class InputRequestModel(BaseModel):
+ name: str
+
+
+class OutputRequestModel(BaseModel):
+ name: str
+ counter: int
+
+
+class FlowAPI(LightningFlow):
+ def __init__(self):
+ super().__init__()
+ self.counter = 0
+
+ def run(self):
+ if self.counter == 2:
+ sleep(0.5)
+ self._exit()
+
+ def request(self, config: InputRequestModel) -> OutputRequestModel:
+ self.counter += 1
+ return OutputRequestModel(name=config.name, counter=self.counter)
+
+ def configure_api(self):
+ return [Post("/api/v1/request", self.request)]
+
+
+def target():
+ app = LightningApp(FlowAPI())
+ MultiProcessRuntime(app).dispatch()
+
+
+def test_configure_api():
+    process = Process(target=target)
+ process.start()
+ time_left = 15
+ while time_left > 0:
+ try:
+ requests.get(f"http://localhost:{APP_SERVER_PORT}/healthz")
+ break
+ except requests.exceptions.ConnectionError:
+ sleep(0.1)
+ time_left -= 0.1
+
+ response = requests.post(
+ f"http://localhost:{APP_SERVER_PORT}/api/v1/request", data=InputRequestModel(name="hello").json()
+ )
+ assert response.json() == {"name": "hello", "counter": 1}
+ response = requests.post(
+ f"http://localhost:{APP_SERVER_PORT}/api/v1/request", data=InputRequestModel(name="hello").json()
+ )
+ assert response.json() == {"name": "hello", "counter": 2}
+ time_left = 15
+ while time_left > 0:
+ if process.exitcode == 0:
+ break
+ sleep(0.1)
+ time_left -= 0.1
+ assert process.exitcode == 0
diff --git a/tests/tests_app/utilities/test_app_logs.py b/tests/tests_app/utilities/test_app_logs.py
index e7384dd72d6e2..7a0fe087e7c29 100644
--- a/tests/tests_app/utilities/test_app_logs.py
+++ b/tests/tests_app/utilities/test_app_logs.py
@@ -1,4 +1,5 @@
from datetime import datetime
+from time import sleep
from unittest.mock import MagicMock
from lightning_app.utilities.app_logs import _LogEvent
@@ -6,6 +7,7 @@
def test_log_event():
event_1 = _LogEvent("", datetime.now(), MagicMock(), MagicMock())
+ sleep(0.1)
event_2 = _LogEvent("", datetime.now(), MagicMock(), MagicMock())
assert event_1 < event_2
assert event_1 <= event_2
diff --git a/tests/tests_app/utilities/test_commands.py b/tests/tests_app/utilities/test_commands.py
index ed7f386395282..1be35a3a2e290 100644
--- a/tests/tests_app/utilities/test_commands.py
+++ b/tests/tests_app/utilities/test_commands.py
@@ -14,7 +14,7 @@
from lightning_app.core.constants import APP_SERVER_PORT
from lightning_app.runners import MultiProcessRuntime
from lightning_app.testing.helpers import RunIf
-from lightning_app.utilities.commands.base import _command_to_method_and_metadata, _download_command, ClientCommand
+from lightning_app.utilities.commands.base import _download_command, _validate_client_command, ClientCommand
from lightning_app.utilities.state import AppState
@@ -25,7 +25,6 @@ class SweepConfig(BaseModel):
class SweepCommand(ClientCommand):
def run(self) -> None:
- print(sys.argv)
parser = argparse.ArgumentParser()
parser.add_argument("--sweep_name", type=str)
parser.add_argument("--num_trials", type=int)
@@ -91,15 +90,15 @@ def run_failure_2(name: CustomModel):
@RunIf(skip_windows=True)
-def test_command_to_method_and_metadata():
+def test_validate_client_command():
with pytest.raises(Exception, match="The provided annotation for the argument name"):
- _command_to_method_and_metadata(ClientCommand(run_failure_0))
+ _validate_client_command(ClientCommand(run_failure_0))
with pytest.raises(Exception, match="annotate your method"):
- _command_to_method_and_metadata(ClientCommand(run_failure_1))
+ _validate_client_command(ClientCommand(run_failure_1))
with pytest.raises(Exception, match="lightning_app/utilities/commands/base.py"):
- _command_to_method_and_metadata(ClientCommand(run_failure_2))
+ _validate_client_command(ClientCommand(run_failure_2))
def test_client_commands(monkeypatch):
@@ -115,17 +114,13 @@ def test_client_commands(monkeypatch):
url = "http//"
kwargs = {"something": "1", "something_else": "1"}
command = DummyCommand(run)
- _, command_metadata = _command_to_method_and_metadata(command)
- command_metadata.update(
- {
- "command": "dummy",
- "affiliation": "root",
- "is_client_command": True,
- "owner": "root",
- }
+ _validate_client_command(command)
+ client_command = _download_command(
+ command_name="something",
+ cls_path=__file__,
+ cls_name="DummyCommand",
)
- client_command, models = _download_command(command_metadata, None)
- client_command._setup(metadata=command_metadata, models=models, app_url=url)
+ client_command._setup("something", app_url=url)
client_command.run(**kwargs)
@@ -153,10 +148,12 @@ def test_configure_commands(monkeypatch):
state = AppState()
state._request_state()
assert state.names == ["something"]
- monkeypatch.setattr(sys, "argv", ["lightning", "sweep", "--sweep_name", "my_name", "--num_trials", "1"])
+ monkeypatch.setattr(sys, "argv", ["lightning", "sweep", "--sweep_name=my_name", "--num_trials=1"])
app_command()
time_left = 15
- while time_left > 0 and process.exitcode != 0:
+ while time_left > 0:
+ if process.exitcode == 0:
+ break
sleep(0.1)
time_left -= 0.1
assert process.exitcode == 0
diff --git a/tests/tests_app_examples/test_commands.py b/tests/tests_app_examples/test_commands.py
deleted file mode 100644
index 236e587e23101..0000000000000
--- a/tests/tests_app_examples/test_commands.py
+++ /dev/null
@@ -1,32 +0,0 @@
-import os
-from subprocess import Popen
-from time import sleep
-from unittest import mock
-
-import pytest
-from tests_app import _PROJECT_ROOT
-
-from lightning_app.testing.testing import run_app_in_cloud
-
-
-@mock.patch.dict(os.environ, {"SKIP_LIGHTING_UTILITY_WHEELS_BUILD": "0"})
-@pytest.mark.cloud
-def test_commands_example_cloud() -> None:
- with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "examples/app_commands")) as (
- admin_page,
- _,
- fetch_logs,
- _,
- ):
- app_id = admin_page.url.split("/")[-1]
- cmd = f"lightning trigger_with_client_command --name=something --app_id {app_id}"
- Popen(cmd, shell=True).wait()
- cmd = f"lightning trigger_without_client_command --name=else --app_id {app_id}"
- Popen(cmd, shell=True).wait()
-
- has_logs = False
- while not has_logs:
- for log in fetch_logs(["flow"]):
- if "['something', 'else']" in log:
- has_logs = True
- sleep(1)
diff --git a/tests/tests_app_examples/test_commands_and_api.py b/tests/tests_app_examples/test_commands_and_api.py
new file mode 100644
index 0000000000000..8d84cf4847ebd
--- /dev/null
+++ b/tests/tests_app_examples/test_commands_and_api.py
@@ -0,0 +1,42 @@
+import os
+from subprocess import Popen
+from time import sleep
+
+import pytest
+import requests
+from tests_app import _PROJECT_ROOT
+
+from lightning_app.testing.testing import run_app_in_cloud
+
+
+@pytest.mark.cloud
+def test_commands_and_api_example_cloud() -> None:
+ with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "examples/app_commands_and_api")) as (
+ admin_page,
+ view_page,
+ fetch_logs,
+ _,
+ ):
+ # 1: Collect the app_id
+ app_id = admin_page.url.split("/")[-1]
+
+ # 2: Send the first command with the client
+ cmd = f"lightning command_with_client --name=this --app_id {app_id}"
+ Popen(cmd, shell=True).wait()
+
+ # 3: Send the second command without a client
+ cmd = f"lightning command_without_client --name=is --app_id {app_id}"
+ Popen(cmd, shell=True).wait()
+
+ # 4: Send a request to the Rest API directly.
+ base_url = view_page.url.replace("/view", "").replace("/child_flow", "")
+ resp = requests.post(base_url + "/user/command_without_client?name=awesome")
+ assert resp.status_code == 200, resp.json()
+
+ # 5: Validate the logs.
+ has_logs = False
+ while not has_logs:
+ for log in fetch_logs():
+ if "['this', 'is', 'awesome']" in log:
+ has_logs = True
+ sleep(1)
From 7c8c996f6acbcd3f497b6274bc96e7767e5695b1 Mon Sep 17 00:00:00 2001
From: Rick Izzo
Date: Thu, 11 Aug 2022 09:22:59 -0400
Subject: [PATCH 53/59] Feature GRID-9731: Update Lightning Cloud.py Backend to
Accept Drive Specs (2/2) (#14106)
Initial work adding drives to the create-work API from the framework cloud dispatcher.
---
requirements/app/base.txt | 2 +-
src/lightning_app/runners/cloud.py | 43 ++++
src/lightning_app/storage/drive.py | 2 +-
tests/tests_app/runners/test_cloud.py | 356 ++++++++++++++++++++++++++
4 files changed, 401 insertions(+), 2 deletions(-)
diff --git a/requirements/app/base.txt b/requirements/app/base.txt
index 02eeb04bfa218..fcde2f18a300a 100644
--- a/requirements/app/base.txt
+++ b/requirements/app/base.txt
@@ -1,4 +1,4 @@
-lightning-cloud==0.5.0
+lightning-cloud==0.5.3
packaging
deepdiff>=5.7.0, <=5.8.1
starsessions
diff --git a/src/lightning_app/runners/cloud.py b/src/lightning_app/runners/cloud.py
index 957b60b5d2ab5..2cd98ebe4cf68 100644
--- a/src/lightning_app/runners/cloud.py
+++ b/src/lightning_app/runners/cloud.py
@@ -18,15 +18,22 @@
Gridv1ImageSpec,
V1BuildSpec,
V1DependencyFileInfo,
+ V1Drive,
+ V1DriveSpec,
+ V1DriveStatus,
+ V1DriveType,
V1EnvVar,
V1Flowserver,
V1LightningappInstanceSpec,
V1LightningappInstanceState,
+ V1LightningworkDrives,
V1LightningworkSpec,
+ V1Metadata,
V1NetworkConfig,
V1PackageManager,
V1ProjectClusterBinding,
V1PythonDependencyInfo,
+ V1SourceType,
V1UserRequestedComputeConfig,
V1Work,
)
@@ -36,6 +43,7 @@
from lightning_app.runners.backends.cloud import CloudBackend
from lightning_app.runners.runtime import Runtime
from lightning_app.source_code import LocalSourceCodeDir
+from lightning_app.storage import Drive
from lightning_app.utilities.cloud import _get_project
from lightning_app.utilities.dependency_caching import get_hash
from lightning_app.utilities.packaging.app_config import AppConfig, find_config_file
@@ -107,10 +115,45 @@ def dispatch(
preemptible=work.cloud_compute.preemptible,
shm_size=work.cloud_compute.shm_size,
)
+
+ drive_specs: List[V1LightningworkDrives] = []
+ for drive_attr_name, drive in [
+ (k, getattr(work, k)) for k in work._state if isinstance(getattr(work, k), Drive)
+ ]:
+ if drive.protocol == "lit://":
+ drive_type = V1DriveType.NO_MOUNT_S3
+ source_type = V1SourceType.S3
+ elif drive.protocol == "s3://":
+ drive_type = V1DriveType.INDEXED_S3
+ source_type = V1SourceType.S3
+ else:
+ raise RuntimeError(
+ f"unknown drive protocol `{drive.protocol}`. Please verify this "
+ f"drive type has been configured for use in the cloud dispatcher."
+ )
+
+ drive_specs.append(
+ V1LightningworkDrives(
+ drive=V1Drive(
+ metadata=V1Metadata(
+ name=f"{work.name}.{drive_attr_name}",
+ ),
+ spec=V1DriveSpec(
+ drive_type=drive_type,
+ source_type=source_type,
+ source=f"{drive.protocol}{drive.id}",
+ ),
+ status=V1DriveStatus(),
+ ),
+ mount_location=str(drive.root_folder),
+ ),
+ )
+
random_name = "".join(random.choice(string.ascii_lowercase) for _ in range(5))
spec = V1LightningworkSpec(
build_spec=build_spec,
cluster_id=cluster_id,
+ drives=drive_specs,
user_requested_compute_config=user_compute_config,
network_config=[V1NetworkConfig(name=random_name, port=work.port)],
)
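
The protocol-to-spec mapping only recognizes `lit://` and `s3://` drives; anything else fails fast at dispatch time. The mapping, pulled out for clarity (import path assumed from the lightning-cloud client package):

    from lightning_cloud.openapi import V1DriveType, V1SourceType

    def drive_spec_types(protocol: str):
        if protocol == "lit://":
            return V1DriveType.NO_MOUNT_S3, V1SourceType.S3
        if protocol == "s3://":
            return V1DriveType.INDEXED_S3, V1SourceType.S3
        raise RuntimeError(f"unknown drive protocol `{protocol}`.")
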
diff --git a/src/lightning_app/storage/drive.py b/src/lightning_app/storage/drive.py
index b69d2581851b8..f72ad38b6e130 100644
--- a/src/lightning_app/storage/drive.py
+++ b/src/lightning_app/storage/drive.py
@@ -59,7 +59,7 @@ def __init__(
if self.protocol != "s3://" and "/" in self.id:
raise Exception(f"The id should be unique to identify your drive. Found `{self.id}`.")
- self.root_folder = pathlib.Path(root_folder).resolve() if root_folder else os.getcwd()
+ self.root_folder = pathlib.Path(root_folder).resolve() if root_folder else pathlib.Path(os.getcwd())
if not os.path.isdir(self.root_folder):
raise Exception(f"The provided root_folder isn't a directory: {root_folder}")
self.component_name = component_name
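
The one-line fix means `root_folder` is a `pathlib.Path` in both branches; previously the no-argument branch kept the plain string returned by `os.getcwd()`. A quick check (the drive id is hypothetical):

    import pathlib

    from lightning_app.storage import Drive

    drive = Drive("lit://checkpoints")
    assert isinstance(drive.root_folder, pathlib.Path)
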
diff --git a/tests/tests_app/runners/test_cloud.py b/tests/tests_app/runners/test_cloud.py
index 4b1cf08e8554d..640eb9c114c2d 100644
--- a/tests/tests_app/runners/test_cloud.py
+++ b/tests/tests_app/runners/test_cloud.py
@@ -1,4 +1,5 @@
import logging
+from copy import copy
from pathlib import Path
from unittest import mock
from unittest.mock import MagicMock
@@ -9,21 +10,29 @@
Gridv1ImageSpec,
V1BuildSpec,
V1DependencyFileInfo,
+ V1Drive,
+ V1DriveSpec,
+ V1DriveStatus,
+ V1DriveType,
V1LightningappInstanceState,
+ V1LightningworkDrives,
V1LightningworkSpec,
V1ListLightningappInstancesResponse,
V1ListMembershipsResponse,
V1Membership,
+ V1Metadata,
V1NetworkConfig,
V1PackageManager,
V1ProjectClusterBinding,
V1PythonDependencyInfo,
+ V1SourceType,
V1UserRequestedComputeConfig,
V1Work,
)
from lightning_app import LightningApp, LightningWork
from lightning_app.runners import backends, cloud
+from lightning_app.storage import Drive
from lightning_app.utilities.cloud import _get_project
from lightning_app.utilities.dependency_caching import get_hash
@@ -33,6 +42,25 @@ def run(self):
print("my run")
+class WorkWithSingleDrive(LightningWork):
+ def __init__(self):
+ super().__init__()
+ self.drive = None
+
+ def run(self):
+ pass
+
+
+class WorkWithTwoDrives(LightningWork):
+ def __init__(self):
+ super().__init__()
+ self.lit_drive = None
+ self.s3_drive = None
+
+ def run(self):
+ pass
+
+
class TestAppCreationClient:
"""Testing the calls made using GridRestClient to create the app."""
@@ -250,6 +278,134 @@ def test_call_with_work_app(self, lightningapps, monkeypatch, tmpdir):
),
image="random_base_public_image",
),
+ drives=[],
+ user_requested_compute_config=V1UserRequestedComputeConfig(
+ name="default", count=1, disk_size=0, preemptible=False, shm_size=0
+ ),
+ network_config=[V1NetworkConfig(name=mock.ANY, host=None, port=8080)],
+ ),
+ )
+ ],
+ )
+ mock_client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with(
+ "test-project-id", mock.ANY, expected_body
+ )
+
+ # running dispatch with disabled dependency cache
+ mock_client.reset_mock()
+ monkeypatch.setattr(cloud, "DISABLE_DEPENDENCY_CACHE", True)
+ expected_body.dependency_cache_key = None
+ cloud_runtime.dispatch()
+ mock_client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with(
+ "test-project-id", mock.ANY, expected_body
+ )
+ else:
+ mock_client.lightningapp_v2_service_create_lightningapp_release_instance.assert_called_once_with(
+ "test-project-id", mock.ANY, mock.ANY, mock.ANY
+ )
+
+ @mock.patch("lightning_app.runners.backends.cloud.LightningClient", mock.MagicMock())
+ @pytest.mark.parametrize("lightningapps", [[], [MagicMock()]])
+ def test_call_with_work_app_and_attached_drives(self, lightningapps, monkeypatch, tmpdir):
+ source_code_root_dir = Path(tmpdir / "src").absolute()
+ source_code_root_dir.mkdir()
+ Path(source_code_root_dir / ".lightning").write_text("name: myapp")
+ requirements_file = Path(source_code_root_dir / "requirements.txt")
+ Path(requirements_file).touch()
+
+ mock_client = mock.MagicMock()
+ if lightningapps:
+ lightningapps[0].status.phase = V1LightningappInstanceState.STOPPED
+ mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = (
+ V1ListLightningappInstancesResponse(lightningapps=lightningapps)
+ )
+ lightning_app_instance = MagicMock()
+ mock_client.lightningapp_v2_service_create_lightningapp_release = MagicMock(return_value=lightning_app_instance)
+ mock_client.lightningapp_v2_service_create_lightningapp_release_instance = MagicMock(
+ return_value=lightning_app_instance
+ )
+ existing_instance = MagicMock()
+ existing_instance.status.phase = V1LightningappInstanceState.STOPPED
+ mock_client.lightningapp_service_get_lightningapp = MagicMock(return_value=existing_instance)
+ cloud_backend = mock.MagicMock()
+ cloud_backend.client = mock_client
+ monkeypatch.setattr(backends, "CloudBackend", mock.MagicMock(return_value=cloud_backend))
+ monkeypatch.setattr(cloud, "LocalSourceCodeDir", mock.MagicMock())
+ monkeypatch.setattr(cloud, "_prepare_lightning_wheels_and_requirements", mock.MagicMock())
+ app = mock.MagicMock()
+ flow = mock.MagicMock()
+
+ mocked_drive = MagicMock(spec=Drive)
+ setattr(mocked_drive, "id", "foobar")
+ setattr(mocked_drive, "protocol", "lit://")
+ setattr(mocked_drive, "component_name", "test-work")
+ setattr(mocked_drive, "allow_duplicates", False)
+ setattr(mocked_drive, "root_folder", tmpdir)
+ # deepcopy on a MagicMock instance returns an empty MagicMock instance. To
+ # overcome this we set the __deepcopy__ method's `return_value` to what the
+ # result of the deepcopy operation should be (an instance of the original
+ # class); see the standalone sketch after this patch.
+ mocked_drive.__deepcopy__.return_value = copy(mocked_drive)
+
+ work = WorkWithSingleDrive()
+ monkeypatch.setattr(work, "drive", mocked_drive)
+ monkeypatch.setattr(work, "_state", {"_port", "drive"})
+ monkeypatch.setattr(work, "_name", "test-work")
+ monkeypatch.setattr(work._cloud_build_config, "build_commands", lambda: ["echo 'start'"])
+ monkeypatch.setattr(work._cloud_build_config, "requirements", ["torch==1.0.0", "numpy==1.0.0"])
+ monkeypatch.setattr(work._cloud_build_config, "image", "random_base_public_image")
+ monkeypatch.setattr(work._cloud_compute, "disk_size", 0)
+ monkeypatch.setattr(work._cloud_compute, "preemptible", False)
+ monkeypatch.setattr(work, "_port", 8080)
+
+ flow.works = lambda recurse: [work]
+ app.flows = [flow]
+ cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file=(source_code_root_dir / "entrypoint.py"))
+ monkeypatch.setattr(
+ "lightning_app.runners.cloud._get_project",
+ lambda x: V1Membership(name="test-project", project_id="test-project-id"),
+ )
+ cloud_runtime.dispatch()
+
+ if lightningapps:
+ expected_body = Body8(
+ description=None,
+ local_source=True,
+ app_entrypoint_file="entrypoint.py",
+ enable_app_server=True,
+ flow_servers=[],
+ dependency_cache_key=get_hash(requirements_file),
+ image_spec=Gridv1ImageSpec(
+ dependency_file_info=V1DependencyFileInfo(
+ package_manager=V1PackageManager.PIP, path="requirements.txt"
+ )
+ ),
+ works=[
+ V1Work(
+ name="test-work",
+ spec=V1LightningworkSpec(
+ build_spec=V1BuildSpec(
+ commands=["echo 'start'"],
+ python_dependencies=V1PythonDependencyInfo(
+ package_manager=V1PackageManager.PIP, packages="torch==1.0.0\nnumpy==1.0.0"
+ ),
+ image="random_base_public_image",
+ ),
+ drives=[
+ V1LightningworkDrives(
+ drive=V1Drive(
+ metadata=V1Metadata(
+ name="test-work.drive",
+ ),
+ spec=V1DriveSpec(
+ drive_type=V1DriveType.NO_MOUNT_S3,
+ source_type=V1SourceType.S3,
+ source="lit://foobar",
+ ),
+ status=V1DriveStatus(),
+ ),
+ mount_location=str(tmpdir),
+ ),
+ ],
user_requested_compute_config=V1UserRequestedComputeConfig(
name="default", count=1, disk_size=0, preemptible=False, shm_size=0
),
@@ -275,6 +431,206 @@ def test_call_with_work_app(self, lightningapps, monkeypatch, tmpdir):
"test-project-id", mock.ANY, mock.ANY, mock.ANY
)
+ @mock.patch("lightning_app.runners.backends.cloud.LightningClient", mock.MagicMock())
+ @pytest.mark.parametrize("lightningapps", [[], [MagicMock()]])
+ def test_call_with_work_app_and_multiple_attached_drives(self, lightningapps, monkeypatch, tmpdir):
+ source_code_root_dir = Path(tmpdir / "src").absolute()
+ source_code_root_dir.mkdir()
+ Path(source_code_root_dir / ".lightning").write_text("name: myapp")
+ requirements_file = Path(source_code_root_dir / "requirements.txt")
+ Path(requirements_file).touch()
+
+ mock_client = mock.MagicMock()
+ if lightningapps:
+ lightningapps[0].status.phase = V1LightningappInstanceState.STOPPED
+ mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = (
+ V1ListLightningappInstancesResponse(lightningapps=lightningapps)
+ )
+ lightning_app_instance = MagicMock()
+ mock_client.lightningapp_v2_service_create_lightningapp_release = MagicMock(return_value=lightning_app_instance)
+ mock_client.lightningapp_v2_service_create_lightningapp_release_instance = MagicMock(
+ return_value=lightning_app_instance
+ )
+ existing_instance = MagicMock()
+ existing_instance.status.phase = V1LightningappInstanceState.STOPPED
+ mock_client.lightningapp_service_get_lightningapp = MagicMock(return_value=existing_instance)
+ cloud_backend = mock.MagicMock()
+ cloud_backend.client = mock_client
+ monkeypatch.setattr(backends, "CloudBackend", mock.MagicMock(return_value=cloud_backend))
+ monkeypatch.setattr(cloud, "LocalSourceCodeDir", mock.MagicMock())
+ monkeypatch.setattr(cloud, "_prepare_lightning_wheels_and_requirements", mock.MagicMock())
+ app = mock.MagicMock()
+ flow = mock.MagicMock()
+
+ mocked_lit_drive = MagicMock(spec=Drive)
+ setattr(mocked_lit_drive, "id", "foobar")
+ setattr(mocked_lit_drive, "protocol", "lit://")
+ setattr(mocked_lit_drive, "component_name", "test-work")
+ setattr(mocked_lit_drive, "allow_duplicates", False)
+ setattr(mocked_lit_drive, "root_folder", tmpdir)
+ # deepcopy on a MagicMock instance returns an empty MagicMock instance. To
+ # overcome this we set the __deepcopy__ method's `return_value` to what the
+ # result of the deepcopy operation should be (an instance of the original class)
+ mocked_lit_drive.__deepcopy__.return_value = copy(mocked_lit_drive)
+
+ mocked_s3_drive = MagicMock(spec=Drive)
+ setattr(mocked_s3_drive, "id", "some-bucket/path/")
+ setattr(mocked_s3_drive, "protocol", "s3://")
+ setattr(mocked_s3_drive, "component_name", "test-work")
+ setattr(mocked_s3_drive, "allow_duplicates", False)
+ setattr(mocked_s3_drive, "root_folder", "/hello/")
+ # deepcopy on a MagicMock instance returns an empty MagicMock instance. To
+ # overcome this we set the __deepcopy__ method's `return_value` to what the
+ # result of the deepcopy operation should be (an instance of the original class)
+ mocked_s3_drive.__deepcopy__.return_value = copy(mocked_s3_drive)
+
+ work = WorkWithTwoDrives()
+ monkeypatch.setattr(work, "lit_drive", mocked_lit_drive)
+ monkeypatch.setattr(work, "s3_drive", mocked_s3_drive)
+ monkeypatch.setattr(work, "_state", {"_port", "_name", "lit_drive", "s3_drive"})
+ monkeypatch.setattr(work, "_name", "test-work")
+ monkeypatch.setattr(work._cloud_build_config, "build_commands", lambda: ["echo 'start'"])
+ monkeypatch.setattr(work._cloud_build_config, "requirements", ["torch==1.0.0", "numpy==1.0.0"])
+ monkeypatch.setattr(work._cloud_build_config, "image", "random_base_public_image")
+ monkeypatch.setattr(work._cloud_compute, "disk_size", 0)
+ monkeypatch.setattr(work._cloud_compute, "preemptible", False)
+ monkeypatch.setattr(work, "_port", 8080)
+
+ flow.works = lambda recurse: [work]
+ app.flows = [flow]
+ cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file=(source_code_root_dir / "entrypoint.py"))
+ monkeypatch.setattr(
+ "lightning_app.runners.cloud._get_project",
+ lambda x: V1Membership(name="test-project", project_id="test-project-id"),
+ )
+ cloud_runtime.dispatch()
+
+ if lightningapps:
+ s3_drive_spec = V1LightningworkDrives(
+ drive=V1Drive(
+ metadata=V1Metadata(
+ name="test-work.s3_drive",
+ ),
+ spec=V1DriveSpec(
+ drive_type=V1DriveType.INDEXED_S3,
+ source_type=V1SourceType.S3,
+ source="s3://some-bucket/path/",
+ ),
+ status=V1DriveStatus(),
+ ),
+ mount_location="/hello/",
+ )
+ lit_drive_spec = V1LightningworkDrives(
+ drive=V1Drive(
+ metadata=V1Metadata(
+ name="test-work.lit_drive",
+ ),
+ spec=V1DriveSpec(
+ drive_type=V1DriveType.NO_MOUNT_S3,
+ source_type=V1SourceType.S3,
+ source="lit://foobar",
+ ),
+ status=V1DriveStatus(),
+ ),
+ mount_location=str(tmpdir),
+ )
+
+ # the order of drives in the spec is non-deterministic, so there are two
+ # options for the expected body value, depending on which drive appears first in the list.
+
+ expected_body_option_1 = Body8(
+ description=None,
+ local_source=True,
+ app_entrypoint_file="entrypoint.py",
+ enable_app_server=True,
+ flow_servers=[],
+ dependency_cache_key=get_hash(requirements_file),
+ image_spec=Gridv1ImageSpec(
+ dependency_file_info=V1DependencyFileInfo(
+ package_manager=V1PackageManager.PIP, path="requirements.txt"
+ )
+ ),
+ works=[
+ V1Work(
+ name="test-work",
+ spec=V1LightningworkSpec(
+ build_spec=V1BuildSpec(
+ commands=["echo 'start'"],
+ python_dependencies=V1PythonDependencyInfo(
+ package_manager=V1PackageManager.PIP, packages="torch==1.0.0\nnumpy==1.0.0"
+ ),
+ image="random_base_public_image",
+ ),
+ drives=[lit_drive_spec, s3_drive_spec],
+ user_requested_compute_config=V1UserRequestedComputeConfig(
+ name="default", count=1, disk_size=0, preemptible=False, shm_size=0
+ ),
+ network_config=[V1NetworkConfig(name=mock.ANY, host=None, port=8080)],
+ ),
+ )
+ ],
+ )
+
+ expected_body_option_2 = Body8(
+ description=None,
+ local_source=True,
+ app_entrypoint_file="entrypoint.py",
+ enable_app_server=True,
+ flow_servers=[],
+ dependency_cache_key=get_hash(requirements_file),
+ image_spec=Gridv1ImageSpec(
+ dependency_file_info=V1DependencyFileInfo(
+ package_manager=V1PackageManager.PIP, path="requirements.txt"
+ )
+ ),
+ works=[
+ V1Work(
+ name="test-work",
+ spec=V1LightningworkSpec(
+ build_spec=V1BuildSpec(
+ commands=["echo 'start'"],
+ python_dependencies=V1PythonDependencyInfo(
+ package_manager=V1PackageManager.PIP, packages="torch==1.0.0\nnumpy==1.0.0"
+ ),
+ image="random_base_public_image",
+ ),
+ drives=[s3_drive_spec, lit_drive_spec],
+ user_requested_compute_config=V1UserRequestedComputeConfig(
+ name="default", count=1, disk_size=0, preemptible=False, shm_size=0
+ ),
+ network_config=[V1NetworkConfig(name=mock.ANY, host=None, port=8080)],
+ ),
+ )
+ ],
+ )
+
+ # try both options for the expected body to avoid spurious test
+ # failures caused by the non-deterministic drive ordering
+
+ expected_body = expected_body_option_1
+ try:
+ mock_client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with(
+ "test-project-id", mock.ANY, expected_body
+ )
+ except Exception:
+ expected_body = expected_body_option_2
+ mock_client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with(
+ "test-project-id", mock.ANY, expected_body
+ )
+
+ # running dispatch with disabled dependency cache
+ mock_client.reset_mock()
+ monkeypatch.setattr(cloud, "DISABLE_DEPENDENCY_CACHE", True)
+ expected_body.dependency_cache_key = None
+ cloud_runtime.dispatch()
+ mock_client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with(
+ "test-project-id", mock.ANY, expected_body
+ )
+ else:
+ mock_client.lightningapp_v2_service_create_lightningapp_release_instance.assert_called_once_with(
+ "test-project-id", mock.ANY, mock.ANY, mock.ANY
+ )
+
@mock.patch("lightning_app.core.queues.QueuingSystem", MagicMock())
@mock.patch("lightning_app.runners.backends.cloud.LightningClient", MagicMock())
From 98ded4524f373d906aac475b6a7599b6f1661c39 Mon Sep 17 00:00:00 2001
From: Raphael Randschau
Date: Thu, 11 Aug 2022 08:19:21 -0700
Subject: [PATCH 54/59] [CLI] change cluster creation cost savings mode default
(#14132)
* [CLI] change cluster creation cost savings mode default
Instead of having customers opt into cost savings mode, we make it the default and ask them to opt out of it via the new --enable-performance flag.
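A minimal sketch of the resulting opt-out wiring (a hypothetical standalone command, not the real CLI; the cluster manager call is elided):

import click

@click.command()
@click.option(
    "--enable-performance",
    "enable_performance",
    is_flag=True,
    default=False,
    help="Optimize the cluster for performance; cost savings stays the default.",
)
def create_cluster(enable_performance: bool) -> None:
    # the manager still takes cost_savings, so the new flag is simply inverted
    cost_savings = not enable_performance
    click.echo(f"cost_savings={cost_savings}")

if __name__ == "__main__":
    create_cluster()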
---
src/lightning_app/CHANGELOG.md | 2 +-
src/lightning_app/cli/lightning_cli_create.py | 14 +++++++-------
tests/tests_app/cli/test_cli.py | 17 ++++++++++-------
3 files changed, 18 insertions(+), 15 deletions(-)
diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md
index 7158d1ff7a2da..2aa5c7cdd837c 100644
--- a/src/lightning_app/CHANGELOG.md
+++ b/src/lightning_app/CHANGELOG.md
@@ -28,7 +28,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Add support for `Lightning API` through the `configure_api` hook on the Lightning Flow and the `Post`, `Get`, `Delete`, `Put` HttpMethods ([#13945](https://github.com/Lightning-AI/lightning/pull/13945))
### Changed
--
+- Default values and parameter names for Lightning AI BYOC cluster management ([#14132](https://github.com/Lightning-AI/lightning/pull/14132))
### Changed
diff --git a/src/lightning_app/cli/lightning_cli_create.py b/src/lightning_app/cli/lightning_cli_create.py
index d400db4b6f337..c9cea2a5676f9 100644
--- a/src/lightning_app/cli/lightning_cli_create.py
+++ b/src/lightning_app/cli/lightning_cli_create.py
@@ -33,14 +33,14 @@ def create():
help="Instance types that you want to support, for computer jobs within the cluster.",
)
@click.option(
- "--cost-savings",
- "cost_savings",
+ "--enable-performance",
+ "enable_performance",
type=bool,
required=False,
default=False,
is_flag=True,
- help=""""Use this flag to ensure that the cluster is created with a profile that is optimized for cost savings.
- This makes runs cheaper but start-up times may increase.""",
+ help=""""Use this flag to ensure that the cluster is created with a profile that is optimized for performance.
+ This makes runs more expensive but start-up times decrease.""",
)
@click.option(
"--edit-before-creation",
@@ -65,12 +65,12 @@ def create_cluster(
provider: str,
instance_types: str,
edit_before_creation: bool,
- cost_savings: bool,
+ enable_performance: bool,
wait: bool,
**kwargs,
):
"""Create a Lightning AI BYOC compute cluster with your cloud provider credentials."""
- if provider != "aws":
+ if provider.lower() != "aws":
click.echo("Only AWS is supported for now. But support for more providers is coming soon.")
return
cluster_manager = AWSClusterManager()
@@ -81,6 +81,6 @@ def create_cluster(
external_id=external_id,
instance_types=instance_types.split(",") if instance_types is not None else None,
edit_before_creation=edit_before_creation,
- cost_savings=cost_savings,
+ cost_savings=not enable_performance,
wait=wait,
)
diff --git a/tests/tests_app/cli/test_cli.py b/tests/tests_app/cli/test_cli.py
index 8cc5dd50f836e..48e1a26bb6f2b 100644
--- a/tests/tests_app/cli/test_cli.py
+++ b/tests/tests_app/cli/test_cli.py
@@ -71,14 +71,17 @@ def test_main_lightning_cli_help():
@mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock())
@mock.patch("lightning_app.cli.cmd_clusters.AWSClusterManager.create")
@pytest.mark.parametrize(
- "instance_types,expected_instance_types",
+ "extra_arguments,expected_instance_types,expected_cost_savings_mode",
[
- (["--instance-types", "t3.xlarge"], ["t3.xlarge"]),
- (["--instance-types", "t3.xlarge,t3.2xlarge"], ["t3.xlarge", "t3.2xlarge"]),
- ([], None),
+ (["--instance-types", "t3.xlarge"], ["t3.xlarge"], True),
+ (["--instance-types", "t3.xlarge,t3.2xlarge"], ["t3.xlarge", "t3.2xlarge"], True),
+ ([], None, True),
+ (["--enable-performance"], None, False),
],
)
-def test_create_cluster(create_command: mock.MagicMock, instance_types, expected_instance_types):
+def test_create_cluster(
+ create_command: mock.MagicMock, extra_arguments, expected_instance_types, expected_cost_savings_mode
+):
runner = CliRunner()
runner.invoke(
create_cluster,
@@ -91,7 +94,7 @@ def test_create_cluster(create_command: mock.MagicMock, instance_types, expected
"--role-arn",
"arn:aws:iam::1234567890:role/lai-byoc",
]
- + instance_types,
+ + extra_arguments,
)
create_command.assert_called_once_with(
@@ -101,7 +104,7 @@ def test_create_cluster(create_command: mock.MagicMock, instance_types, expected
external_id="dummy",
instance_types=expected_instance_types,
edit_before_creation=False,
- cost_savings=False,
+ cost_savings=expected_cost_savings_mode,
wait=False,
)
From 3b18da3eafa8ece27cde46ad978f765a1390d72d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Thu, 11 Aug 2022 17:49:46 +0200
Subject: [PATCH 55/59] Fix saving hyperparameters in a composition where
parent is not a LM or LDM (#14151)
Co-authored-by: Rohit Gupta
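The fix filters the collected stack frames by class, so constructor arguments are only gathered from frames that belong to the Lightning hierarchy. A rough self-contained sketch of the idea (assumed names, not the actual collect_init_args implementation):

import inspect

class HyperparamCapture:
    """Hypothetical stand-in for HyperparametersMixin."""

    def save_init_args(self) -> dict:
        # start at the frame of the __init__ that called us and walk outwards
        frame = inspect.currentframe().f_back
        collected: dict = {}
        while frame is not None:
            local_vars = frame.f_locals
            cls = local_vars.get("__class__")
            # only frames whose implicit __class__ cell belongs to the capture
            # hierarchy contribute constructor arguments (the real code also
            # stops once it leaves the inheritance path)
            if cls is not None and issubclass(cls, HyperparamCapture):
                collected.update(
                    {k: v for k, v in local_vars.items() if k not in ("self", "__class__")}
                )
            frame = frame.f_back
        return collected

class ChildInComposition(HyperparamCapture):
    def __init__(self, same_arg):
        super().__init__()
        self.hparams = self.save_init_args()

class NotPLSubclass:  # outside the hierarchy: its arguments must not leak in
    def __init__(self, same_arg="parent_default", other_arg="other"):
        super().__init__()  # gives this frame a __class__ cell as well
        self.child = ChildInComposition(same_arg="cocofruit")

assert NotPLSubclass().child.hparams == {"same_arg": "cocofruit"}

Without the issubclass filter, same_arg="parent_default" and other_arg from the outer frame would leak into the child's hyperparameters, which is exactly the bug this patch fixes.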
---
src/pytorch_lightning/CHANGELOG.md | 4 ++++
src/pytorch_lightning/utilities/parsing.py | 17 ++++++++++++-----
tests/tests_pytorch/models/test_hparams.py | 19 +++++++++++++++++++
3 files changed, 35 insertions(+), 5 deletions(-)
diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
index 97bb317b02a14..04db3d1908bb2 100644
--- a/src/pytorch_lightning/CHANGELOG.md
+++ b/src/pytorch_lightning/CHANGELOG.md
@@ -70,6 +70,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061))
+- Fixed saving hyperparameters in a composition where the parent class is not a `LightningModule` or `LightningDataModule` ([#14151](https://github.com/Lightning-AI/lightning/pull/14151))
+
+
+
## [1.7.1] - 2022-08-09
### Fixed
diff --git a/src/pytorch_lightning/utilities/parsing.py b/src/pytorch_lightning/utilities/parsing.py
index 81877f1dffba7..073423ab60773 100644
--- a/src/pytorch_lightning/utilities/parsing.py
+++ b/src/pytorch_lightning/utilities/parsing.py
@@ -162,7 +162,10 @@ def get_init_args(frame: types.FrameType) -> Dict[str, Any]:
def collect_init_args(
- frame: types.FrameType, path_args: List[Dict[str, Any]], inside: bool = False
+ frame: types.FrameType,
+ path_args: List[Dict[str, Any]],
+ inside: bool = False,
+ classes: Tuple[Type, ...] = (),
) -> List[Dict[str, Any]]:
"""Recursively collects the arguments passed to the child constructors in the inheritance tree.
@@ -170,6 +173,7 @@ def collect_init_args(
frame: the current stack frame
path_args: a list of dictionaries containing the constructor args in all parent classes
inside: track if we are inside inheritance path, avoid terminating too soon
+ classes: restrict argument collection to frames belonging to these classes
Return:
A list of dictionaries where each dictionary contains the arguments passed to the
@@ -181,13 +185,13 @@ def collect_init_args(
if not isinstance(frame.f_back, types.FrameType):
return path_args
- if "__class__" in local_vars:
+ if "__class__" in local_vars and (not classes or issubclass(local_vars["__class__"], classes)):
local_args = get_init_args(frame)
# recursive update
path_args.append(local_args)
- return collect_init_args(frame.f_back, path_args, inside=True)
+ return collect_init_args(frame.f_back, path_args, inside=True, classes=classes)
if not inside:
- return collect_init_args(frame.f_back, path_args, inside)
+ return collect_init_args(frame.f_back, path_args, inside, classes=classes)
return path_args
@@ -225,7 +229,10 @@ def save_hyperparameters(
init_args = {f.name: getattr(obj, f.name) for f in fields(obj)}
else:
init_args = {}
- for local_args in collect_init_args(frame, []):
+
+ from pytorch_lightning.core.mixins import HyperparametersMixin
+
+ for local_args in collect_init_args(frame, [], classes=(HyperparametersMixin,)):
init_args.update(local_args)
if ignore is None:
diff --git a/tests/tests_pytorch/models/test_hparams.py b/tests/tests_pytorch/models/test_hparams.py
index c130381c7832d..84311d6f780fb 100644
--- a/tests/tests_pytorch/models/test_hparams.py
+++ b/tests/tests_pytorch/models/test_hparams.py
@@ -29,6 +29,7 @@
from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.core.datamodule import LightningDataModule
+from pytorch_lightning.core.mixins import HyperparametersMixin
from pytorch_lightning.core.saving import load_hparams_from_yaml, save_hparams_to_yaml
from pytorch_lightning.demos.boring_classes import BoringDataModule, BoringModel, RandomDataset
from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE, AttributeDict, is_picklable
@@ -399,6 +400,24 @@ def _raw_checkpoint_path(trainer) -> str:
return raw_checkpoint_path
+@pytest.mark.parametrize("base_class", (HyperparametersMixin, LightningModule, LightningDataModule))
+def test_save_hyperparameters_under_composition(base_class):
+ """Test that in a composition where the parent is not a Lightning-like module, the parent's arguments don't get
+ collected."""
+
+ class ChildInComposition(base_class):
+ def __init__(self, same_arg):
+ super().__init__()
+ self.save_hyperparameters()
+
+ class NotPLSubclass: # intentionally not subclassing LightningModule/LightningDataModule
+ def __init__(self, same_arg="parent_default", other_arg="other"):
+ self.child = ChildInComposition(same_arg="cocofruit")
+
+ parent = NotPLSubclass()
+ assert parent.child.hparams == dict(same_arg="cocofruit")
+
+
class LocalVariableModelSuperLast(BoringModel):
"""This model has the super().__init__() call at the end."""
From 56533368afe14407867dc999a65b799d0f4bd89b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adrian=20W=C3=A4lchli?=
Date: Thu, 11 Aug 2022 18:17:56 +0200
Subject: [PATCH 56/59] Remove DeepSpeed version restriction from Lite (#13967)
---
.azure/gpu-tests.yml | 2 +-
requirements/pytorch/strategies.txt | 2 +-
src/pytorch_lightning/CHANGELOG.md | 3 +++
src/pytorch_lightning/lite/lite.py | 15 ---------------
tests/tests_pytorch/lite/test_lite.py | 13 +------------
5 files changed, 6 insertions(+), 29 deletions(-)
diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml
index 68ba6974a3527..8ae670d265ced 100644
--- a/.azure/gpu-tests.yml
+++ b/.azure/gpu-tests.yml
@@ -75,7 +75,7 @@ jobs:
CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
pip install "bagua-cuda$CUDA_VERSION_MM>=0.9.0"
pip install -e .[strategies]
- pip install deepspeed>0.6.4 # TODO: remove when docker images are upgraded
+ pip install -U deepspeed # TODO: remove when docker images are upgraded
pip install --requirement requirements/pytorch/devel.txt
pip list
env:
diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt
index 4e916fbc6c61f..c5fc92a67a837 100644
--- a/requirements/pytorch/strategies.txt
+++ b/requirements/pytorch/strategies.txt
@@ -2,7 +2,7 @@
# in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment
fairscale>=0.4.5, <=0.4.6
-deepspeed>=0.6.0, <0.7.0
+deepspeed>=0.6.0, <=0.7.0
# no need to install with [pytorch] as pytorch is already installed
horovod>=0.21.2, !=0.24.0, <0.25.1
hivemind>=1.0.1, <=1.0.1; sys_platform == 'linux'
diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md
index 04db3d1908bb2..6d67d2d58643a 100644
--- a/src/pytorch_lightning/CHANGELOG.md
+++ b/src/pytorch_lightning/CHANGELOG.md
@@ -22,6 +22,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
- Raised a `MisconfigurationException` if batch transfer hooks are overridden with `IPUAccelerator` ([13961](https://github.com/Lightning-AI/lightning/pull/13961))
+- Updated compatibility for LightningLite to run with the latest DeepSpeed 0.7.0 ([#13967](https://github.com/Lightning-AI/lightning/pull/13967))
+
+
### Deprecated
- Deprecated `LightningDeepSpeedModule` ([#14000](https://github.com/Lightning-AI/lightning/pull/14000))
diff --git a/src/pytorch_lightning/lite/lite.py b/src/pytorch_lightning/lite/lite.py
index 5125bf4486a9d..981eed30635f6 100644
--- a/src/pytorch_lightning/lite/lite.py
+++ b/src/pytorch_lightning/lite/lite.py
@@ -40,7 +40,6 @@
has_iterable_dataset,
)
from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from pytorch_lightning.utilities.imports import _RequirementAvailable
from pytorch_lightning.utilities.seed import seed_everything
@@ -106,8 +105,6 @@ def __init__(
self._precision_plugin = self._strategy.precision_plugin
self._models_setup: int = 0
- self._check_deepspeed_support()
-
# wrap the run method so we can inject setup logic or spawn processes for the user
setattr(self, "run", partial(self._run_impl, self.run))
@@ -459,18 +456,6 @@ def _check_strategy_support(self, strategy: Optional[Union[str, Strategy]]) -> N
f" Choose one of {supported} or pass in a `Strategy` instance."
)
- def _check_deepspeed_support(self) -> None:
- if (
- isinstance(self._strategy, DeepSpeedStrategy)
- and self._strategy.zero_stage_3
- and _RequirementAvailable("deepspeed>=0.6.5")
- ):
- # https://github.com/microsoft/DeepSpeed/issues/2139
- raise RuntimeError(
- "DeepSpeed ZeRO-3 is not supported with this version of Lightning Lite and `deepspeed>=0.6.5`."
- " Please downgrade deepspeed to 0.6.4 or check if a newer version of Lightning is available."
- )
-
@staticmethod
def _supported_device_types() -> Sequence[_AcceleratorType]:
return (
diff --git a/tests/tests_pytorch/lite/test_lite.py b/tests/tests_pytorch/lite/test_lite.py
index 2215ab3129780..86a0a5a82195a 100644
--- a/tests/tests_pytorch/lite/test_lite.py
+++ b/tests/tests_pytorch/lite/test_lite.py
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-import contextlib
import os
from copy import deepcopy
from unittest import mock
@@ -30,7 +29,6 @@
from pytorch_lightning.strategies import DeepSpeedStrategy, Strategy
from pytorch_lightning.utilities import _StrategyType
from pytorch_lightning.utilities.exceptions import MisconfigurationException
-from pytorch_lightning.utilities.imports import _RequirementAvailable
from pytorch_lightning.utilities.seed import pl_worker_init_function
from tests_pytorch.helpers.runif import RunIf
@@ -480,13 +478,4 @@ def run(self):
assert self.broadcast(True)
assert self.is_global_zero == (self.local_rank == 0)
- if _RequirementAvailable("deepspeed>=0.6.5"):
- # https://github.com/microsoft/DeepSpeed/issues/2139
- raise_if_deepspeed_incompatible = pytest.raises(
- RuntimeError, match="DeepSpeed ZeRO-3 is not supported with this version of Lightning Lite"
- )
- else:
- raise_if_deepspeed_incompatible = contextlib.suppress()
-
- with raise_if_deepspeed_incompatible:
- Lite(strategy=DeepSpeedStrategy(stage=3, logging_batch_size_per_gpu=1), devices=2, accelerator="gpu").run()
+ Lite(strategy=DeepSpeedStrategy(stage=3, logging_batch_size_per_gpu=1), devices=2, accelerator="gpu").run()
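For context, the removed guard was built on a requirement check; the general pattern can be sketched with importlib.metadata and packaging (an illustrative stand-in, not Lightning's actual _RequirementAvailable):

from importlib.metadata import PackageNotFoundError, version

from packaging.requirements import Requirement

def requirement_available(req_str: str) -> bool:
    # True only if the package is installed and its version satisfies the specifier
    req = Requirement(req_str)
    try:
        installed = version(req.name)
    except PackageNotFoundError:
        return False
    return req.specifier.contains(installed, prereleases=True)

if requirement_available("deepspeed>=0.6.5"):
    print("deepspeed>=0.6.5 is installed")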
From d0f82abe35c271247d58da35442719e01a54604c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Carlos=20Mochol=C3=AD?=
Date: Thu, 11 Aug 2022 18:55:01 +0200
Subject: [PATCH 57/59] Configure the check-group app (#14165)
Co-authored-by: Jirka
---
.github/checkgroup.yml | 165 +++++++++++++++++++++++++++++++++++++++++
1 file changed, 165 insertions(+)
create mode 100644 .github/checkgroup.yml
diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml
new file mode 100644
index 0000000000000..8f1d3c6fb5e86
--- /dev/null
+++ b/.github/checkgroup.yml
@@ -0,0 +1,165 @@
+custom_service_name: "Lightning CI required checker"
+subprojects:
+ - id: "CI: CircleCI"
+ paths:
+ - ".circleci/**"
+ checks:
+ - "test-on-tpus"
+
+ - id: "CI: Azure"
+ paths:
+ - ".azure/**"
+ checks:
+ - "pytorch-lightning (GPUs)"
+ - "pytorch-lightning (GPUs) (testing PyTorch - stable)"
+ - "pytorch-lightning (HPUs)"
+ - "pytorch-lightning (IPUs)"
+
+ - id: "pytorch_lightning"
+ paths:
+ # not all examples need to be listed here, since most aren't used in CI, but these are
+ - "examples/run_ddp_examples.sh"
+ - "examples/convert_from_pt_to_pl/**"
+ - "examples/run_pl_examples.sh"
+ - "examples/pl_basics/backbone_image_classifier.py"
+ - "examples/pl_basics/autoencoder.py"
+ - "examples/pl_loops/mnist_lite.py"
+ - "examples/pl_fault_tolerant/automatic.py"
+ - "examples/test_pl_examples.py"
+ - "examples/pl_integrations/dali_image_classifier.py"
+ - "requirements/pytorch/**"
+ - "src/pytorch_lightning/**"
+ - "tests/tests_pytorch/**"
+ - "setup.cfg" # includes pytest config
+ - ".github/workflows/ci-pytorch*.yml"
+ - ".github/workflows/docs-*.yml"
+ checks:
+ - "conda (3.8, 1.10)"
+ - "conda (3.8, 1.9)"
+ - "conda (3.9, 1.11)"
+ - "conda (3.9, 1.12)"
+ - "cpu (macOS-11, 3.10, latest, stable)"
+ - "cpu (macOS-11, 3.7, latest, stable)"
+ - "cpu (macOS-11, 3.7, oldest, stable)"
+ - "cpu (ubuntu-20.04, 3.10, latest, stable)"
+ - "cpu (ubuntu-20.04, 3.7, latest, stable)"
+ - "cpu (ubuntu-20.04, 3.7, oldest, stable)"
+ - "cpu (windows-2022, 3.10, latest, stable)"
+ - "cpu (windows-2022, 3.7, latest, stable)"
+ - "cpu (windows-2022, 3.7, oldest, stable)"
+ - "doctest (pytorch)"
+ - "make-docs (pytorch)"
+ - "mypy"
+ - "PR Gatekeeper (pytorch)"
+ - "pytorch-lightning (GPUs)"
+ - "pytorch-lightning (GPUs) (testing PyTorch - stable)"
+ - "pytorch-lightning (HPUs)"
+ - "pytorch-lightning (IPUs)"
+ - "slow (macOS-11, 3.7, 1.11)"
+ - "slow (ubuntu-20.04, 3.7, 1.11)"
+ - "slow (windows-2022, 3.7, 1.11)"
+ - "test-on-tpus"
+
+ - id: "pytorch_lightning: Docs"
+ paths:
+ - "docs/source-pytorch/**"
+ - ".github/workflows/docs-*.yml"
+ - "requirements/pytorch/**"
+ checks:
+ - "doctest (pytorch)"
+ - "make-docs (pytorch)"
+
+ - id: "pytorch_lightning: Docker"
+ paths:
+ - "dockers/**"
+ checks:
+ - "build-conda (3.8, 1.10)"
+ - "build-conda (3.8, 1.9)"
+ - "build-conda (3.9, 1.11)"
+ - "build-conda (3.9, 1.12)"
+ - "build-cuda (3.8, 1.9, 11.1.1)"
+ - "build-cuda (3.9, 1.10, 11.3.1)"
+ - "build-cuda (3.9, 1.11, 11.3.1)"
+ - "build-cuda (3.9, 1.12, 11.3.1)"
+ - "build-cuda (3.9, 1.9, 11.1.1)"
+ - "build-hpu (1.5.0, 1.11.0)"
+ - "build-ipu (3.9, 1.9)"
+ - "build-NGC"
+ - "build-pl (3.9, 1.10, 11.3.1)"
+ - "build-pl (3.9, 1.11, 11.3.1)"
+ - "build-pl (3.9, 1.12, 11.3.1)"
+ - "build-pl (3.9, 1.9, 11.1.1)"
+ - "build-xla (3.7, 1.12)"
+
+ - id: "pytorch_lightning: mypy"
+ paths:
+ - ".github/workflows/code-checks.yml"
+ - "pyproject.toml" # includes mypy config
+ checks:
+ - "mypy"
+
+ - id: "lightning_app"
+ paths:
+ - ".github/workflows/ci-app*.yml"
+ - "examples/app_**"
+ - "requirements/app/**"
+ - "src/lightning_app/**"
+ - "tests/tests_app/**"
+ - "tests/tests_app_examples/**"
+ - "tests/tests_clusters/**"
+ # the examples are used in the app CI
+ - "examples/app_*"
+ checks:
+ - "Cloud Test (boring_app)"
+ - "Cloud Test (collect_failures)"
+ - "Cloud Test (commands_and_api)"
+ - "Cloud Test (custom_work_dependencies)"
+ - "Cloud Test (drive)"
+ - "Cloud Test (idle_timeout)"
+ - "Cloud Test (payload)"
+ - "Cloud Test (template_jupyterlab)"
+ - "Cloud Test (template_react_ui)"
+ - "Cloud Test (template_streamlit_ui)"
+ - "Cloud Test (v0_app)"
+ - "doctest (app)"
+ - "make-docs (app)"
+ - "pytest (macOS-11, 3.8, latest)"
+ - "pytest (macOS-11, 3.8, oldest)"
+ - "pytest (ubuntu-20.04, 3.8, latest)"
+ - "pytest (ubuntu-20.04, 3.8, oldest)"
+ - "pytest (windows-2022, 3.8, latest)"
+ - "pytest (windows-2022, 3.8, oldest)"
+
+ - id: "lightning_app: Docs"
+ paths:
+ - "docs/source-app/**"
+ - ".github/workflows/docs-*.yml"
+ - "requirements/app/**"
+ checks:
+ - "doctest (app)"
+ - "make-docs (app)"
+
+ - id: "install"
+ paths:
+ - ".actions/setup_tools.py"
+ - ".github/workflows/ci-pkg-install.yml"
+ - "setup.py"
+ - "src/lightning/**"
+ # all __about__, __version__, __setup__
+ - "src/*/__*.py"
+ checks:
+ - "install-meta-pypi (macOS-11, 3.8)"
+ - "install-meta-pypi (ubuntu-20.04, 3.8)"
+ - "install-meta-pypi (windows-2022, 3.8)"
+ - "install-meta-src (macOS-11, 3.8)"
+ - "install-meta-src (macOS-11, lightning, 3.8)"
+ - "install-meta-src (ubuntu-20.04, 3.8)"
+ - "install-meta-src (ubuntu-20.04, lightning, 3.8)"
+ - "install-meta-src (windows-2022, 3.8)"
+ - "install-meta-src (windows-2022, lightning, 3.8)"
+ - "install-standalone (macOS-11, app, 3.8)"
+ - "install-standalone (macOS-11, pytorch, 3.8)"
+ - "install-standalone (ubuntu-20.04, app, 3.8)"
+ - "install-standalone (ubuntu-20.04, pytorch, 3.8)"
+ - "install-standalone (windows-2022, app, 3.8)"
+ - "install-standalone (windows-2022, pytorch, 3.8)"
From 31ecf9bfac32e226eb670e743c79dbceb4f88345 Mon Sep 17 00:00:00 2001
From: Raphael Randschau
Date: Thu, 11 Aug 2022 11:34:24 -0700
Subject: [PATCH 58/59] [CLI] adjust command description (#14130)
* adjust CLI copy
Co-authored-by: RobertLaurella <99420295+RobertLaurella@users.noreply.github.com>
---
src/lightning_app/cli/lightning_cli.py | 10 +++++-----
src/lightning_app/cli/lightning_cli_create.py | 2 +-
src/lightning_app/cli/lightning_cli_delete.py | 2 +-
src/lightning_app/cli/lightning_cli_list.py | 2 +-
4 files changed, 8 insertions(+), 8 deletions(-)
diff --git a/src/lightning_app/cli/lightning_cli.py b/src/lightning_app/cli/lightning_cli.py
index 6a6e41df57026..81d2a773b4619 100644
--- a/src/lightning_app/cli/lightning_cli.py
+++ b/src/lightning_app/cli/lightning_cli.py
@@ -143,7 +143,7 @@ def logs(app_name: str, components: List[str], follow: bool) -> None:
@_main.command()
def login():
- """Log in to your Lightning.ai account."""
+ """Log in to your lightning.ai account."""
auth = Auth()
auth.clear()
@@ -156,7 +156,7 @@ def login():
@_main.command()
def logout():
- """Log out of your Lightning.ai account."""
+ """Log out of your lightning.ai account."""
Auth().clear()
@@ -215,7 +215,7 @@ def on_before_run(*args):
@_main.group()
def run():
- """Run your application."""
+ """Run a Lightning application locally or on the cloud."""
@run.command("app")
@@ -321,7 +321,7 @@ def stop():
@_main.group()
def install():
- """Install Lightning apps and components."""
+ """Install a Lightning App and/or component."""
@install.command("app")
@@ -379,7 +379,7 @@ def install_component(name, yes, version):
@_main.group()
def init():
- """Init a Lightning app and component."""
+ """Init a Lightning App and/or component."""
@init.command("app")
diff --git a/src/lightning_app/cli/lightning_cli_create.py b/src/lightning_app/cli/lightning_cli_create.py
index c9cea2a5676f9..7e9a6b9d2143b 100644
--- a/src/lightning_app/cli/lightning_cli_create.py
+++ b/src/lightning_app/cli/lightning_cli_create.py
@@ -5,7 +5,7 @@
@click.group("create")
def create():
- """Create Lightning AI BYOC managed resources."""
+ """Create Lightning AI self-managed resources (clusters, etc…)"""
pass
diff --git a/src/lightning_app/cli/lightning_cli_delete.py b/src/lightning_app/cli/lightning_cli_delete.py
index c304b130bdf5d..366f4aa01e995 100644
--- a/src/lightning_app/cli/lightning_cli_delete.py
+++ b/src/lightning_app/cli/lightning_cli_delete.py
@@ -5,7 +5,7 @@
@click.group("delete")
def delete():
- """Delete Lightning AI BYOC managed resources."""
+ """Delete Lightning AI self-managed resources (clusters, etc…)"""
pass
diff --git a/src/lightning_app/cli/lightning_cli_list.py b/src/lightning_app/cli/lightning_cli_list.py
index d0d1d34a6dd4d..7d38b5b57760f 100644
--- a/src/lightning_app/cli/lightning_cli_list.py
+++ b/src/lightning_app/cli/lightning_cli_list.py
@@ -6,7 +6,7 @@
@click.group(name="list")
def get_list():
- """List your Lightning AI BYOC managed resources."""
+ """List Lightning AI self-managed resources (clusters, etc…)"""
pass
From e53c4e8e6c14c92968df9bed8861e578bfe731aa Mon Sep 17 00:00:00 2001
From: Krishna Kalyan
Date: Thu, 11 Aug 2022 22:10:05 +0100
Subject: [PATCH 59/59] Fix mypy errors attributed to
 `pytorch_lightning.strategies.sharded_spawn` (#14102)
Co-authored-by: rohitgr7
Co-authored-by: Jirka Borovec
Co-authored-by: awaelchli
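Most of these fixes rely on assert-based narrowing: a runtime assert convinces mypy that an Optional (or a union) has been resolved to a concrete type. A tiny illustration with assumed names:

from typing import Optional

class Module:
    def training_step(self) -> str:
        return "step"

def run(module: Optional[Module]) -> str:
    # without the assert mypy reports: Item "None" of "Optional[Module]"
    # has no attribute "training_step"; the assert narrows the type
    assert module is not None
    return module.training_step()

print(run(Module()))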
---
pyproject.toml | 1 -
src/pytorch_lightning/overrides/base.py | 1 +
src/pytorch_lightning/strategies/sharded_spawn.py | 14 +++++++++-----
3 files changed, 10 insertions(+), 6 deletions(-)
diff --git a/pyproject.toml b/pyproject.toml
index b5e806bc69900..9f7cc28d0b002 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -58,7 +58,6 @@ module = [
"pytorch_lightning.profilers.base",
"pytorch_lightning.profilers.pytorch",
"pytorch_lightning.strategies.sharded",
- "pytorch_lightning.strategies.sharded_spawn",
"pytorch_lightning.trainer.callback_hook",
"pytorch_lightning.trainer.connectors.data_connector",
"pytorch_lightning.trainer.supporters",
diff --git a/src/pytorch_lightning/overrides/base.py b/src/pytorch_lightning/overrides/base.py
index 26c2837bda7e3..3e9fda2f966f5 100644
--- a/src/pytorch_lightning/overrides/base.py
+++ b/src/pytorch_lightning/overrides/base.py
@@ -75,6 +75,7 @@ def forward(self, *inputs: Any, **kwargs: Any) -> Any:
trainer = pl_module._trainer
if trainer is not None:
+ assert isinstance(self.module, (pl.LightningModule, _LightningPrecisionModuleWrapperBase))
if trainer.training:
output = self.module.training_step(*inputs, **kwargs)
# In manual_optimization, we need to prevent DDP reducer as
diff --git a/src/pytorch_lightning/strategies/sharded_spawn.py b/src/pytorch_lightning/strategies/sharded_spawn.py
index 4550e397ded80..882302e101cb6 100644
--- a/src/pytorch_lightning/strategies/sharded_spawn.py
+++ b/src/pytorch_lightning/strategies/sharded_spawn.py
@@ -12,13 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from contextlib import contextmanager
-from typing import Dict, Generator, List, Optional, Tuple
+from typing import Any, Dict, Generator, List, Optional, Tuple
from torch import Tensor
from torch.nn import Module
from torch.optim import Optimizer
import pytorch_lightning as pl
+from pytorch_lightning.overrides.base import _LightningPrecisionModuleWrapperBase
from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE
from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy
from pytorch_lightning.trainer.states import TrainerFn
@@ -42,7 +43,9 @@ class DDPSpawnShardedStrategy(DDPSpawnStrategy):
def configure_ddp(self) -> None:
# set up optimizers after the wrapped module has been moved to the device
+ assert self.lightning_module is not None
self.setup_optimizers(self.lightning_module.trainer)
+ assert isinstance(self.model, (pl.LightningModule, _LightningPrecisionModuleWrapperBase))
self.model, self.optimizers = self._setup_model_and_optimizers(
model=LightningShardedDataParallel(self.model), optimizers=self.optimizers
)
@@ -69,12 +72,13 @@ def _reinit_optimizers_with_oss(self, optimizers: List[Optimizer]) -> List["OSS"
return optimizers
def _wrap_optimizers(self, optimizers: List[Optimizer]) -> List["OSS"]:
- if self.model is not None and self.model.trainer.state.fn != TrainerFn.FITTING:
+ assert self.lightning_module
+ if self.model is not None and self.lightning_module.trainer.state.fn != TrainerFn.FITTING:
return optimizers
return self._reinit_optimizers_with_oss(optimizers)
- def optimizer_state(self, optimizer: "OSS") -> Optional[dict]:
+ def optimizer_state(self, optimizer: "OSS") -> Dict[str, Any]:
if isinstance(optimizer, OSS):
optimizer.consolidate_state_dict()
return self._optim_state_dict(optimizer)
@@ -93,7 +97,7 @@ def block_backward_sync(self) -> Generator:
yield None
@rank_zero_only
- def _optim_state_dict(self, optimizer):
+ def _optim_state_dict(self, optimizer: Optimizer) -> Dict[str, Any]:
"""
Retrieves state dict only on rank 0, which contains the entire optimizer state after calling
:meth:`consolidate_state_dict`.
@@ -112,7 +116,7 @@ def lightning_module(self) -> Optional["pl.LightningModule"]:
def pre_backward(self, closure_loss: Tensor) -> None:
pass
- def post_training_step(self):
+ def post_training_step(self) -> None:
pass
@classmethod