From 1b31039c5870daa8d40016f9dd2374154099ca59 Mon Sep 17 00:00:00 2001 From: Mauricio Villegas Date: Mon, 25 Jul 2022 11:25:42 +0200 Subject: [PATCH 001/230] Update LightningCLI test for new support in latest release of jsonargparse (#13805) --- requirements/pytorch/extra.txt | 2 +- src/pytorch_lightning/cli.py | 2 +- tests/tests_pytorch/test_cli.py | 45 ++++++++++++++++++++++++++------- 3 files changed, 38 insertions(+), 11 deletions(-) diff --git a/requirements/pytorch/extra.txt b/requirements/pytorch/extra.txt index 440a80594d5d9..90571dd8cab91 100644 --- a/requirements/pytorch/extra.txt +++ b/requirements/pytorch/extra.txt @@ -3,6 +3,6 @@ matplotlib>3.1, <3.5.3 torchtext>=0.10.*, <=0.12.0 omegaconf>=2.0.5, <2.3.0 hydra-core>=1.0.5, <1.3.0 -jsonargparse[signatures]>=4.10.2, <=4.10.2 +jsonargparse[signatures]>=4.12.0, <=4.12.0 gcsfs>=2021.5.0, <2022.6.0 rich>=10.14.0, !=10.15.0.a, <13.0.0 diff --git a/src/pytorch_lightning/cli.py b/src/pytorch_lightning/cli.py index 169f16b66cd33..d3990d79c5c88 100644 --- a/src/pytorch_lightning/cli.py +++ b/src/pytorch_lightning/cli.py @@ -27,7 +27,7 @@ from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.rank_zero import _warn, rank_zero_deprecation, rank_zero_warn -_JSONARGPARSE_SIGNATURES_AVAILABLE = _RequirementAvailable("jsonargparse[signatures]>=4.10.2") +_JSONARGPARSE_SIGNATURES_AVAILABLE = _RequirementAvailable("jsonargparse[signatures]>=4.12.0") if _JSONARGPARSE_SIGNATURES_AVAILABLE: import docstring_parser diff --git a/tests/tests_pytorch/test_cli.py b/tests/tests_pytorch/test_cli.py index 790a9fa14fb0d..965f53a86d4b7 100644 --- a/tests/tests_pytorch/test_cli.py +++ b/tests/tests_pytorch/test_cli.py @@ -1414,21 +1414,47 @@ def _test_logger_init_args(logger_name, init, unresolved={}): @pytest.mark.skipif(not _COMET_AVAILABLE, reason="comet-ml is required") def test_comet_logger_init_args(): - _test_logger_init_args("CometLogger", {"save_dir": "comet", "workspace": "comet"}) + _test_logger_init_args( + "CometLogger", + { + "save_dir": "comet", # Resolve from CometLogger.__init__ + "workspace": "comet", # Resolve from Comet{,Existing,Offline}Experiment.__init__ + }, + ) @pytest.mark.skipif(not _NEPTUNE_AVAILABLE, reason="neptune-client is required") def test_neptune_logger_init_args(): - _test_logger_init_args("NeptuneLogger", {"name": "neptune"}, {"description": "neptune"}) + _test_logger_init_args( + "NeptuneLogger", + { + "name": "neptune", # Resolve from NeptuneLogger.__init__ + }, + { + "description": "neptune", # Unsupported resolving from neptune.new.internal.init.run.init_run + }, + ) def test_tensorboard_logger_init_args(): - _test_logger_init_args("TensorBoardLogger", {"save_dir": "tb", "name": "tb"}) + _test_logger_init_args( + "TensorBoardLogger", + { + "save_dir": "tb", # Resolve from TensorBoardLogger.__init__ + "comment": "tb", # Resolve from tensorboard.writer.SummaryWriter.__init__ + }, + ) @pytest.mark.skipif(not _WANDB_AVAILABLE, reason="wandb is required") def test_wandb_logger_init_args(): - _test_logger_init_args("WandbLogger", {"save_dir": "wandb", "notes": "wandb"}) + _test_logger_init_args( + "WandbLogger", + { + "save_dir": "wandb", # Resolve from WandbLogger.__init__ + "notes": "wandb", # Resolve from wandb.sdk.wandb_init.init + }, + ) def test_cli_auto_seeding(): @@ -1512,13 +1538,13 @@ def test_pytorch_profiler_init_args(): from pytorch_lightning.profilers import Profiler, PyTorchProfiler init = { - "dirpath": "profiler", - "row_limit": 10, - "group_by_input_shapes": 
True, + "dirpath": "profiler", # Resolve from PyTorchProfiler.__init__ + "row_limit": 10, # Resolve from PyTorchProfiler.__init__ + "group_by_input_shapes": True, # Resolve from PyTorchProfiler.__init__ } unresolved = { - "profile_memory": True, - "record_shapes": True, + "profile_memory": True, # Not possible to resolve parameters from dynamically chosen Type[_PROFILER] + "record_shapes": True, # Resolve from PyTorchProfiler.__init__, gets moved to init_args } cli_args = ["--trainer.profiler=PyTorchProfiler"] cli_args += [f"--trainer.profiler.{k}={v}" for k, v in init.items()] @@ -1528,5 +1554,6 @@ def test_pytorch_profiler_init_args(): cli = LightningCLI(TestModel, run=False) assert isinstance(cli.config_init.trainer.profiler, PyTorchProfiler) + init["record_shapes"] = unresolved.pop("record_shapes") # Test move to init_args assert {k: cli.config.trainer.profiler.init_args[k] for k in init} == init assert cli.config.trainer.profiler.dict_kwargs == unresolved From 4e9fbdbe4013458ea33256f4e0b00cf0782e6064 Mon Sep 17 00:00:00 2001 From: Justin Goheen <26209687+JustinGoheen@users.noreply.github.com> Date: Mon, 25 Jul 2022 06:28:19 -0400 Subject: [PATCH 002/230] Fix mypy errors attributed to `pytorch_lightning.loggers.comet` (#13689) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli Co-authored-by: Carlos Mocholí Co-authored-by: otaj --- pyproject.toml | 1 - src/pytorch_lightning/loggers/comet.py | 32 ++++++++++---------- src/pytorch_lightning/loggers/csv_logs.py | 2 +- src/pytorch_lightning/loggers/mlflow.py | 4 +-- src/pytorch_lightning/loggers/tensorboard.py | 2 +- src/pytorch_lightning/loggers/wandb.py | 2 +- src/pytorch_lightning/utilities/logger.py | 6 ++-- 7 files changed, 25 insertions(+), 24 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a0960c58f6e6d..177410cba79a6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,7 +57,6 @@ module = [ "pytorch_lightning.core.saving", "pytorch_lightning.demos.boring_classes", "pytorch_lightning.demos.mnist_datamodule", - "pytorch_lightning.loggers.comet", "pytorch_lightning.loggers.neptune", "pytorch_lightning.profilers.base", "pytorch_lightning.profilers.pytorch", diff --git a/src/pytorch_lightning/loggers/comet.py b/src/pytorch_lightning/loggers/comet.py index 2b853f59259ff..363d47c1166e6 100644 --- a/src/pytorch_lightning/loggers/comet.py +++ b/src/pytorch_lightning/loggers/comet.py @@ -21,7 +21,7 @@ from argparse import Namespace from typing import Any, Callable, Dict, Mapping, Optional, Sequence, Union -from torch import is_tensor, Tensor +from torch import Tensor import pytorch_lightning as pl from pytorch_lightning.loggers.logger import Logger, rank_zero_experiment @@ -141,7 +141,7 @@ def __init__( prefix: str = "", agg_key_funcs: Optional[Mapping[str, Callable[[Sequence[float]], float]]] = None, agg_default_func: Optional[Callable[[Sequence[float]], float]] = None, - **kwargs, + **kwargs: Any, ): if comet_ml is None: raise ModuleNotFoundError( @@ -149,6 +149,8 @@ def __init__( ) super().__init__(agg_key_funcs=agg_key_funcs, agg_default_func=agg_default_func) self._experiment = None + self._save_dir: Optional[str] + self.rest_api_key: Optional[str] # Determine online or offline mode based on which arguments were passed to CometLogger api_key = api_key or comet_ml.config.get_api_key(None, comet_ml.config.get_config()) @@ -170,12 +172,12 @@ def __init__( log.info(f"CometLogger will be initialized in {self.mode} mode") - self._project_name = 
project_name - self._experiment_key = experiment_key - self._experiment_name = experiment_name - self._prefix = prefix - self._kwargs = kwargs - self._future_experiment_key = None + self._project_name: Optional[str] = project_name + self._experiment_key: Optional[str] = experiment_key + self._experiment_name: Optional[str] = experiment_name + self._prefix: str = prefix + self._kwargs: Any = kwargs + self._future_experiment_key: Optional[str] = None if rest_api_key is not None: # Comet.ml rest API, used to determine version number @@ -185,9 +187,7 @@ def __init__( self.rest_api_key = None self.comet_api = None - self._kwargs = kwargs - - @property + @property # type: ignore[misc] @rank_zero_experiment def experiment(self) -> Union[CometExperiment, CometExistingExperiment, CometOfflineExperiment]: r""" @@ -240,19 +240,19 @@ def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: self.experiment.log_parameters(params) @rank_zero_only - def log_metrics(self, metrics: Dict[str, Union[Tensor, float]], step: Optional[int] = None) -> None: + def log_metrics(self, metrics: Mapping[str, Union[Tensor, float]], step: Optional[int] = None) -> None: assert rank_zero_only.rank == 0, "experiment tried to log from global_rank != 0" # Comet.ml expects metrics to be a dictionary of detached tensors on CPU metrics_without_epoch = metrics.copy() for key, val in metrics_without_epoch.items(): - if is_tensor(val): + if isinstance(val, Tensor): metrics_without_epoch[key] = val.cpu().detach() epoch = metrics_without_epoch.pop("epoch", None) metrics_without_epoch = _add_prefix(metrics_without_epoch, self._prefix, self.LOGGER_JOIN_CHAR) self.experiment.log_metrics(metrics_without_epoch, step=step, epoch=epoch) - def reset_experiment(self): + def reset_experiment(self) -> None: self._experiment = None @rank_zero_only @@ -326,7 +326,7 @@ def version(self) -> str: return self._future_experiment_key - def __getstate__(self): + def __getstate__(self) -> Dict[str, Any]: state = self.__dict__.copy() # Save the experiment id in case an experiment object already exists, @@ -340,6 +340,6 @@ def __getstate__(self): state["_experiment"] = None return state - def log_graph(self, model: "pl.LightningModule", input_array=None) -> None: + def log_graph(self, model: "pl.LightningModule", input_array: Optional[Tensor] = None) -> None: if self._experiment is not None: self._experiment.set_model_graph(model) diff --git a/src/pytorch_lightning/loggers/csv_logs.py b/src/pytorch_lightning/loggers/csv_logs.py index 72d21ae2c4974..45d5fffb51e33 100644 --- a/src/pytorch_lightning/loggers/csv_logs.py +++ b/src/pytorch_lightning/loggers/csv_logs.py @@ -195,7 +195,7 @@ def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: self.experiment.log_hparams(params) @rank_zero_only - def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: + def log_metrics(self, metrics: Dict[str, Union[Tensor, float]], step: Optional[int] = None) -> None: metrics = _add_prefix(metrics, self._prefix, self.LOGGER_JOIN_CHAR) self.experiment.log_metrics(metrics, step) if step is not None and (step + 1) % self._flush_logs_every_n_steps == 0: diff --git a/src/pytorch_lightning/loggers/mlflow.py b/src/pytorch_lightning/loggers/mlflow.py index 313fcfe07f10e..5675a3bd9fc67 100644 --- a/src/pytorch_lightning/loggers/mlflow.py +++ b/src/pytorch_lightning/loggers/mlflow.py @@ -20,7 +20,7 @@ import re from argparse import Namespace from time import time -from typing import Any, Dict, Optional, Union +from 
typing import Any, Dict, Mapping, Optional, Union from pytorch_lightning.loggers.logger import Logger, rank_zero_experiment from pytorch_lightning.utilities.imports import _module_available @@ -230,7 +230,7 @@ def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: self.experiment.log_param(self.run_id, k, v) @rank_zero_only - def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: + def log_metrics(self, metrics: Mapping[str, float], step: Optional[int] = None) -> None: assert rank_zero_only.rank == 0, "experiment tried to log from global_rank != 0" metrics = _add_prefix(metrics, self._prefix, self.LOGGER_JOIN_CHAR) diff --git a/src/pytorch_lightning/loggers/tensorboard.py b/src/pytorch_lightning/loggers/tensorboard.py index 12ec2e21b84ce..dacecf129523b 100644 --- a/src/pytorch_lightning/loggers/tensorboard.py +++ b/src/pytorch_lightning/loggers/tensorboard.py @@ -216,7 +216,7 @@ def log_hyperparams( writer.add_summary(sei) @rank_zero_only - def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: + def log_metrics(self, metrics: Mapping[str, float], step: Optional[int] = None) -> None: assert rank_zero_only.rank == 0, "experiment tried to log from global_rank != 0" metrics = _add_prefix(metrics, self._prefix, self.LOGGER_JOIN_CHAR) diff --git a/src/pytorch_lightning/loggers/wandb.py b/src/pytorch_lightning/loggers/wandb.py index bc2a84dc82b00..8e30827759b99 100644 --- a/src/pytorch_lightning/loggers/wandb.py +++ b/src/pytorch_lightning/loggers/wandb.py @@ -379,7 +379,7 @@ def log_hyperparams(self, params: Union[Dict[str, Any], Namespace]) -> None: self.experiment.config.update(params, allow_val_change=True) @rank_zero_only - def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: + def log_metrics(self, metrics: Mapping[str, float], step: Optional[int] = None) -> None: assert rank_zero_only.rank == 0, "experiment tried to log from global_rank != 0" metrics = _add_prefix(metrics, self._prefix, self.LOGGER_JOIN_CHAR) diff --git a/src/pytorch_lightning/utilities/logger.py b/src/pytorch_lightning/utilities/logger.py index 07ecf4c3c0ca0..24d75e4f41034 100644 --- a/src/pytorch_lightning/utilities/logger.py +++ b/src/pytorch_lightning/utilities/logger.py @@ -14,7 +14,7 @@ """Utilities for loggers.""" from argparse import Namespace -from typing import Any, Dict, Generator, List, MutableMapping, Optional, Union +from typing import Any, Dict, Generator, List, Mapping, MutableMapping, Optional, Union import numpy as np import torch @@ -132,7 +132,9 @@ def _sanitize_params(params: Dict[str, Any]) -> Dict[str, Any]: return params -def _add_prefix(metrics: Dict[str, float], prefix: str, separator: str) -> Dict[str, float]: +def _add_prefix( + metrics: Mapping[str, Union[Tensor, float]], prefix: str, separator: str +) -> Mapping[str, Union[Tensor, float]]: """Insert prefix before each key in a dict, separated by the separator. 
Args: From 11f8fa2ca6ffed5d1790acf06f778d3bd3dc9f1b Mon Sep 17 00:00:00 2001 From: Ha YongWook Date: Mon, 25 Jul 2022 15:32:49 +0200 Subject: [PATCH 003/230] Fix typos in Checkpointing doc (#13827) Co-authored-by: awaelchli --- .../common/checkpointing_intermediate.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source-pytorch/common/checkpointing_intermediate.rst b/docs/source-pytorch/common/checkpointing_intermediate.rst index 2bcbfff7c9e4e..f6dc1c955880f 100644 --- a/docs/source-pytorch/common/checkpointing_intermediate.rst +++ b/docs/source-pytorch/common/checkpointing_intermediate.rst @@ -12,7 +12,7 @@ Checkpointing (intermediate) ***************************** Modify checkpointing behavior ***************************** -For fine-grain control over checkpointing behavior, use the :class:`~pytorch_lightning.callbacks.ModelCheckpoint` object +For fine-grained control over checkpointing behavior, use the :class:`~pytorch_lightning.callbacks.ModelCheckpoint` object .. code-block:: python @@ -45,10 +45,10 @@ To save checkpoints based on a (*when/which/what/where*) condition (for example When ==== -- When using iterative training which doesn't have an epoch, you can checkpoint at every ``N`` training steps by specifying ``every_n_training_steps=N``. -- You can also control the interval of epochs between checkpoints using ``every_n_epochs`` between checkpoints, to avoid slowdowns. -- You can checkpoint at a regular time interval using ``train_time_interval`` argument independent of the steps or epochs. -- In case you are monitoring a training metrics, we'd suggest using ``save_on_train_epoch_end=True`` to ensure the required metric is being accumulated correctly for creating a checkpoint. +- When using iterative training which doesn't have an epoch, you can checkpoint at every ``N`` training steps by specifying ``every_n_train_steps=N``. +- You can also control the interval of epochs between checkpoints using ``every_n_epochs``, to avoid slowdowns. +- You can checkpoint at a regular time interval using the ``train_time_interval`` argument independent of the steps or epochs. +- In case you are monitoring a training metric, we'd suggest using ``save_on_train_epoch_end=True`` to ensure the required metric is being accumulated correctly for creating a checkpoint. Which From 71c9d89d21d598e7045cc4537209b9e8eb880d49 Mon Sep 17 00:00:00 2001 From: Sebastian Raschka Date: Mon, 25 Jul 2022 09:08:08 -0500 Subject: [PATCH 004/230] Add missing docstring for LightningWork.stop() (#13368) --- src/lightning_app/core/work.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lightning_app/core/work.py b/src/lightning_app/core/work.py index 0c12242e26d6a..53c9e07e80020 100644 --- a/src/lightning_app/core/work.py +++ b/src/lightning_app/core/work.py @@ -493,6 +493,7 @@ def on_exit(self): pass def stop(self): + """Stops LightingWork component and shuts down hardware provisioned via L.CloudCompute.""" if not self._backend: raise Exception( "Can't stop the work, it looks like it isn't attached to a LightningFlow. 
" From 227871982dd44ef9e6e060e7bd16043b99d8087e Mon Sep 17 00:00:00 2001 From: Justus Schock <12886177+justusschock@users.noreply.github.com> Date: Mon, 25 Jul 2022 16:46:45 +0200 Subject: [PATCH 005/230] Merge different gpu backends with accelerator='gpu' (#13642) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Rename GPUAccelerator to CUDAAccelerator * Add back GPUAccelerator and deprecate it * Remove temporary registration * accelerator connector reroute * accelerator_connector tests * update enums * lite support + tests * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * typo * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * move "gpu" support up before actual accelerator flag checks * Stupid arguments * fix tests * change exception type * fix registry test * pre-commit * CI: debug HPU flow (#13419) * Update the hpu-tests.yml to pull docker from vault * fire & sudo * habana-gaudi-hpus * Check the driver status on gaudi server (#13718) Co-authored-by: arao Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> * Update typing-extensions requirement from <4.2.1,>=4.0.0 to >=4.0.0,<4.3.1 in /requirements (#13529) Update typing-extensions requirement in /requirements Updates the requirements on [typing-extensions](https://github.com/python/typing_extensions) to permit the latest version. - [Release notes](https://github.com/python/typing_extensions/releases) - [Changelog](https://github.com/python/typing_extensions/blob/main/CHANGELOG.md) - [Commits](https://github.com/python/typing_extensions/compare/4.0.0...4.3.0) --- updated-dependencies: - dependency-name: typing-extensions dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> * [pre-commit.ci] pre-commit suggestions (#13540) updates: - [github.com/psf/black: 22.3.0 → 22.6.0](https://github.com/psf/black/compare/22.3.0...22.6.0) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * [FIX] Native FSDP precision + tests (#12985) * Simplify fetching's loader types (#13111) * Include app templates to the lightning and app packages (#13731) * Include app templates to the package Co-authored-by: mansy Co-authored-by: Adrian Wälchli * Fix mypy typing errors in pytorch_lightning/callbacks/model_checkpoint.py (#13617) Co-authored-by: Carlos Mocholí * Fix typos initialize in docs (#13557) Co-authored-by: Carlos Mocholí Co-authored-by: Adrian Wälchli * Fix main progress bar counter when `val_check_interval=int` and `check_val_every_n_epoch=None` (#12832) * Fix mypy errors attributed to `pytorch_lightning.loggers.tensorboard.py` (#13688) Co-authored-by: Adrian Wälchli Co-authored-by: Rohit Gupta Co-authored-by: Carlos Mocholí * Fix mypy errors attributed to `pytorch_lightning.loggers.mlflow` (#13691) Co-authored-by: Jirka Borovec Co-authored-by: otaj <6065855+otaj@users.noreply.github.com> * fix mypy errors for loggers/wandb.py (#13483) Co-authored-by: Carlos Mocholí Co-authored-by: Rohit Gupta Co-authored-by: Akihiro Nitta * Fix gatekeeper minimum check (#13769) * changelog * changelog * fix order * move up again * add missing test Co-authored-by: rohitgr7 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Jirka Borovec Co-authored-by: arao Co-authored-by: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Sean Naren Co-authored-by: Carlos Mocholí Co-authored-by: Mansy Co-authored-by: mansy Co-authored-by: Adrian Wälchli Co-authored-by: Lee Jungwon <33821003+BongYang@users.noreply.github.com> Co-authored-by: Nathaniel D'Amours <88633026+NathanielDamours@users.noreply.github.com> Co-authored-by: Justin Goheen <26209687+JustinGoheen@users.noreply.github.com> Co-authored-by: otaj <6065855+otaj@users.noreply.github.com> Co-authored-by: Gautier Dagan Co-authored-by: Akihiro Nitta --- src/pytorch_lightning/CHANGELOG.md | 6 +++ src/pytorch_lightning/accelerators/cuda.py | 6 --- src/pytorch_lightning/lite/lite.py | 7 +-- .../connectors/accelerator_connector.py | 21 +++++++-- src/pytorch_lightning/utilities/enums.py | 4 +- .../test_accelerator_connector.py | 44 +++++++++++++++++-- .../accelerators/test_accelerator_registry.py | 2 +- tests/tests_pytorch/lite/test_lite.py | 2 + 8 files changed, 73 insertions(+), 19 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index af53c9b063853..b2c7ca54e68a7 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -110,6 +110,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Changed +- `accelerator="gpu"` now automatically selects an available GPU backend (CUDA and MPS currently) ([#13642](https://github.com/Lightning-AI/lightning/pull/13642)) + + - Enable validation during overfitting ([#12527](https://github.com/PyTorchLightning/pytorch-lightning/pull/12527)) @@ -166,6 +169,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Deprecated +- Deprecated `pytorch_lightning.accelerators.gpu.GPUAccelerator` in favor of `pytorch_lightning.accelerators.cuda.CUDAAccelerator` ([#13636](https://github.com/Lightning-AI/lightning/pull/13636)) + + - Deprecated `pytorch_lightning.loggers.base.LightningLoggerBase` in favor of `pytorch_lightning.loggers.logger.Logger`, and deprecated `pytorch_lightning.loggers.base` in favor of `pytorch_lightning.loggers.logger` ([#120148](https://github.com/PyTorchLightning/pytorch-lightning/pull/12014)) diff --git a/src/pytorch_lightning/accelerators/cuda.py b/src/pytorch_lightning/accelerators/cuda.py index a474ef9a99031..1c69015546976 100644 --- a/src/pytorch_lightning/accelerators/cuda.py +++ b/src/pytorch_lightning/accelerators/cuda.py @@ -97,12 +97,6 @@ def register_accelerators(cls, accelerator_registry: Dict) -> None: cls, description=f"{cls.__class__.__name__}", ) - # temporarily enable "gpu" to point to the CUDA Accelerator - accelerator_registry.register( - "gpu", - cls, - description=f"{cls.__class__.__name__}", - ) def teardown(self) -> None: # clean up memory diff --git a/src/pytorch_lightning/lite/lite.py b/src/pytorch_lightning/lite/lite.py index 86bddaf676e01..0195e6852eb28 100644 --- a/src/pytorch_lightning/lite/lite.py +++ b/src/pytorch_lightning/lite/lite.py @@ -54,7 +54,8 @@ class LightningLite(ABC): - Multi-node support. Args: - accelerator: The hardware to run on. Possible choices are: ``"cpu"``, ``"gpu"``, ``"tpu"``, ``"auto"``. + accelerator: The hardware to run on. Possible choices are: + ``"cpu"``, ``"cuda"``, ``"mps"``, ``"gpu"``, ``"tpu"``, ``"auto"``. strategy: Strategy for how to run across multiple devices. Possible choices are: ``"dp"``, ``"ddp"``, ``"ddp_spawn"``, ``"deepspeed"``, ``"ddp_sharded"``. devices: Number of devices to train on (``int``), which GPUs to train on (``list`` or ``str``), or ``"auto"``. @@ -436,7 +437,7 @@ def _get_distributed_sampler(dataloader: DataLoader, **kwargs: Any) -> Distribut return DistributedSamplerWrapper(dataloader.sampler, **kwargs) def _check_accelerator_support(self, accelerator: Optional[Union[str, Accelerator]]) -> None: - supported = [t.value.lower() for t in self._supported_device_types()] + ["auto"] + supported = [t.value.lower() for t in self._supported_device_types()] + ["gpu", "auto"] valid = accelerator is None or isinstance(accelerator, Accelerator) or accelerator in supported if not valid: raise MisconfigurationException( @@ -457,7 +458,7 @@ def _check_strategy_support(self, strategy: Optional[Union[str, Strategy]]) -> N def _supported_device_types() -> Sequence[_AcceleratorType]: return ( _AcceleratorType.CPU, - _AcceleratorType.GPU, + _AcceleratorType.CUDA, _AcceleratorType.TPU, _AcceleratorType.MPS, ) diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index dc8594bfd7021..bd879cf85ff7a 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -201,10 +201,14 @@ def __init__( devices=devices, num_nodes=num_nodes, num_processes=num_processes, gpus=gpus, ipus=ipus, tpu_cores=tpu_cores ) # 2. 
Instantiate Accelerator - # handle `auto` and `None` self._set_accelerator_if_ipu_strategy_is_passed() + + # handle `auto`, `None` and `gpu` if self._accelerator_flag == "auto" or self._accelerator_flag is None: - self._accelerator_flag = self._choose_accelerator() + self._accelerator_flag = self._choose_auto_accelerator() + elif self._accelerator_flag == "gpu": + self._accelerator_flag = self._choose_gpu_accelerator_backend() + self._set_parallel_devices_and_init_accelerator() # 3. Instantiate ClusterEnvironment @@ -280,7 +284,7 @@ def _check_config_and_set_final_flags( if ( accelerator is not None and accelerator not in self._accelerator_types - and accelerator != "auto" + and accelerator not in ("auto", "gpu") and not isinstance(accelerator, Accelerator) ): raise ValueError( @@ -487,7 +491,7 @@ def _set_accelerator_if_ipu_strategy_is_passed(self) -> None: if isinstance(self._strategy_flag, IPUStrategy): self._accelerator_flag = "ipu" - def _choose_accelerator(self) -> str: + def _choose_auto_accelerator(self) -> str: """Choose the accelerator type (str) based on availability when ``accelerator='auto'``.""" if self._accelerator_flag == "auto": if _TPU_AVAILABLE: @@ -502,6 +506,15 @@ def _choose_accelerator(self) -> str: return "cuda" return "cpu" + @staticmethod + def _choose_gpu_accelerator_backend() -> str: + if MPSAccelerator.is_available(): + return "mps" + if CUDAAccelerator.is_available(): + return "cuda" + + raise MisconfigurationException("No supported gpu backend found!") + def _set_parallel_devices_and_init_accelerator(self) -> None: if isinstance(self._accelerator_flag, Accelerator): self.accelerator: Accelerator = self._accelerator_flag diff --git a/src/pytorch_lightning/utilities/enums.py b/src/pytorch_lightning/utilities/enums.py index 91f8466b77500..d7d3a14ec924a 100644 --- a/src/pytorch_lightning/utilities/enums.py +++ b/src/pytorch_lightning/utilities/enums.py @@ -244,7 +244,7 @@ class _AcceleratorType(LightningEnum): >>> _AcceleratorType.CPU == _AcceleratorType.from_str('cpu') True >>> # you can match the type with string - >>> _AcceleratorType.GPU == 'GPU' + >>> _AcceleratorType.CUDA == 'CUDA' True >>> # which is case invariant >>> _AcceleratorType.TPU in ('tpu', 'CPU') @@ -252,7 +252,7 @@ class _AcceleratorType(LightningEnum): """ CPU = "CPU" - GPU = "GPU" + CUDA = "CUDA" IPU = "IPU" TPU = "TPU" HPU = "HPU" diff --git a/tests/tests_pytorch/accelerators/test_accelerator_connector.py b/tests/tests_pytorch/accelerators/test_accelerator_connector.py index 82a7f89a7647f..06f088e87ea4d 100644 --- a/tests/tests_pytorch/accelerators/test_accelerator_connector.py +++ b/tests/tests_pytorch/accelerators/test_accelerator_connector.py @@ -278,7 +278,7 @@ def test_accelerator_cpu(_): MisconfigurationException, match="CUDAAccelerator can not run on your system since the accelerator is not available.", ): - Trainer(accelerator="gpu") + Trainer(accelerator="cuda") with pytest.deprecated_call(match=r"is deprecated in v1.7 and will be removed"): Trainer(accelerator="cpu", gpus=1) @@ -671,7 +671,7 @@ def test_devices_auto_choice_mps(): @pytest.mark.parametrize( ["parallel_devices", "accelerator"], - [([torch.device("cpu")], "gpu"), ([torch.device("cuda", i) for i in range(8)], ("tpu"))], + [([torch.device("cpu")], "cuda"), ([torch.device("cuda", i) for i in range(8)], ("tpu"))], ) def test_parallel_devices_in_strategy_confilict_with_accelerator(parallel_devices, accelerator): with pytest.raises(MisconfigurationException, match=r"parallel_devices set through"): @@ -746,13 +746,51 @@ def 
test_plugin_only_one_instance_for_one_type(plugins, expected): Trainer(plugins=plugins) -@pytest.mark.parametrize("accelerator", ("cpu", "gpu", "tpu", "ipu")) +@pytest.mark.parametrize("accelerator", ("cpu", "cuda", "mps", "tpu", "ipu")) @pytest.mark.parametrize("devices", ("0", 0, [])) def test_passing_zero_and_empty_list_to_devices_flag(accelerator, devices): with pytest.raises(MisconfigurationException, match="value is not a valid input using"): Trainer(accelerator=accelerator, devices=devices) +@pytest.mark.parametrize( + "expected_accelerator_flag,expected_accelerator_class", + [ + pytest.param("cuda", CUDAAccelerator, marks=RunIf(min_cuda_gpus=1)), + pytest.param("mps", MPSAccelerator, marks=RunIf(mps=True)), + ], +) +def test_gpu_accelerator_backend_choice(expected_accelerator_flag, expected_accelerator_class): + + trainer = Trainer(accelerator="gpu") + assert trainer._accelerator_connector._accelerator_flag == expected_accelerator_flag + assert isinstance(trainer.accelerator, expected_accelerator_class) + + +@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=1) +def test_gpu_accelerator_backend_choice_cuda(_): + trainer = Trainer(accelerator="gpu") + + assert trainer._accelerator_connector._accelerator_flag == "cuda" + assert isinstance(trainer.accelerator, CUDAAccelerator) + + +@mock.patch("pytorch_lightning.accelerators.mps._MPS_AVAILABLE", return_value=True) +@mock.patch("torch.device", return_value="mps") # necessary because torch doesn't allow creation of mps devices +def test_gpu_accelerator_backend_choice_mps(*_): + trainer = Trainer(accelerator="gpu") + + assert trainer._accelerator_connector._accelerator_flag == "mps" + assert isinstance(trainer.accelerator, MPSAccelerator) + + +@mock.patch("pytorch_lightning.accelerators.mps.MPSAccelerator.is_available", return_value=False) +@mock.patch("pytorch_lightning.accelerators.cuda.CUDAAccelerator.is_available", return_value=False) +def test_gpu_accelerator_misconfiguration_exception(*_): + with pytest.raises(MisconfigurationException, match="No supported gpu backend found!"): + Trainer(accelerator="gpu") + + @mock.patch("pytorch_lightning.accelerators.hpu.HPUAccelerator.is_available", return_value=True) @mock.patch("pytorch_lightning.strategies.hpu_parallel._HPU_AVAILABLE", return_value=True) @mock.patch("pytorch_lightning.plugins.precision.hpu._HPU_AVAILABLE", return_value=True) diff --git a/tests/tests_pytorch/accelerators/test_accelerator_registry.py b/tests/tests_pytorch/accelerators/test_accelerator_registry.py index 791d4c33dbbe8..004723c19eeb6 100644 --- a/tests/tests_pytorch/accelerators/test_accelerator_registry.py +++ b/tests/tests_pytorch/accelerators/test_accelerator_registry.py @@ -63,4 +63,4 @@ def is_available(): def test_available_accelerators_in_registry(): - assert AcceleratorRegistry.available_accelerators() == ["cpu", "cuda", "gpu", "hpu", "ipu", "mps", "tpu"] + assert AcceleratorRegistry.available_accelerators() == ["cpu", "cuda", "hpu", "ipu", "mps", "tpu"] diff --git a/tests/tests_pytorch/lite/test_lite.py b/tests/tests_pytorch/lite/test_lite.py index 6d0c0fe891695..c0439854013a2 100644 --- a/tests/tests_pytorch/lite/test_lite.py +++ b/tests/tests_pytorch/lite/test_lite.py @@ -315,9 +315,11 @@ def test_setup_dataloaders_replace_standard_sampler(shuffle, strategy): "accelerator, expected", [ ("cpu", "cpu"), + pytest.param("cuda", "cuda:0", marks=RunIf(min_cuda_gpus=1)), pytest.param("gpu", "cuda:0", marks=RunIf(min_cuda_gpus=1)), pytest.param("tpu", "xla:0", 
marks=RunIf(tpu=True)), pytest.param("mps", "mps:0", marks=RunIf(mps=True)), + pytest.param("gpu", "mps:0", marks=RunIf(mps=True)), ], ) def test_to_device(accelerator, expected): From a8d7b4476c0f1bbe82de3e75550977c70265f90a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Mon, 25 Jul 2022 18:51:16 +0200 Subject: [PATCH 006/230] Fix PyTorch spelling errors (#13774) * Fix PyTorch spelling errors * more --- .github/CONTRIBUTING.md | 2 +- .github/ISSUE_TEMPLATE/documentation.md | 2 +- .github/ISSUE_TEMPLATE/feature_request.md | 2 +- .github/ISSUE_TEMPLATE/refactor.md | 2 +- .github/stale.yml | 2 +- dockers/base-xla/Dockerfile | 2 +- docs/source-app/get_started/training_with_apps.rst | 4 ++-- docs/source-pytorch/deploy/production_advanced_2.rst | 2 +- docs/source-pytorch/ecosystem/asr_nlp_tts.rst | 2 +- examples/pl_domain_templates/computer_vision_fine_tuning.py | 2 +- src/lightning_app/components/python/tracer.py | 2 +- src/pytorch_lightning/CHANGELOG.md | 4 ++-- src/pytorch_lightning/overrides/distributed.py | 2 +- src/pytorch_lightning/strategies/fully_sharded_native.py | 2 +- src/pytorch_lightning/trainer/trainer.py | 2 +- tests/tests_pytorch/helpers/datasets.py | 2 +- 16 files changed, 18 insertions(+), 18 deletions(-) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 1d47028bfef89..7bec2d8763afd 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -225,7 +225,7 @@ git push -f #### How to add new tests? -We are using [pytest](https://docs.pytest.org/en/stable/) in Pytorch Lightning. +We are using [pytest](https://docs.pytest.org/en/stable/) in PyTorch Lightning. Here are tutorials: diff --git a/.github/ISSUE_TEMPLATE/documentation.md b/.github/ISSUE_TEMPLATE/documentation.md index 9336d4bd35415..8f94ee921e7ee 100644 --- a/.github/ISSUE_TEMPLATE/documentation.md +++ b/.github/ISSUE_TEMPLATE/documentation.md @@ -30,4 +30,4 @@ ______________________________________________________________________ - [**Bolts**](https://github.com/Lightning-AI/lightning-bolts): Pretrained SOTA Deep Learning models, callbacks, and more for research and production with PyTorch Lightning and PyTorch. -- [**Lightning Transformers**](https://github.com/Lightning-AI/lightning-transformers): Flexible interface for high-performance research using SOTA Transformers leveraging Pytorch Lightning, Transformers, and Hydra. +- [**Lightning Transformers**](https://github.com/Lightning-AI/lightning-transformers): Flexible interface for high-performance research using SOTA Transformers leveraging PyTorch Lightning, Transformers, and Hydra. diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 77f5bac403d72..0d506dd923087 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -38,4 +38,4 @@ ______________________________________________________________________ - [**Bolts**](https://github.com/Lightning-AI/lightning-bolts): Pretrained SOTA Deep Learning models, callbacks, and more for research and production with PyTorch Lightning and PyTorch. -- [**Lightning Transformers**](https://github.com/Lightning-AI/lightning-transformers): Flexible interface for high-performance research using SOTA Transformers leveraging Pytorch Lightning, Transformers, and Hydra. +- [**Lightning Transformers**](https://github.com/Lightning-AI/lightning-transformers): Flexible interface for high-performance research using SOTA Transformers leveraging PyTorch Lightning, Transformers, and Hydra. 
diff --git a/.github/ISSUE_TEMPLATE/refactor.md b/.github/ISSUE_TEMPLATE/refactor.md index 7df1c3002665e..159a4ce8d651b 100644 --- a/.github/ISSUE_TEMPLATE/refactor.md +++ b/.github/ISSUE_TEMPLATE/refactor.md @@ -34,4 +34,4 @@ ______________________________________________________________________ - [**Bolts**](https://github.com/Lightning-AI/lightning-bolts): Pretrained SOTA Deep Learning models, callbacks, and more for research and production with PyTorch Lightning and PyTorch. -- [**Lightning Transformers**](https://github.com/Lightning-AI/lightning-transformers): Flexible interface for high-performance research using SOTA Transformers leveraging Pytorch Lightning, Transformers, and Hydra. +- [**Lightning Transformers**](https://github.com/Lightning-AI/lightning-transformers): Flexible interface for high-performance research using SOTA Transformers leveraging PyTorch Lightning, Transformers, and Hydra. diff --git a/.github/stale.yml b/.github/stale.yml index 51b57c079879d..a1fb9abfc9257 100644 --- a/.github/stale.yml +++ b/.github/stale.yml @@ -14,7 +14,7 @@ issues: markComment: > This issue has been automatically marked as stale because it hasn't had any recent activity. This issue will be closed in 7 days if no further activity occurs. - Thank you for your contributions, Pytorch Lightning Team! + Thank you for your contributions, PyTorch Lightning Team! # Comment to post when closing a stale issue. Set to `false` to disable closeComment: false diff --git a/dockers/base-xla/Dockerfile b/dockers/base-xla/Dockerfile index 13da7c22086d8..977aee878ffcd 100644 --- a/dockers/base-xla/Dockerfile +++ b/dockers/base-xla/Dockerfile @@ -77,7 +77,7 @@ ENV \ RUN pip --version && \ pip config set global.cache-dir false && \ conda remove pytorch torchvision && \ - # Install Pytorch XLA + # Install PyTorch XLA py_version=${PYTHON_VERSION/./} && \ gsutil cp "gs://tpu-pytorch/wheels/torch-${XLA_VERSION}-cp${py_version}-cp${py_version}m-linux_x86_64.whl" . && \ gsutil cp "gs://tpu-pytorch/wheels/torch_xla-${XLA_VERSION}-cp${py_version}-cp${py_version}m-linux_x86_64.whl" . && \ diff --git a/docs/source-app/get_started/training_with_apps.rst b/docs/source-app/get_started/training_with_apps.rst index a7061cae562fb..f509ba4cf0267 100644 --- a/docs/source-app/get_started/training_with_apps.rst +++ b/docs/source-app/get_started/training_with_apps.rst @@ -8,7 +8,7 @@ Evolve a model into an ML system **Required background:** Basic Python familiarity and complete the :ref:`build_model` guide. -**Goal:** We'll walk you through the two key steps to build your first Lightning App from your existing Pytorch Lightning scripts. +**Goal:** We'll walk you through the two key steps to build your first Lightning App from your existing PyTorch Lightning scripts. .. join_slack:: :align: left @@ -50,7 +50,7 @@ Inside the ``app.py`` file, add the following code. .. literalinclude:: ../code_samples/convert_pl_to_app/app.py -This App runs the Pytorch Lightning script contained in the ``train.py`` file using the powerful :class:`~lightning_app.components.python.tracer.TracerPythonScript` component. This is really worth checking out! +This App runs the PyTorch Lightning script contained in the ``train.py`` file using the powerful :class:`~lightning_app.components.python.tracer.TracerPythonScript` component. This is really worth checking out! 
---- diff --git a/docs/source-pytorch/deploy/production_advanced_2.rst b/docs/source-pytorch/deploy/production_advanced_2.rst index 5f6fe58d6ef72..ea5ca9fd24a8b 100644 --- a/docs/source-pytorch/deploy/production_advanced_2.rst +++ b/docs/source-pytorch/deploy/production_advanced_2.rst @@ -34,7 +34,7 @@ can save or directly use. It is recommended that you install the latest supported version of PyTorch to use this feature without limitations. -Once you have the exported model, you can run it in Pytorch or C++ runtime: +Once you have the exported model, you can run it in PyTorch or C++ runtime: .. code-block:: python diff --git a/docs/source-pytorch/ecosystem/asr_nlp_tts.rst b/docs/source-pytorch/ecosystem/asr_nlp_tts.rst index b624696886c73..abec585df6ff7 100644 --- a/docs/source-pytorch/ecosystem/asr_nlp_tts.rst +++ b/docs/source-pytorch/ecosystem/asr_nlp_tts.rst @@ -48,7 +48,7 @@ so that each can be configured from .yaml or the Hydra CLI. .. note:: Every NeMo model has an example configuration file and a corresponding script that contains all configurations needed for training. -The end result of using NeMo, Pytorch Lightning, and Hydra is that +The end result of using NeMo, PyTorch Lightning, and Hydra is that NeMo models all have the same look and feel. This makes it easy to do Conversational AI research across multiple domains. NeMo models are also fully compatible with the PyTorch ecosystem. diff --git a/examples/pl_domain_templates/computer_vision_fine_tuning.py b/examples/pl_domain_templates/computer_vision_fine_tuning.py index fedd837de0348..b33d63eb6589b 100644 --- a/examples/pl_domain_templates/computer_vision_fine_tuning.py +++ b/examples/pl_domain_templates/computer_vision_fine_tuning.py @@ -150,7 +150,7 @@ def val_dataloader(self): return self.__dataloader(train=False) -# --- Pytorch-lightning module --- +# --- PyTorch Lightning module --- class TransferLearningModel(LightningModule): diff --git a/src/lightning_app/components/python/tracer.py b/src/lightning_app/components/python/tracer.py index 5605eee6b6d47..ed692c7f3ed27 100644 --- a/src/lightning_app/components/python/tracer.py +++ b/src/lightning_app/components/python/tracer.py @@ -79,7 +79,7 @@ def __init__( This callback has a reference to the work and on every batch end, we are capturing the trainer ``global_step`` and ``best_model_path``. - Even more interesting, this component works for ANY Pytorch Lightning script and + Even more interesting, this component works for ANY PyTorch Lightning script and its state can be used in real time in a UI. .. literalinclude:: ../../../../examples/app_components/python/component_tracer.py diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index b2c7ca54e68a7..1c3a3b9d5a1be 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -1800,7 +1800,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- `period` has been deprecated in favor of `every_n_val_epochs` in the `ModelCheckpoint` callback ([#6146](https://github.com/PyTorchLightning/pytorch-lightning/pull/6146)) - Deprecated `trainer.running_sanity_check` in favor of `trainer.sanity_checking` ([#4945](https://github.com/PyTorchLightning/pytorch-lightning/pull/4945)) - Deprecated `Profiler(output_filename)` in favor of `dirpath` and `filename` ([#6621](https://github.com/PyTorchLightning/pytorch-lightning/pull/6621)) -- Deprecated `PytorchProfiler(profiled_functions)` in favor of `record_functions` ([#6349](https://github.com/PyTorchLightning/pytorch-lightning/pull/6349)) +- Deprecated `PyTorchProfiler(profiled_functions)` in favor of `record_functions` ([#6349](https://github.com/PyTorchLightning/pytorch-lightning/pull/6349)) - Deprecated `@auto_move_data` in favor of `trainer.predict` ([#6993](https://github.com/PyTorchLightning/pytorch-lightning/pull/6993)) - Deprecated `Callback.on_load_checkpoint(checkpoint)` in favor of `Callback.on_load_checkpoint(trainer, pl_module, checkpoint)` ([#7253](https://github.com/PyTorchLightning/pytorch-lightning/pull/7253)) - Deprecated metrics in favor of `torchmetrics` ( @@ -2386,7 +2386,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). [#4737](https://github.com/PyTorchLightning/pytorch-lightning/pull/4737), [#4773](https://github.com/PyTorchLightning/pytorch-lightning/pull/4773)) - Added `experiment_id` to the NeptuneLogger ([#3462](https://github.com/PyTorchLightning/pytorch-lightning/pull/3462)) -- Added `Pytorch Geometric` integration example with Lightning ([#4568](https://github.com/PyTorchLightning/pytorch-lightning/pull/4568)) +- Added `PyTorch Geometric` integration example with Lightning ([#4568](https://github.com/PyTorchLightning/pytorch-lightning/pull/4568)) - Added `all_gather` method to `LightningModule` which allows gradient based tensor synchronizations for use-cases such as negative sampling. ([#5012](https://github.com/PyTorchLightning/pytorch-lightning/pull/5012)) - Enabled `self.log` in most functions ([#4969](https://github.com/PyTorchLightning/pytorch-lightning/pull/4969)) - Added changeable extension variable for `ModelCheckpoint` ([#4977](https://github.com/PyTorchLightning/pytorch-lightning/pull/4977)) diff --git a/src/pytorch_lightning/overrides/distributed.py b/src/pytorch_lightning/overrides/distributed.py index 8048d83252af7..f09a7b9e3ae08 100644 --- a/src/pytorch_lightning/overrides/distributed.py +++ b/src/pytorch_lightning/overrides/distributed.py @@ -41,7 +41,7 @@ def _find_tensors( # In manual_optimization, we need to call reducer prepare_for_backward. -# Note: Keep track of Pytorch DDP and update if there is a change +# Note: Keep track of PyTorch DDP and update if there is a change # https://github.com/pytorch/pytorch/blob/v1.7.1/torch/nn/parallel/distributed.py#L626-L638 def prepare_for_backward(model: DistributedDataParallel, output: Any) -> None: # `prepare_for_backward` is `DistributedDataParallel` specific. diff --git a/src/pytorch_lightning/strategies/fully_sharded_native.py b/src/pytorch_lightning/strategies/fully_sharded_native.py index 6290164a169b4..553f010763edb 100644 --- a/src/pytorch_lightning/strategies/fully_sharded_native.py +++ b/src/pytorch_lightning/strategies/fully_sharded_native.py @@ -85,7 +85,7 @@ def __init__( `For more information: https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/`. .. warning:: ``DDPFullyShardedNativeStrategy`` is in beta and subject to change. 
The interface can - bring breaking changes and new features with the next release of Pytorch. + bring breaking changes and new features with the next release of PyTorch. Defaults have been set and options have been exposed, but may require configuration based on your level of memory/speed efficiency. We suggest having a look at this tutorial for diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index d10225fea2d65..561fe799f1010 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -256,7 +256,7 @@ def __init__( deterministic: If ``True``, sets whether PyTorch operations must use deterministic algorithms. Set to ``"warn"`` to use deterministic algorithms whenever possible, throwing warnings on operations - that don't support deterministic mode (requires Pytorch 1.11+). If not set, defaults to ``False``. + that don't support deterministic mode (requires PyTorch 1.11+). If not set, defaults to ``False``. Default: ``None``. devices: Will be mapped to either `gpus`, `tpu_cores`, `num_processes` or `ipus`, diff --git a/tests/tests_pytorch/helpers/datasets.py b/tests/tests_pytorch/helpers/datasets.py index 2366145004c6d..3443020d4528f 100644 --- a/tests/tests_pytorch/helpers/datasets.py +++ b/tests/tests_pytorch/helpers/datasets.py @@ -23,7 +23,7 @@ class MNIST(Dataset): - """Customized `MNIST `_ dataset for testing Pytorch Lightning without the + """Customized `MNIST `_ dataset for testing PyTorch Lightning without the torchvision dependency. Part of the code was copied from From 4c35867b618b4c36bfac5428756b95223f7f526a Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 25 Jul 2022 19:13:46 +0200 Subject: [PATCH 007/230] [App] Introduce Commands (#13602) --- .github/workflows/ci-app_cloud_e2e_test.yml | 3 +- .gitignore | 3 + examples/app_commands/.lightning | 1 + examples/app_commands/app.py | 39 +++ examples/app_commands/command.py | 17 ++ src/lightning_app/CHANGELOG.md | 4 + src/lightning_app/cli/lightning_cli.py | 91 ++++++- src/lightning_app/components/python/tracer.py | 4 +- src/lightning_app/core/api.py | 84 +++++- src/lightning_app/core/app.py | 9 + src/lightning_app/core/constants.py | 2 +- src/lightning_app/core/flow.py | 37 ++- src/lightning_app/core/queues.py | 17 ++ src/lightning_app/runners/backends/backend.py | 4 +- src/lightning_app/runners/multiprocess.py | 3 + src/lightning_app/source_code/uploader.py | 22 +- src/lightning_app/testing/testing.py | 6 +- src/lightning_app/utilities/cli_helpers.py | 60 ++++- .../utilities/commands/__init__.py | 3 + src/lightning_app/utilities/commands/base.py | 245 ++++++++++++++++++ .../utilities/packaging/lightning_utils.py | 18 +- src/lightning_app/utilities/proxies.py | 4 + tests/tests_app/cli/test_cli.py | 8 +- .../components/python/test_python.py | 7 +- tests/tests_app/core/test_lightning_api.py | 28 +- tests/tests_app/source_code/test_uploader.py | 5 +- tests/tests_app/utilities/test_commands.py | 162 ++++++++++++ tests/tests_app/utilities/test_state.py | 4 +- tests/tests_app_examples/test_commands.py | 31 +++ 29 files changed, 858 insertions(+), 63 deletions(-) create mode 100644 examples/app_commands/.lightning create mode 100644 examples/app_commands/app.py create mode 100644 examples/app_commands/command.py create mode 100644 src/lightning_app/utilities/commands/__init__.py create mode 100644 src/lightning_app/utilities/commands/base.py create mode 100644 tests/tests_app/utilities/test_commands.py create mode 100644 
tests/tests_app_examples/test_commands.py diff --git a/.github/workflows/ci-app_cloud_e2e_test.yml b/.github/workflows/ci-app_cloud_e2e_test.yml index 3abdf9c92ba5e..cb0fbdf40a9e0 100644 --- a/.github/workflows/ci-app_cloud_e2e_test.yml +++ b/.github/workflows/ci-app_cloud_e2e_test.yml @@ -54,6 +54,7 @@ jobs: - custom_work_dependencies - drive - payload + - commands timeout-minutes: 35 steps: - uses: actions/checkout@v2 @@ -155,7 +156,7 @@ jobs: shell: bash run: | mkdir -p ${VIDEO_LOCATION} - HEADLESS=1 python -m pytest tests/tests_app_examples/test_${{ matrix.app_name }}.py::test_${{ matrix.app_name }}_example_cloud --timeout=900 --capture=no -v --color=yes + HEADLESS=1 PACKAGE_LIGHTNING=1 python -m pytest tests/tests_app_examples/test_${{ matrix.app_name }}.py::test_${{ matrix.app_name }}_example_cloud --timeout=900 --capture=no -v --color=yes # Delete the artifacts if successful rm -r ${VIDEO_LOCATION}/${{ matrix.app_name }} diff --git a/.gitignore b/.gitignore index ad4422b1a7ff7..7040a912974e1 100644 --- a/.gitignore +++ b/.gitignore @@ -109,6 +109,7 @@ celerybeat-schedule # dotenv .env +.env_stagging # virtualenv .venv @@ -160,3 +161,5 @@ tags .tags src/lightning_app/ui/* *examples/template_react_ui* +hars* +artifacts/* diff --git a/examples/app_commands/.lightning b/examples/app_commands/.lightning new file mode 100644 index 0000000000000..3efc0ce6284b0 --- /dev/null +++ b/examples/app_commands/.lightning @@ -0,0 +1 @@ +name: app-commands diff --git a/examples/app_commands/app.py b/examples/app_commands/app.py new file mode 100644 index 0000000000000..99eb15c75c709 --- /dev/null +++ b/examples/app_commands/app.py @@ -0,0 +1,39 @@ +from command import CustomCommand, CustomConfig + +from lightning import LightningFlow +from lightning_app.core.app import LightningApp + + +class ChildFlow(LightningFlow): + def trigger_method(self, name: str): + print(f"Hello {name}") + + def configure_commands(self): + return [{"nested_trigger_command": self.trigger_method}] + + +class FlowCommands(LightningFlow): + def __init__(self): + super().__init__() + self.names = [] + self.child_flow = ChildFlow() + + def run(self): + if len(self.names): + print(self.names) + + def trigger_without_client_command(self, name: str): + self.names.append(name) + + def trigger_with_client_command(self, config: CustomConfig): + self.names.append(config.name) + + def configure_commands(self): + commands = [ + {"trigger_without_client_command": self.trigger_without_client_command}, + {"trigger_with_client_command": CustomCommand(self.trigger_with_client_command)}, + ] + return commands + self.child_flow.configure_commands() + + +app = LightningApp(FlowCommands()) diff --git a/examples/app_commands/command.py b/examples/app_commands/command.py new file mode 100644 index 0000000000000..8c3070f6d764c --- /dev/null +++ b/examples/app_commands/command.py @@ -0,0 +1,17 @@ +from argparse import ArgumentParser + +from pydantic import BaseModel + +from lightning.app.utilities.commands import ClientCommand + + +class CustomConfig(BaseModel): + name: str + + +class CustomCommand(ClientCommand): + def run(self): + parser = ArgumentParser() + parser.add_argument("--name", type=str) + args = parser.parse_args() + self.invoke_handler(config=CustomConfig(name=args.name)) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index bc5bf25dc866a..7d0dcb589b9e3 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -8,6 +8,10 @@ The format is based on [Keep a 
Changelog](http://keepachangelog.com/en/1.0.0/). ### Added +- Add support for `Lightning App Commands` through the `configure_commands` hook on the Lightning Flow and the `ClientCommand` ([#13602](https://github.com/Lightning-AI/lightning/pull/13602)) + +### Changed + - Update the Lightning App docs ([#13537](https://github.com/PyTorchLightning/pytorch-lightning/pull/13537)) ### Changed diff --git a/src/lightning_app/cli/lightning_cli.py b/src/lightning_app/cli/lightning_cli.py index 696269c712ea9..74b2d1c4926e1 100644 --- a/src/lightning_app/cli/lightning_cli.py +++ b/src/lightning_app/cli/lightning_cli.py @@ -1,9 +1,13 @@ import logging import os +import sys +from argparse import ArgumentParser from pathlib import Path from typing import List, Tuple, Union +from uuid import uuid4 import click +import requests from requests.exceptions import ConnectionError from lightning_app import __version__ as ver @@ -11,9 +15,13 @@ from lightning_app.core.constants import get_lightning_cloud_url, LOCAL_LAUNCH_ADMIN_VIEW from lightning_app.runners.runtime import dispatch from lightning_app.runners.runtime_type import RuntimeType -from lightning_app.utilities.cli_helpers import _format_input_env_variables +from lightning_app.utilities.cli_helpers import ( + _format_input_env_variables, + _retrieve_application_url_and_available_commands, +) from lightning_app.utilities.install_components import register_all_external_components from lightning_app.utilities.login import Auth +from lightning_app.utilities.state import headers_for logger = logging.getLogger(__name__) @@ -26,14 +34,23 @@ def get_app_url(runtime_type: RuntimeType, *args) -> str: return "http://127.0.0.1:7501/admin" if LOCAL_LAUNCH_ADMIN_VIEW else "http://127.0.0.1:7501/view" +def main(): + if len(sys.argv) == 1: + _main() + elif sys.argv[1] in _main.commands.keys() or sys.argv[1] == "--help": + _main() + else: + app_command() + + @click.group() @click.version_option(ver) -def main(): +def _main(): register_all_external_components() pass -@main.command() +@_main.command() def login(): """Log in to your Lightning.ai account.""" auth = Auth() @@ -46,7 +63,7 @@ def login(): exit(1) -@main.command() +@_main.command() def logout(): """Log out of your Lightning.ai account.""" Auth().clear() @@ -93,7 +110,7 @@ def on_before_run(*args): click.echo("Application is ready in the cloud") -@main.group() +@_main.group() def run(): """Run your application.""" @@ -125,31 +142,83 @@ def run_app( _run_app(file, cloud, without_server, no_cache, name, blocking, open_ui, env) -@main.group(hidden=True) +def app_command(): + """Execute a function in a running application from its name.""" + from lightning_app.utilities.commands.base import _download_command + + logger.warn("Lightning Commands are a beta feature and APIs aren't stable yet.") + + debug_mode = bool(int(os.getenv("DEBUG", "0"))) + + parser = ArgumentParser() + parser.add_argument("--app_id", default=None, type=str, help="Optional argument to identify an application.") + hparams, argv = parser.parse_known_args() + + # 1: Collect the url and comments from the running application + url, commands = _retrieve_application_url_and_available_commands(hparams.app_id) + if url is None or commands is None: + raise Exception("We couldn't find any matching running app.") + + if not commands: + raise Exception("This application doesn't expose any commands yet.") + + command = argv[0] + + command_names = [c["command"] for c in commands] + if command not in command_names: + raise Exception(f"The provided command 
{command} isn't available in {command_names}") + + # 2: Send the command from the user + command_metadata = [c for c in commands if c["command"] == command][0] + params = command_metadata["params"] + + # 3: Execute the command + if not command_metadata["is_client_command"]: + # TODO: Improve what is supported there. + kwargs = {k.split("=")[0].replace("--", ""): k.split("=")[1] for k in argv[1:]} + for param in params: + if param not in kwargs: + raise Exception(f"The argument --{param}=X hasn't been provided.") + json = { + "command_name": command, + "command_arguments": kwargs, + "affiliation": command_metadata["affiliation"], + "id": str(uuid4()), + } + resp = requests.post(url + "/api/v1/commands", json=json, headers=headers_for({})) + assert resp.status_code == 200, resp.json() + else: + client_command, models = _download_command(command_metadata, hparams.app_id, debug_mode=debug_mode) + client_command._setup(metadata=command_metadata, models=models, app_url=url) + sys.argv = argv + client_command.run() + + +@_main.group(hidden=True) def fork(): """Fork an application.""" pass -@main.group(hidden=True) +@_main.group(hidden=True) def stop(): """Stop your application.""" pass -@main.group(hidden=True) +@_main.group(hidden=True) def delete(): """Delete an application.""" pass -@main.group(name="list", hidden=True) +@_main.group(name="list", hidden=True) def get_list(): """List your applications.""" pass -@main.group() +@_main.group() def install(): """Install Lightning apps and components.""" @@ -207,7 +276,7 @@ def install_component(name, yes, version): cmd_install.gallery_component(name, yes, version) -@main.group() +@_main.group() def init(): """Init a Lightning app and component.""" diff --git a/src/lightning_app/components/python/tracer.py b/src/lightning_app/components/python/tracer.py index ed692c7f3ed27..fa955646acbbf 100644 --- a/src/lightning_app/components/python/tracer.py +++ b/src/lightning_app/components/python/tracer.py @@ -93,8 +93,6 @@ def __init__( :language: python """ super().__init__(**kwargs) - if not os.path.exists(script_path): - raise FileNotFoundError(f"The provided `script_path` {script_path}` wasn't found.") self.script_path = str(script_path) if isinstance(script_args, str): script_args = script_args.split(" ") @@ -105,6 +103,8 @@ def __init__( setattr(self, name, None) def run(self, **kwargs): + if not os.path.exists(self.script_path): + raise FileNotFoundError(f"The provided `script_path` {self.script_path}` wasn't found.") kwargs = {k: v.value if isinstance(v, Payload) else v for k, v in kwargs.items()} init_globals = globals() init_globals.update(kwargs) diff --git a/src/lightning_app/core/api.py b/src/lightning_app/core/api.py index 024eb712389b2..f38c1844e28e0 100644 --- a/src/lightning_app/core/api.py +++ b/src/lightning_app/core/api.py @@ -3,6 +3,8 @@ import os import queue import sys +import time +import traceback from copy import deepcopy from multiprocessing import Queue from threading import Event, Lock, Thread @@ -40,6 +42,10 @@ class SessionMiddleware: frontend_static_dir = os.path.join(FRONTEND_DIR, "static") api_app_delta_queue: Queue = None +api_commands_requests_queue: Queue = None +api_commands_metadata_queue: Queue = None +api_commands_responses_queue: Queue = None + template = {"ui": {}, "app": {}} templates = Jinja2Templates(directory=FRONTEND_DIR) @@ -50,6 +56,8 @@ class SessionMiddleware: lock = Lock() app_spec: Optional[List] = None +app_commands_metadata: Optional[Dict] = None +commands_response_store = {} logger = 
logging.getLogger(__name__) @@ -59,16 +67,22 @@ class SessionMiddleware: class UIRefresher(Thread): - def __init__(self, api_publish_state_queue) -> None: + def __init__(self, api_publish_state_queue, api_commands_metadata_queue, api_commands_responses_queue) -> None: super().__init__(daemon=True) self.api_publish_state_queue = api_publish_state_queue + self.api_commands_metadata_queue = api_commands_metadata_queue + self.api_commands_responses_queue = api_commands_responses_queue self._exit_event = Event() def run(self): # TODO: Create multiple threads to handle the background logic # TODO: Investigate the use of `parallel=True` - while not self._exit_event.is_set(): - self.run_once() + try: + while not self._exit_event.is_set(): + self.run_once() + except Exception as e: + logger.error(traceback.print_exc()) + raise e def run_once(self): try: @@ -78,6 +92,22 @@ def run_once(self): except queue.Empty: pass + try: + metadata = self.api_commands_metadata_queue.get(timeout=0) + with lock: + global app_commands_metadata + app_commands_metadata = metadata + except queue.Empty: + pass + + try: + response = self.api_commands_responses_queue.get(timeout=0) + with lock: + global commands_response_store + commands_response_store[response["id"]] = response["response"] + except queue.Empty: + pass + def join(self, timeout: Optional[float] = None) -> None: self._exit_event.set() super().join(timeout) @@ -146,6 +176,43 @@ async def get_spec( return app_spec or [] +@fastapi_service.post("/api/v1/commands", response_class=JSONResponse) +async def run_remote_command( + request: Request, +) -> None: + data = await request.json() + command_name = data.get("command_name", None) + if not command_name: + raise Exception("The provided command name is empty.") + command_arguments = data.get("command_arguments", None) + if not command_arguments: + raise Exception("The provided command metadata is empty.") + affiliation = data.get("affiliation", None) + if not affiliation: + raise Exception("The provided affiliation is empty.") + + async def fn(data): + request_id = data["id"] + api_commands_requests_queue.put(data) + + t0 = time.time() + while request_id not in commands_response_store: + await asyncio.sleep(0.1) + if (time.time() - t0) > 15: + raise Exception("The response was never received.") + + return commands_response_store[request_id] + + return await asyncio.create_task(fn(data)) + + +@fastapi_service.get("/api/v1/commands", response_class=JSONResponse) +async def get_commands() -> Optional[Dict]: + global app_commands_metadata + with lock: + return app_commands_metadata + + @fastapi_service.post("/api/v1/delta") async def post_delta( request: Request, @@ -279,6 +346,9 @@ async def check_is_started(self, queue): def start_server( api_publish_state_queue, api_delta_queue, + commands_requests_queue, + commands_responses_queue, + commands_metadata_queue, has_started_queue: Optional[Queue] = None, host="127.0.0.1", port=8000, @@ -288,16 +358,22 @@ def start_server( ): global api_app_delta_queue global global_app_state_store + global api_commands_requests_queue + global api_commands_responses_queue global app_spec + app_spec = spec api_app_delta_queue = api_delta_queue + api_commands_requests_queue = commands_requests_queue + api_commands_responses_queue = commands_responses_queue + api_commands_metadata_queue = commands_metadata_queue if app_state_store is not None: global_app_state_store = app_state_store global_app_state_store.add(TEST_SESSION_UUID) - refresher = UIRefresher(api_publish_state_queue) + 
refresher = UIRefresher(api_publish_state_queue, api_commands_metadata_queue, commands_responses_queue) refresher.setDaemon(True) refresher.start() diff --git a/src/lightning_app/core/app.py b/src/lightning_app/core/app.py index 81a1a2115e523..6599b53efcb95 100644 --- a/src/lightning_app/core/app.py +++ b/src/lightning_app/core/app.py @@ -16,6 +16,7 @@ from lightning_app.frontend import Frontend from lightning_app.storage.path import storage_root_dir from lightning_app.utilities.app_helpers import _delta_to_appstate_delta, _LightningAppRef +from lightning_app.utilities.commands.base import _populate_commands_endpoint, _process_command_requests from lightning_app.utilities.component import _convert_paths_after_init from lightning_app.utilities.enum import AppStage from lightning_app.utilities.exceptions import CacheMissException, ExitAppException @@ -72,6 +73,9 @@ def __init__( # queues definition. self.delta_queue: t.Optional[BaseQueue] = None self.readiness_queue: t.Optional[BaseQueue] = None + self.commands_requests_queue: t.Optional[BaseQueue] = None + self.commands_responses_queue: t.Optional[BaseQueue] = None + self.commands_metadata_queue: t.Optional[BaseQueue] = None self.api_publish_state_queue: t.Optional[BaseQueue] = None self.api_delta_queue: t.Optional[BaseQueue] = None self.error_queue: t.Optional[BaseQueue] = None @@ -81,6 +85,7 @@ def __init__( self.copy_response_queues: t.Optional[t.Dict[str, BaseQueue]] = None self.caller_queues: t.Optional[t.Dict[str, BaseQueue]] = None self.work_queues: t.Optional[t.Dict[str, BaseQueue]] = None + self.commands: t.Optional[t.List] = None self.should_publish_changes_to_api = False self.component_affiliation = None @@ -345,6 +350,8 @@ def run_once(self): elif self.stage == AppStage.RESTARTING: return self._apply_restarting() + _process_command_requests(self) + try: self.check_error_queue() t0 = time() @@ -397,6 +404,8 @@ def _run(self) -> bool: self._reset_run_time_monitor() + _populate_commands_endpoint(self) + while not done: done = self.run_once() diff --git a/src/lightning_app/core/constants.py b/src/lightning_app/core/constants.py index 7644f60a2c50e..fd62de13cc013 100644 --- a/src/lightning_app/core/constants.py +++ b/src/lightning_app/core/constants.py @@ -22,7 +22,7 @@ REDIS_WARNING_QUEUE_SIZE = 1000 USER_ID = os.getenv("USER_ID", "1234") FRONTEND_DIR = os.path.join(os.path.dirname(lightning_app.__file__), "ui") -PREPARE_LIGHTING = bool(int(os.getenv("PREPARE_LIGHTING", "0"))) +PACKAGE_LIGHTNING = os.getenv("PACKAGE_LIGHTNING", None) LOCAL_LAUNCH_ADMIN_VIEW = bool(int(os.getenv("LOCAL_LAUNCH_ADMIN_VIEW", "0"))) CLOUD_UPLOAD_WARNING = int(os.getenv("CLOUD_UPLOAD_WARNING", "2")) DISABLE_DEPENDENCY_CACHE = bool(int(os.getenv("DISABLE_DEPENDENCY_CACHE", "0"))) diff --git a/src/lightning_app/core/flow.py b/src/lightning_app/core/flow.py index a5dcfd0a77e2e..d1af891476a02 100644 --- a/src/lightning_app/core/flow.py +++ b/src/lightning_app/core/flow.py @@ -356,8 +356,7 @@ def schedule( class Flow(LightningFlow): def run(self): if self.schedule("hourly"): - # run some code once every hour. - print("run this every hour") + print("run some code every hour") Arguments: cron_pattern: The cron pattern to provide. Learn more at https://crontab.guru/. @@ -509,20 +508,16 @@ def my_streamlit_ui(state): # add your streamlit code here! import streamlit as st - st.button("Hello!") **Example:** Arrange the UI of my children in tabs (default UI by Lightning). .. code-block:: python class Flow(LightningFlow): - ... 
- def configure_layout(self): return [ dict(name="First Tab", content=self.child0), dict(name="Second Tab", content=self.child1), - # You can include direct URLs too dict(name="Lightning", content="https://lightning.ai"), ] @@ -608,3 +603,33 @@ def experimental_iterate(self, iterable: Iterable, run_once: bool = True, user_k yield value self._calls[call_hash].update({"has_finished": True}) + + def configure_commands(self): + """Configure the commands of this LightningFlow. + + Returns a list of dictionaries mapping a command name to a flow method. + + .. code-block:: python + + class Flow(LightningFlow): + def __init__(self): + super().__init__() + self.names = [] + + def configure_commands(self): + return {"my_command_name": self.my_remote_method} + + def my_remote_method(self, name): + self.names.append(name) + + Once the app is running with the following command: + + .. code-block:: bash + + lightning run app app.py + + .. code-block:: bash + + lightning my_command_name --args name=my_own_name + """ + raise NotImplementedError diff --git a/src/lightning_app/core/queues.py b/src/lightning_app/core/queues.py index 3b88d896536fe..efac8230047e0 100644 --- a/src/lightning_app/core/queues.py +++ b/src/lightning_app/core/queues.py @@ -36,6 +36,9 @@ ORCHESTRATOR_COPY_REQUEST_CONSTANT = "ORCHESTRATOR_COPY_REQUEST" ORCHESTRATOR_COPY_RESPONSE_CONSTANT = "ORCHESTRATOR_COPY_RESPONSE" WORK_QUEUE_CONSTANT = "WORK_QUEUE" +COMMANDS_REQUESTS_QUEUE_CONSTANT = "COMMANDS_REQUESTS_QUEUE" +COMMANDS_RESPONSES_QUEUE_CONSTANT = "COMMANDS_RESPONSES_QUEUE" +COMMANDS_METADATA_QUEUE_CONSTANT = "COMMANDS_METADATA_QUEUE" class QueuingSystem(Enum): @@ -51,6 +54,20 @@ def _get_queue(self, queue_name: str) -> "BaseQueue": else: return SingleProcessQueue(queue_name, default_timeout=STATE_UPDATE_TIMEOUT) + def get_commands_requests_queue(self, queue_id: Optional[str] = None) -> "BaseQueue": + queue_name = f"{queue_id}_{COMMANDS_REQUESTS_QUEUE_CONSTANT}" if queue_id else COMMANDS_REQUESTS_QUEUE_CONSTANT + return self._get_queue(queue_name) + + def get_commands_responses_queue(self, queue_id: Optional[str] = None) -> "BaseQueue": + queue_name = ( + f"{queue_id}_{COMMANDS_RESPONSES_QUEUE_CONSTANT}" if queue_id else COMMANDS_RESPONSES_QUEUE_CONSTANT + ) + return self._get_queue(queue_name) + + def get_commands_metadata_queue(self, queue_id: Optional[str] = None) -> "BaseQueue": + queue_name = f"{queue_id}_{COMMANDS_METADATA_QUEUE_CONSTANT}" if queue_id else COMMANDS_METADATA_QUEUE_CONSTANT + return self._get_queue(queue_name) + def get_readiness_queue(self, queue_id: Optional[str] = None) -> "BaseQueue": queue_name = f"{queue_id}_{READINESS_QUEUE_CONSTANT}" if queue_id else READINESS_QUEUE_CONSTANT return self._get_queue(queue_name) diff --git a/src/lightning_app/runners/backends/backend.py b/src/lightning_app/runners/backends/backend.py index 80ceb105bbbd1..c370c7098b778 100644 --- a/src/lightning_app/runners/backends/backend.py +++ b/src/lightning_app/runners/backends/backend.py @@ -82,9 +82,11 @@ def _prepare_queues(self, app): kw = dict(queue_id=self.queue_id) app.delta_queue = self.queues.get_delta_queue(**kw) app.readiness_queue = self.queues.get_readiness_queue(**kw) + app.commands_requests_queue = self.queues.get_commands_requests_queue(**kw) + app.commands_responses_queue = self.queues.get_commands_responses_queue(**kw) + app.commands_metadata_queue = self.queues.get_commands_metadata_queue(**kw) app.error_queue = self.queues.get_error_queue(**kw) app.delta_queue = self.queues.get_delta_queue(**kw) - 
app.readiness_queue = self.queues.get_readiness_queue(**kw) app.error_queue = self.queues.get_error_queue(**kw) app.api_publish_state_queue = self.queues.get_api_state_publish_queue(**kw) app.api_delta_queue = self.queues.get_api_delta_queue(**kw) diff --git a/src/lightning_app/runners/multiprocess.py b/src/lightning_app/runners/multiprocess.py index 4c58c816c566c..92ec900d89c65 100644 --- a/src/lightning_app/runners/multiprocess.py +++ b/src/lightning_app/runners/multiprocess.py @@ -66,6 +66,9 @@ def dispatch(self, *args: Any, on_before_run: Optional[Callable] = None, **kwarg api_publish_state_queue=self.app.api_publish_state_queue, api_delta_queue=self.app.api_delta_queue, has_started_queue=has_started_queue, + commands_requests_queue=self.app.commands_requests_queue, + commands_responses_queue=self.app.commands_responses_queue, + commands_metadata_queue=self.app.commands_metadata_queue, spec=extract_metadata_from_app(self.app), ) server_proc = multiprocessing.Process(target=start_server, kwargs=kwargs) diff --git a/src/lightning_app/source_code/uploader.py b/src/lightning_app/source_code/uploader.py index b3a77bc6334bc..5816c01c3fb1c 100644 --- a/src/lightning_app/source_code/uploader.py +++ b/src/lightning_app/source_code/uploader.py @@ -39,16 +39,15 @@ def __init__(self, presigned_url: str, source_file: str, total_size: int, name: self.total_size = total_size self.name = name - @staticmethod - def upload_s3_data(url: str, data: bytes, retries: int, disconnect_retry_wait_seconds: int) -> str: - """Send data to s3 url. + def upload_data(self, url: str, data: bytes, retries: int, disconnect_retry_wait_seconds: int) -> str: + """Send data to url. Parameters ---------- url: str - S3 url string to send data to + url string to send data to data: bytes - Bytes of data to send to S3 + Bytes of data to send to url retries: int Amount of retries disconnect_retry_wait_seconds: int @@ -65,16 +64,19 @@ def upload_s3_data(url: str, data: bytes, retries: int, disconnect_retry_wait_se retries = Retry(total=10) with requests.Session() as s: s.mount("https://", HTTPAdapter(max_retries=retries)) - response = s.put(url, data=data) - if "ETag" not in response.headers: - raise ValueError(f"Unexpected response from S3, response: {response.content}") - return response.headers["ETag"] + return self._upload_data(s, url, data) except BrokenPipeError: time.sleep(disconnect_retry_wait_seconds) disconnect_retries -= 1 raise ValueError("Unable to upload file after multiple attempts") + def _upload_data(self, s: requests.Session, url: str, data: bytes): + resp = s.put(url, data=data) + if "ETag" not in resp.headers: + raise ValueError(f"Unexpected response from {url}, response: {resp.content}") + return resp.headers["ETag"] + def upload(self) -> None: """Upload files from source dir into target path in S3.""" task_id = self.progress.add_task("upload", filename=self.name, total=self.total_size) @@ -82,7 +84,7 @@ def upload(self) -> None: try: with open(self.source_file, "rb") as f: data = f.read() - self.upload_s3_data(self.presigned_url, data, self.retries, self.disconnect_retry_wait_seconds) + self.upload_data(self.presigned_url, data, self.retries, self.disconnect_retry_wait_seconds) self.progress.update(task_id, advance=len(data)) finally: self.progress.stop() diff --git a/src/lightning_app/testing/testing.py b/src/lightning_app/testing/testing.py index 9e7c727756ba0..bdf37cacf04a7 100644 --- a/src/lightning_app/testing/testing.py +++ b/src/lightning_app/testing/testing.py @@ -179,11 +179,7 @@ def 
run_app_in_cloud(app_folder: str, app_name: str = "app.py") -> Generator: # 5. Create chromium browser, auth to lightning_app.ai and yield the admin and view pages. with sync_playwright() as p: browser = p.chromium.launch(headless=bool(int(os.getenv("HEADLESS", "0")))) - payload = { - "apiKey": Config.api_key, - "username": Config.username, - "duration": "120000", - } + payload = {"apiKey": Config.api_key, "username": Config.username, "duration": "120000"} context = browser.new_context( # Eventually this will need to be deleted http_credentials=HttpCredentials( diff --git a/src/lightning_app/utilities/cli_helpers.py b/src/lightning_app/utilities/cli_helpers.py index b573440501b3e..fcce96ec64407 100644 --- a/src/lightning_app/utilities/cli_helpers.py +++ b/src/lightning_app/utilities/cli_helpers.py @@ -1,5 +1,11 @@ import re -from typing import Dict +from typing import Dict, Optional + +import requests + +from lightning_app.core.constants import APP_SERVER_PORT +from lightning_app.utilities.cloud import _get_project +from lightning_app.utilities.network import LightningClient def _format_input_env_variables(env_list: tuple) -> Dict[str, str]: @@ -35,3 +41,55 @@ def _format_input_env_variables(env_list: tuple) -> Dict[str, str]: env_vars_dict[var_name] = value return env_vars_dict + + +def _is_url(id: Optional[str]) -> bool: + if isinstance(id, str) and (id.startswith("https://") or id.startswith("http://")): + return True + return False + + +def _retrieve_application_url_and_available_commands(app_id_or_name_or_url: Optional[str]): + """This function is used to retrieve the current url associated with an id.""" + + if _is_url(app_id_or_name_or_url): + url = app_id_or_name_or_url + assert url + resp = requests.get(url + "/api/v1/commands") + if resp.status_code != 200: + raise Exception(f"The server didn't process the request properly. Found {resp.json()}") + return url, resp.json() + + # 2: If no identifier has been provided, evaluate the local application + failed_locally = False + + if app_id_or_name_or_url is None: + try: + url = f"http://localhost:{APP_SERVER_PORT}" + resp = requests.get(f"{url}/api/v1/commands") + if resp.status_code != 200: + raise Exception(f"The server didn't process the request properly. Found {resp.json()}") + return url, resp.json() + except requests.exceptions.ConnectionError: + failed_locally = True + + # 3: If an identified was provided or the local evaluation has failed, evaluate the cloud. + if app_id_or_name_or_url or failed_locally: + client = LightningClient() + project = _get_project(client) + list_lightningapps = client.lightningapp_instance_service_list_lightningapp_instances(project.project_id) + + lightningapp_names = [lightningapp.name for lightningapp in list_lightningapps.lightningapps] + + if not app_id_or_name_or_url: + raise Exception(f"Provide an application name, id or url with --app_id=X. Found {lightningapp_names}") + + for lightningapp in list_lightningapps.lightningapps: + if lightningapp.id == app_id_or_name_or_url or lightningapp.name == app_id_or_name_or_url: + if lightningapp.status.url == "": + raise Exception("The application is starting. Try in a few moments.") + resp = requests.get(lightningapp.status.url + "/api/v1/commands") + if resp.status_code != 200: + raise Exception(f"The server didn't process the request properly. 
Found {resp.json()}") + return lightningapp.status.url, resp.json() + return None, None diff --git a/src/lightning_app/utilities/commands/__init__.py b/src/lightning_app/utilities/commands/__init__.py new file mode 100644 index 0000000000000..2ae6aba120168 --- /dev/null +++ b/src/lightning_app/utilities/commands/__init__.py @@ -0,0 +1,3 @@ +from lightning_app.utilities.commands.base import ClientCommand + +__all__ = ["ClientCommand"] diff --git a/src/lightning_app/utilities/commands/base.py b/src/lightning_app/utilities/commands/base.py new file mode 100644 index 0000000000000..11661e51ca26a --- /dev/null +++ b/src/lightning_app/utilities/commands/base.py @@ -0,0 +1,245 @@ +import errno +import inspect +import logging +import os +import os.path as osp +import shutil +import sys +from getpass import getuser +from importlib.util import module_from_spec, spec_from_file_location +from tempfile import gettempdir +from typing import Any, Callable, Dict, List, Optional, Tuple +from uuid import uuid4 + +import requests +from pydantic import BaseModel + +from lightning_app.utilities.app_helpers import is_overridden +from lightning_app.utilities.cloud import _get_project +from lightning_app.utilities.network import LightningClient +from lightning_app.utilities.state import AppState + +_logger = logging.getLogger(__name__) + + +def makedirs(path: str): + r"""Recursive directory creation function.""" + try: + os.makedirs(osp.expanduser(osp.normpath(path))) + except OSError as e: + if e.errno != errno.EEXIST and osp.isdir(path): + raise e + + +class _ClientCommandConfig(BaseModel): + command: str + affiliation: str + params: Dict[str, str] + is_client_command: bool + cls_path: str + cls_name: str + owner: str + requirements: Optional[List[str]] + + +class ClientCommand: + def __init__(self, method: Callable, requirements: Optional[List[str]] = None) -> None: + self.method = method + flow = getattr(method, "__self__", None) + self.owner = flow.name if flow else None + self.requirements = requirements + self.metadata = None + self.models: Optional[Dict[str, BaseModel]] = None + self.app_url = None + self._state = None + + def _setup(self, metadata: Dict[str, Any], models: Dict[str, BaseModel], app_url: str) -> None: + self.metadata = metadata + self.models = models + self.app_url = app_url + + @property + def state(self): + if self._state is None: + assert self.app_url + # TODO: Resolve this hack + os.environ["LIGHTNING_APP_STATE_URL"] = "1" + self._state = AppState(host=self.app_url) + self._state._request_state() + os.environ.pop("LIGHTNING_APP_STATE_URL") + return self._state + + def run(self, **cli_kwargs) -> None: + """Overrides with the logic to execute on the client side.""" + + def invoke_handler(self, **kwargs: Any) -> Dict[str, Any]: + from lightning.app.utilities.state import headers_for + + assert kwargs.keys() == self.models.keys() + for k, v in kwargs.items(): + assert isinstance(v, self.models[k]) + json = { + "command_name": self.metadata["command"], + "command_arguments": {k: v.json() for k, v in kwargs.items()}, + "affiliation": self.metadata["affiliation"], + "id": str(uuid4()), + } + resp = requests.post(self.app_url + "/api/v1/commands", json=json, headers=headers_for({})) + assert resp.status_code == 200, resp.json() + return resp.json() + + def _to_dict(self): + return {"owner": self.owner, "requirements": self.requirements} + + def __call__(self, **kwargs: Any) -> Any: + assert self.models + input = {} + for k, v in kwargs.items(): + input[k] = self.models[k].parse_raw(v) + return 
self.method(**input) + + +def _download_command( + command_metadata: Dict[str, Any], + app_id: Optional[str], + debug_mode: bool = False, +) -> Tuple[ClientCommand, Dict[str, BaseModel]]: + # TODO: This is a skateboard implementation and the final version will rely on versioned + # immutable commands for security concerns + config = _ClientCommandConfig(**command_metadata) + tmpdir = osp.join(gettempdir(), f"{getuser()}_commands") + makedirs(tmpdir) + target_file = osp.join(tmpdir, f"{config.command}.py") + if app_id: + client = LightningClient() + project_id = _get_project(client).project_id + response = client.lightningapp_instance_service_list_lightningapp_instance_artifacts(project_id, app_id) + for artifact in response.artifacts: + if f"commands/{config.command}.py" == artifact.filename: + r = requests.get(artifact.url, allow_redirects=True) + with open(target_file, "wb") as f: + f.write(r.content) + else: + if not debug_mode: + shutil.copy(config.cls_path, target_file) + + cls_name = config.cls_name + spec = spec_from_file_location(config.cls_name, config.cls_path if debug_mode else target_file) + mod = module_from_spec(spec) + sys.modules[cls_name] = mod + spec.loader.exec_module(mod) + command = getattr(mod, cls_name)(method=None, requirements=config.requirements) + models = {k: getattr(mod, v) for k, v in config.params.items()} + if debug_mode: + shutil.rmtree(tmpdir) + return command, models + + +def _to_annotation(anno: str) -> str: + anno = anno.split("'")[1] + if "." in anno: + return anno.split(".")[-1] + return anno + + +def _command_to_method_and_metadata(command: ClientCommand) -> Tuple[Callable, Dict[str, Any]]: + """Extract method and its metadata from a ClientCommand.""" + params = inspect.signature(command.method).parameters + command_metadata = { + "cls_path": inspect.getfile(command.__class__), + "cls_name": command.__class__.__name__, + "params": {p.name: _to_annotation(str(p.annotation)) for p in params.values()}, + **command._to_dict(), + } + method = command.method + command.models = {} + for k, v in command_metadata["params"].items(): + if v == "_empty": + raise Exception( + f"Please, annotate your method {method} with pydantic BaseModel. Refer to the documentation." + ) + config = getattr(sys.modules[command.__module__], v, None) + if config is None: + config = getattr(sys.modules[method.__module__], v, None) + if config: + raise Exception( + f"The provided annotation for the argument {k} should in the file " + f"{inspect.getfile(command.__class__)}, not {inspect.getfile(command.method)}." + ) + if config is None or not issubclass(config, BaseModel): + raise Exception( + f"The provided annotation for the argument {k} shouldn't an instance of pydantic BaseModel." 
+ ) + command.models[k] = config + return method, command_metadata + + +def _upload_command(command_name: str, command: ClientCommand) -> Optional[str]: + from lightning_app.storage.path import _is_s3fs_available, filesystem, shared_storage_path + + filepath = f"commands/{command_name}.py" + remote_url = str(shared_storage_path() / "artifacts" / filepath) + fs = filesystem() + + if _is_s3fs_available(): + from s3fs import S3FileSystem + + if not isinstance(fs, S3FileSystem): + return + source_file = str(inspect.getfile(command.__class__)) + remote_url = str(shared_storage_path() / "artifacts" / filepath) + fs.put(source_file, remote_url) + return filepath + + +def _populate_commands_endpoint(app): + if not is_overridden("configure_commands", app.root): + return + + # 1: Populate commands metadata + commands = app.root.configure_commands() + commands_metadata = [] + command_names = set() + for command_mapping in commands: + for command_name, command in command_mapping.items(): + is_client_command = isinstance(command, ClientCommand) + extras = {} + if is_client_command: + _upload_command(command_name, command) + command, extras = _command_to_method_and_metadata(command) + if command_name in command_names: + raise Exception(f"The component name {command_name} has already been used. They need to be unique.") + command_names.add(command_name) + params = inspect.signature(command).parameters + commands_metadata.append( + { + "command": command_name, + "affiliation": command.__self__.name, + "params": list(params.keys()), + "is_client_command": is_client_command, + **extras, + } + ) + + # 1.2: Pass the collected commands through the queue to the Rest API. + app.commands_metadata_queue.put(commands_metadata) + app.commands = commands + + +def _process_command_requests(app): + if not is_overridden("configure_commands", app.root): + return + + # 1: Populate commands metadata + commands = app.commands + + # 2: Collect requests metadata + command_query = app.get_state_changed_from_queue(app.commands_requests_queue) + if command_query: + for command in commands: + for command_name, method in command.items(): + if command_query["command_name"] == command_name: + # 2.1: Evaluate the method associated to a specific command. + # Validation is done on the CLI side. + response = method(**command_query["command_arguments"]) + app.commands_responses_queue.put({"response": response, "id": command_query["id"]}) diff --git a/src/lightning_app/utilities/packaging/lightning_utils.py b/src/lightning_app/utilities/packaging/lightning_utils.py index ae26d39ec5bbb..37f4ff22988eb 100644 --- a/src/lightning_app/utilities/packaging/lightning_utils.py +++ b/src/lightning_app/utilities/packaging/lightning_utils.py @@ -15,7 +15,7 @@ from lightning_app import _logger, _PROJECT_ROOT, _root_logger from lightning_app.__version__ import version -from lightning_app.core.constants import PREPARE_LIGHTING +from lightning_app.core.constants import PACKAGE_LIGHTNING from lightning_app.utilities.git import check_github_repository, get_dir_name logger = logging.getLogger(__name__) @@ -96,11 +96,13 @@ def _prepare_lightning_wheels_and_requirements(root: Path) -> Optional[Callable] # Packaging the Lightning codebase happens only inside the `lightning` repo. 
git_dir_name = get_dir_name() if check_github_repository() else None - if not PREPARE_LIGHTING and (not git_dir_name or (git_dir_name and not git_dir_name.startswith("lightning"))): + is_lightning = git_dir_name and git_dir_name == "lightning" + + if (PACKAGE_LIGHTNING is None and not is_lightning) or PACKAGE_LIGHTNING == "0": return - if not bool(int(os.getenv("SKIP_LIGHTING_WHEELS_BUILD", "0"))): - download_frontend(_PROJECT_ROOT) - _prepare_wheel(_PROJECT_ROOT) + + download_frontend(_PROJECT_ROOT) + _prepare_wheel(_PROJECT_ROOT) logger.info("Packaged Lightning with your application.") @@ -108,11 +110,12 @@ def _prepare_lightning_wheels_and_requirements(root: Path) -> Optional[Callable] tar_files = [os.path.join(root, tar_name)] - # skipping this by default - if not bool(int(os.getenv("SKIP_LIGHTING_UTILITY_WHEELS_BUILD", "1"))): + # Don't skip by default + if (PACKAGE_LIGHTNING or is_lightning) and not bool(int(os.getenv("SKIP_LIGHTING_UTILITY_WHEELS_BUILD", "0"))): # building and copying launcher wheel if installed in editable mode launcher_project_path = get_dist_path_if_editable_install("lightning_launcher") if launcher_project_path: + logger.info("Packaged Lightning Launcher with your application.") _prepare_wheel(launcher_project_path) tar_name = _copy_tar(launcher_project_path, root) tar_files.append(os.path.join(root, tar_name)) @@ -120,6 +123,7 @@ def _prepare_lightning_wheels_and_requirements(root: Path) -> Optional[Callable] # building and copying lightning-cloud wheel if installed in editable mode lightning_cloud_project_path = get_dist_path_if_editable_install("lightning_cloud") if lightning_cloud_project_path: + logger.info("Packaged Lightning Cloud with your application.") _prepare_wheel(lightning_cloud_project_path) tar_name = _copy_tar(lightning_cloud_project_path, root) tar_files.append(os.path.join(root, tar_name)) diff --git a/src/lightning_app/utilities/proxies.py b/src/lightning_app/utilities/proxies.py index ead681bff7788..c33e41bb70203 100644 --- a/src/lightning_app/utilities/proxies.py +++ b/src/lightning_app/utilities/proxies.py @@ -5,6 +5,7 @@ import sys import threading import time +import traceback import warnings from copy import deepcopy from dataclasses import dataclass @@ -398,6 +399,9 @@ def run_once(self): ) self.delta_queue.put(ComponentDelta(id=self.work_name, delta=Delta(DeepDiff(state, self.work.state)))) self.work.on_exception(e) + print("########## CAPTURED EXCEPTION ###########") + print(traceback.print_exc()) + print("########## CAPTURED EXCEPTION ###########") return # 14. 
Copy all artifacts to the shared storage so other Works can access them while this Work gets scaled down diff --git a/tests/tests_app/cli/test_cli.py b/tests/tests_app/cli/test_cli.py index 2626116990340..39d8d6b7890b6 100644 --- a/tests/tests_app/cli/test_cli.py +++ b/tests/tests_app/cli/test_cli.py @@ -5,7 +5,7 @@ from click.testing import CliRunner from lightning_cloud.openapi import Externalv1LightningappInstance -from lightning_app.cli.lightning_cli import get_app_url, login, logout, main, run +from lightning_app.cli.lightning_cli import _main, get_app_url, login, logout, run from lightning_app.runners.runtime_type import RuntimeType @@ -37,7 +37,7 @@ def test_start_target_url(runtime_type, extra_args, lightning_cloud_url, expecte assert get_app_url(runtime_type, *extra_args) == expected_url -@pytest.mark.parametrize("command", [main, run]) +@pytest.mark.parametrize("command", [_main, run]) def test_commands(command): runner = CliRunner() result = runner.invoke(command) @@ -46,12 +46,12 @@ def test_commands(command): def test_main_lightning_cli_help(): """Validate the Lightning CLI.""" - res = os.popen("python -m lightning_app --help").read() + res = os.popen("python -m lightning --help").read() assert "login " in res assert "logout " in res assert "run " in res - res = os.popen("python -m lightning_app run --help").read() + res = os.popen("python -m lightning run --help").read() assert "app " in res # hidden run commands should not appear in the help text diff --git a/tests/tests_app/components/python/test_python.py b/tests/tests_app/components/python/test_python.py index 283f449092d06..61969ef1c4c51 100644 --- a/tests/tests_app/components/python/test_python.py +++ b/tests/tests_app/components/python/test_python.py @@ -17,10 +17,9 @@ def test_non_existing_python_script(): run_work_isolated(python_script) assert not python_script.has_started - with pytest.raises(FileNotFoundError, match=match): - python_script = TracerPythonScript(match) - run_work_isolated(python_script) - assert not python_script.has_started + python_script = TracerPythonScript(match, raise_exception=False) + run_work_isolated(python_script) + assert python_script.has_failed def test_simple_python_script(): diff --git a/tests/tests_app/core/test_lightning_api.py b/tests/tests_app/core/test_lightning_api.py index 81ba6fe0ba179..9de7c63051b63 100644 --- a/tests/tests_app/core/test_lightning_api.py +++ b/tests/tests_app/core/test_lightning_api.py @@ -161,10 +161,12 @@ def test_update_publish_state_and_maybe_refresh_ui(): app = AppStageTestingApp(FlowA(), debug=True) publish_state_queue = MockQueue("publish_state_queue") + commands_metadata_queue = MockQueue("commands_metadata_queue") + commands_responses_queue = MockQueue("commands_metadata_queue") publish_state_queue.put(app.state_with_changes) - thread = UIRefresher(publish_state_queue) + thread = UIRefresher(publish_state_queue, commands_metadata_queue, commands_responses_queue) thread.run_once() assert global_app_state_store.get_app_state("1234") == app.state_with_changes @@ -190,11 +192,21 @@ def get(self, timeout: int = 0): publish_state_queue = InfiniteQueue("publish_state_queue") change_state_queue = MockQueue("change_state_queue") has_started_queue = MockQueue("has_started_queue") + commands_requests_queue = MockQueue("commands_requests_queue") + commands_responses_queue = MockQueue("commands_responses_queue") + commands_metadata_queue = MockQueue("commands_metadata_queue") state = app.state_with_changes publish_state_queue.put(state) spec = 
extract_metadata_from_app(app) ui_refresher = start_server( - publish_state_queue, change_state_queue, has_started_queue=has_started_queue, uvicorn_run=False, spec=spec + publish_state_queue, + change_state_queue, + commands_requests_queue, + commands_responses_queue, + commands_metadata_queue, + has_started_queue=has_started_queue, + uvicorn_run=False, + spec=spec, ) headers = headers_for({"type": x_lightning_type}) @@ -331,10 +343,16 @@ def test_start_server_started(): api_publish_state_queue = mp.Queue() api_delta_queue = mp.Queue() has_started_queue = mp.Queue() + commands_requests_queue = mp.Queue() + commands_responses_queue = mp.Queue() + commands_metadata_queue = mp.Queue() kwargs = dict( api_publish_state_queue=api_publish_state_queue, api_delta_queue=api_delta_queue, has_started_queue=has_started_queue, + commands_requests_queue=commands_requests_queue, + commands_responses_queue=commands_responses_queue, + commands_metadata_queue=commands_metadata_queue, port=1111, ) @@ -354,12 +372,18 @@ def test_start_server_info_message(ui_refresher, uvicorn_run, caplog, monkeypatc api_publish_state_queue = MockQueue() api_delta_queue = MockQueue() has_started_queue = MockQueue() + commands_requests_queue = MockQueue() + commands_responses_queue = MockQueue() + commands_metadata_queue = MockQueue() kwargs = dict( host=host, port=1111, api_publish_state_queue=api_publish_state_queue, api_delta_queue=api_delta_queue, has_started_queue=has_started_queue, + commands_requests_queue=commands_requests_queue, + commands_responses_queue=commands_responses_queue, + commands_metadata_queue=commands_metadata_queue, ) monkeypatch.setattr(api, "logger", logging.getLogger()) diff --git a/tests/tests_app/source_code/test_uploader.py b/tests/tests_app/source_code/test_uploader.py index 82789e83e37a9..774442291deed 100644 --- a/tests/tests_app/source_code/test_uploader.py +++ b/tests/tests_app/source_code/test_uploader.py @@ -39,10 +39,11 @@ def test_file_uploader(): @mock.patch("lightning_app.source_code.uploader.requests.Session", MockedRequestSession) def test_file_uploader_failing_when_no_etag(): response["response"] = MagicMock(headers={}) + presigned_url = "https://test-url" file_uploader = uploader.FileUploader( - presigned_url="https://test-url", source_file="test.txt", total_size=100, name="test.txt" + presigned_url=presigned_url, source_file="test.txt", total_size=100, name="test.txt" ) file_uploader.progress = MagicMock() - with pytest.raises(ValueError, match="Unexpected response from S3, response"): + with pytest.raises(ValueError, match=f"Unexpected response from {presigned_url}, response"): file_uploader.upload() diff --git a/tests/tests_app/utilities/test_commands.py b/tests/tests_app/utilities/test_commands.py new file mode 100644 index 0000000000000..1e8e36ed09545 --- /dev/null +++ b/tests/tests_app/utilities/test_commands.py @@ -0,0 +1,162 @@ +import argparse +import sys +from multiprocessing import Process +from time import sleep +from unittest.mock import MagicMock + +import pytest +import requests +from pydantic import BaseModel + +from lightning import LightningFlow +from lightning_app import LightningApp +from lightning_app.cli.lightning_cli import app_command +from lightning_app.core.constants import APP_SERVER_PORT +from lightning_app.runners import MultiProcessRuntime +from lightning_app.testing.helpers import RunIf +from lightning_app.utilities.commands.base import _command_to_method_and_metadata, _download_command, ClientCommand +from lightning_app.utilities.state import AppState + 
+ +class SweepConfig(BaseModel): + sweep_name: str + num_trials: int + + +class SweepCommand(ClientCommand): + def run(self) -> None: + print(sys.argv) + parser = argparse.ArgumentParser() + parser.add_argument("--sweep_name", type=str) + parser.add_argument("--num_trials", type=int) + hparams = parser.parse_args() + + config = SweepConfig(sweep_name=hparams.sweep_name, num_trials=hparams.num_trials) + response = self.invoke_handler(config=config) + assert response is True + + +class FlowCommands(LightningFlow): + def __init__(self): + super().__init__() + self.names = [] + self.has_sweep = False + + def run(self): + if self.has_sweep and len(self.names) == 1: + sleep(2) + self._exit() + + def trigger_method(self, name: str): + self.names.append(name) + + def sweep(self, config: SweepConfig): + self.has_sweep = True + return True + + def configure_commands(self): + return [{"user_command": self.trigger_method}, {"sweep": SweepCommand(self.sweep)}] + + +class DummyConfig(BaseModel): + something: str + something_else: int + + +class DummyCommand(ClientCommand): + def run(self, something: str, something_else: int) -> None: + config = DummyConfig(something=something, something_else=something_else) + response = self.invoke_handler(config=config) + assert response == {"body": 0} + + +def run(config: DummyConfig): + assert isinstance(config, DummyCommand) + + +def run_failure_0(name: str): + pass + + +def run_failure_1(name): + pass + + +class CustomModel(BaseModel): + pass + + +def run_failure_2(name: CustomModel): + pass + + +@RunIf(skip_windows=True) +def test_command_to_method_and_metadata(): + with pytest.raises(Exception, match="The provided annotation for the argument name"): + _command_to_method_and_metadata(ClientCommand(run_failure_0)) + + with pytest.raises(Exception, match="annotate your method"): + _command_to_method_and_metadata(ClientCommand(run_failure_1)) + + with pytest.raises(Exception, match="lightning_app/utilities/commands/base.py"): + _command_to_method_and_metadata(ClientCommand(run_failure_2)) + + +def test_client_commands(monkeypatch): + import requests + + resp = MagicMock() + resp.status_code = 200 + value = {"body": 0} + resp.json = MagicMock(return_value=value) + post = MagicMock() + post.return_value = resp + monkeypatch.setattr(requests, "post", post) + url = "http//" + kwargs = {"something": "1", "something_else": "1"} + command = DummyCommand(run) + _, command_metadata = _command_to_method_and_metadata(command) + command_metadata.update( + { + "command": "dummy", + "affiliation": "root", + "is_client_command": True, + "owner": "root", + } + ) + client_command, models = _download_command(command_metadata, None) + client_command._setup(metadata=command_metadata, models=models, app_url=url) + client_command.run(**kwargs) + + +def target(): + app = LightningApp(FlowCommands()) + MultiProcessRuntime(app).dispatch() + + +def test_configure_commands(monkeypatch): + process = Process(target=target) + process.start() + time_left = 15 + while time_left > 0: + try: + requests.get(f"http://localhost:{APP_SERVER_PORT}/healthz") + break + except requests.exceptions.ConnectionError: + sleep(0.1) + time_left -= 0.1 + + sleep(0.5) + monkeypatch.setattr(sys, "argv", ["lightning", "user_command", "--name=something"]) + app_command() + sleep(0.5) + state = AppState() + state._request_state() + assert state.names == ["something"] + monkeypatch.setattr(sys, "argv", ["lightning", "sweep", "--sweep_name", "my_name", "--num_trials", "1"]) + app_command() + time_left = 15 + while 
time_left > 0 or process.exitcode is None: + sleep(0.1) + time_left -= 0.1 + assert process.exitcode == 0 diff --git a/tests/tests_app/utilities/test_state.py b/tests/tests_app/utilities/test_state.py index e275817f680fc..0740ffc615b87 100644 --- a/tests/tests_app/utilities/test_state.py +++ b/tests/tests_app/utilities/test_state.py @@ -15,7 +15,7 @@ def test_app_state_not_connected(_): """Test an error message when a disconnected AppState tries to access attributes.""" - state = AppState() + state = AppState(port=8000) with pytest.raises(AttributeError, match="Failed to connect and fetch the app state"): _ = state.value with pytest.raises(AttributeError, match="Failed to connect and fetch the app state"): @@ -209,7 +209,7 @@ def test_attach_plugin(): @mock.patch("lightning_app.utilities.state._configure_session", return_value=requests) def test_app_state_connection_error(_): """Test an error message when a connection to retrieve the state can't be established.""" - app_state = AppState() + app_state = AppState(port=8000) with pytest.raises(AttributeError, match=r"Failed to connect and fetch the app state\. Is the app running?"): app_state._request_state() diff --git a/tests/tests_app_examples/test_commands.py b/tests/tests_app_examples/test_commands.py new file mode 100644 index 0000000000000..5116b1b9d54bb --- /dev/null +++ b/tests/tests_app_examples/test_commands.py @@ -0,0 +1,31 @@ +import os +from subprocess import Popen +from time import sleep +from unittest import mock + +import pytest +from tests_app import _PROJECT_ROOT + +from lightning_app.testing.testing import run_app_in_cloud + + +@mock.patch.dict(os.environ, {"SKIP_LIGHTING_UTILITY_WHEELS_BUILD": "0"}) +@pytest.mark.cloud +def test_commands_example_cloud() -> None: + with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "examples/app_commands")) as ( + admin_page, + _, + fetch_logs, + ): + app_id = admin_page.url.split("/")[-1] + cmd = f"lightning trigger_with_client_command --name=something --app_id {app_id}" + Popen(cmd, shell=True).wait() + cmd = f"lightning trigger_without_client_command --name=else --app_id {app_id}" + Popen(cmd, shell=True).wait() + + has_logs = False + while not has_logs: + for log in fetch_logs(): + if "['something', 'else']" in log: + has_logs = True + sleep(1) From 85b0356ea22242305bb55a8347ed5733cd806b0d Mon Sep 17 00:00:00 2001 From: Krishna Kalyan Date: Tue, 26 Jul 2022 09:38:06 +0200 Subject: [PATCH 008/230] Fix mypy errors attributed to `pytorch_lightning.core.mixins.device_dtype_mixin` (#13704) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- pyproject.toml | 1 - .../core/mixins/device_dtype_mixin.py | 27 +++++++------------ 2 files changed, 9 insertions(+), 19 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 177410cba79a6..5a710faf3544b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,6 @@ module = [ "pytorch_lightning.callbacks.stochastic_weight_avg", "pytorch_lightning.core.datamodule", "pytorch_lightning.core.decorators", - "pytorch_lightning.core.mixins.device_dtype_mixin", "pytorch_lightning.core.module", "pytorch_lightning.core.saving", "pytorch_lightning.demos.boring_classes", diff --git a/src/pytorch_lightning/core/mixins/device_dtype_mixin.py b/src/pytorch_lightning/core/mixins/device_dtype_mixin.py index 5f6397e4562e5..b12e1cf042a1f 100644 --- a/src/pytorch_lightning/core/mixins/device_dtype_mixin.py +++ b/src/pytorch_lightning/core/mixins/device_dtype_mixin.py @@ -16,16 +16,7 @@ 
import torch from torch.nn import Module - -try: - from typing_extensions import Self -except ImportError: - # workaround for Python 3.7. - # see https://www.python.org/dev/peps/pep-0673/ - from typing import TypeVar - - Self = TypeVar("TDeviceDtypeModuleMixin", bound="DeviceDtypeModuleMixin") - +from typing_extensions import Self import pytorch_lightning as pl @@ -57,7 +48,7 @@ def device(self) -> Union[str, torch.device]: return device - def to(self, *args: Any, **kwargs: Any) -> Self: + def to(self, *args: Any, **kwargs: Any) -> Self: # type: ignore[valid-type] """Moves and/or casts the parameters and buffers. This can be called as @@ -121,7 +112,7 @@ def to(self, *args: Any, **kwargs: Any) -> Self: self.__update_properties(device=out[0], dtype=out[1]) return super().to(*args, **kwargs) - def cuda(self, device: Optional[Union[torch.device, int]] = None) -> Self: + def cuda(self, device: Optional[Union[torch.device, int]] = None) -> Self: # type: ignore[valid-type] """Moves all model parameters and buffers to the GPU. This also makes associated parameters and buffers different objects. So it should be called before constructing optimizer if the module will live on GPU while being optimized. @@ -134,11 +125,11 @@ def cuda(self, device: Optional[Union[torch.device, int]] = None) -> Self: Module: self """ if device is None or isinstance(device, int): - device = torch.device("cuda", index=device) + device = torch.device("cuda", index=(device or 0)) self.__update_properties(device=device) return super().cuda(device=device) - def cpu(self) -> Self: + def cpu(self) -> Self: # type: ignore[valid-type] """Moves all model parameters and buffers to the CPU. Returns: @@ -147,7 +138,7 @@ def cpu(self) -> Self: self.__update_properties(device=torch.device("cpu")) return super().cpu() - def type(self, dst_type: Union[str, torch.dtype]) -> Self: + def type(self, dst_type: Union[str, torch.dtype]) -> Self: # type: ignore[valid-type] """Casts all parameters and buffers to :attr:`dst_type`. Arguments: @@ -159,7 +150,7 @@ def type(self, dst_type: Union[str, torch.dtype]) -> Self: self.__update_properties(dtype=dst_type) return super().type(dst_type=dst_type) - def float(self) -> Self: + def float(self) -> Self: # type: ignore[valid-type] """Casts all floating point parameters and buffers to ``float`` datatype. Returns: @@ -168,7 +159,7 @@ def float(self) -> Self: self.__update_properties(dtype=torch.float) return super().float() - def double(self) -> Self: + def double(self) -> Self: # type: ignore[valid-type] """Casts all floating point parameters and buffers to ``double`` datatype. Returns: @@ -177,7 +168,7 @@ def double(self) -> Self: self.__update_properties(dtype=torch.double) return super().double() - def half(self) -> Self: + def half(self) -> Self: # type: ignore[valid-type] """Casts all floating point parameters and buffers to ``half`` datatype. 
Returns: From e77accfdb1d3b6bb35c3bc569271dc2bd74b5dcb Mon Sep 17 00:00:00 2001 From: donlapark <10988155+donlapark@users.noreply.github.com> Date: Tue, 26 Jul 2022 17:46:17 +0700 Subject: [PATCH 009/230] fixes typing in pytorch_lightning/callbacks/stochastic_weight_avg.py (#13685) Co-authored-by: otaj <6065855+otaj@users.noreply.github.com> --- pyproject.toml | 1 - .../callbacks/stochastic_weight_avg.py | 67 ++++++++++--------- 2 files changed, 37 insertions(+), 31 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5a710faf3544b..8c64c4452ca15 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,7 +49,6 @@ warn_no_return = "False" module = [ "pytorch_lightning.callbacks.progress.rich_progress", "pytorch_lightning.callbacks.quantization", - "pytorch_lightning.callbacks.stochastic_weight_avg", "pytorch_lightning.core.datamodule", "pytorch_lightning.core.decorators", "pytorch_lightning.core.module", diff --git a/src/pytorch_lightning/callbacks/stochastic_weight_avg.py b/src/pytorch_lightning/callbacks/stochastic_weight_avg.py index 83fb1cf169794..093c8e47d07dd 100644 --- a/src/pytorch_lightning/callbacks/stochastic_weight_avg.py +++ b/src/pytorch_lightning/callbacks/stochastic_weight_avg.py @@ -16,19 +16,19 @@ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ """ from copy import deepcopy -from typing import Callable, List, Optional, Union +from typing import Any, Callable, cast, List, Optional, Union import torch -from torch import FloatTensor, nn, Tensor +from torch import nn, Tensor from torch.optim.swa_utils import SWALR import pytorch_lightning as pl from pytorch_lightning.callbacks.callback import Callback from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.rank_zero import rank_zero_info, rank_zero_warn -from pytorch_lightning.utilities.types import LRSchedulerConfig +from pytorch_lightning.utilities.types import _LRScheduler, LRSchedulerConfig -_AVG_FN = Callable[[Tensor, Tensor, torch.LongTensor], FloatTensor] +_AVG_FN = Callable[[Tensor, Tensor, Tensor], Tensor] class StochasticWeightAveraging(Callback): @@ -106,7 +106,7 @@ def __init__( if wrong_type or wrong_float or wrong_list: raise MisconfigurationException("The `swa_lrs` should a positive float, or a list of positive floats") - if avg_fn is not None and not isinstance(avg_fn, Callable): + if avg_fn is not None and not callable(avg_fn): raise MisconfigurationException("The `avg_fn` should be callable.") if device is not None and not isinstance(device, (torch.device, str)): @@ -118,11 +118,13 @@ def __init__( self._annealing_strategy = annealing_strategy self._avg_fn = avg_fn or self.avg_fn self._device = device - self._model_contains_batch_norm = None - self._average_model = None + self._max_epochs: int + self._model_contains_batch_norm: bool + self._average_model: "pl.LightningModule" @property def swa_start(self) -> int: + assert isinstance(self._swa_epoch_start, int) return max(self._swa_epoch_start - 1, 0) # 0-based @property @@ -130,7 +132,7 @@ def swa_end(self) -> int: return self._max_epochs - 1 # 0-based @staticmethod - def pl_module_contains_batch_norm(pl_module: "pl.LightningModule"): + def pl_module_contains_batch_norm(pl_module: "pl.LightningModule") -> bool: return any(isinstance(module, nn.modules.batchnorm._BatchNorm) for module in pl_module.modules()) def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", stage: Optional[str] = None) -> None: @@ -138,7 +140,7 @@ def setup(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", 
stage: O with pl_module._prevent_trainer_and_dataloaders_deepcopy(): self._average_model = deepcopy(pl_module) - def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule"): + def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: if len(trainer.optimizers) != 1: raise MisconfigurationException("SWA currently works with 1 `optimizer`.") @@ -155,7 +157,7 @@ def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule"): # virtually increase max_epochs to perform batch norm update on latest epoch. trainer.fit_loop.max_epochs += 1 - def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule"): + def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: if trainer.current_epoch == self.swa_start: # move average model to request device. self._average_model = self._average_model.to(self._device or pl_module.device) @@ -167,12 +169,15 @@ def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningMo for lr, group in zip(self._swa_lrs, optimizer.param_groups): group["initial_lr"] = lr - self._swa_scheduler = SWALR( - optimizer, - swa_lr=self._swa_lrs, - anneal_epochs=self._annealing_epochs, - anneal_strategy=self._annealing_strategy, - last_epoch=trainer.max_epochs if self._annealing_strategy == "cos" else -1, + self._swa_scheduler: _LRScheduler = cast( + _LRScheduler, + SWALR( + optimizer, + swa_lr=self._swa_lrs, # type: ignore[arg-type] + anneal_epochs=self._annealing_epochs, + anneal_strategy=self._annealing_strategy, + last_epoch=trainer.max_epochs if self._annealing_strategy == "cos" else -1, + ), ) # We assert that there is only one optimizer on fit start, so know opt_idx is always 0 default_scheduler_cfg = LRSchedulerConfig(self._swa_scheduler, opt_idx=0) @@ -213,10 +218,10 @@ def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningMo trainer.accumulate_grad_batches = trainer.num_training_batches - def on_train_epoch_end(self, trainer: "pl.Trainer", *args): + def on_train_epoch_end(self, trainer: "pl.Trainer", *args: Any) -> None: trainer.fit_loop._skip_backward = False - def on_train_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule"): + def on_train_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: # the trainer increases the current epoch before this hook is called if self._model_contains_batch_norm and trainer.current_epoch - 1 == self.swa_end + 1: # BatchNorm epoch update. 
Reset state @@ -229,35 +234,39 @@ def on_train_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule"): self.transfer_weights(self._average_model, pl_module) @staticmethod - def transfer_weights(src_pl_module: "pl.LightningModule", dst_pl_module: "pl.LightningModule"): + def transfer_weights(src_pl_module: "pl.LightningModule", dst_pl_module: "pl.LightningModule") -> None: for src_param, dst_param in zip(src_pl_module.parameters(), dst_pl_module.parameters()): dst_param.detach().copy_(src_param.to(dst_param.device)) - def reset_batch_norm_and_save_state(self, pl_module: "pl.LightningModule"): + def reset_batch_norm_and_save_state(self, pl_module: "pl.LightningModule") -> None: """Adapted from https://github.com/pytorch/pytorch/blob/v1.7.1/torch/optim/swa_utils.py#L140-L154.""" self.momenta = {} for module in pl_module.modules(): if not isinstance(module, nn.modules.batchnorm._BatchNorm): continue module.running_mean = torch.zeros_like( - module.running_mean, device=pl_module.device, dtype=module.running_mean.dtype + module.running_mean, # type: ignore[arg-type] + device=pl_module.device, + dtype=module.running_mean.dtype, # type: ignore[union-attr] ) module.running_var = torch.ones_like( - module.running_var, device=pl_module.device, dtype=module.running_var.dtype + module.running_var, # type: ignore[arg-type] + device=pl_module.device, + dtype=module.running_var.dtype, # type: ignore[union-attr] ) self.momenta[module] = module.momentum - module.momentum = None - module.num_batches_tracked *= 0 + module.momentum = None # type: ignore[assignment] + module.num_batches_tracked *= 0 # type: ignore[assignment, operator] - def reset_momenta(self): + def reset_momenta(self) -> None: """Adapted from https://github.com/pytorch/pytorch/blob/v1.7.1/torch/optim/swa_utils.py#L164-L165.""" for bn_module in self.momenta: bn_module.momentum = self.momenta[bn_module] @staticmethod def update_parameters( - average_model: "pl.LightningModule", model: "pl.LightningModule", n_averaged: torch.LongTensor, avg_fn: _AVG_FN - ): + average_model: "pl.LightningModule", model: "pl.LightningModule", n_averaged: Tensor, avg_fn: _AVG_FN + ) -> None: """Adapted from https://github.com/pytorch/pytorch/blob/v1.7.1/torch/optim/swa_utils.py#L104-L112.""" for p_swa, p_model in zip(average_model.parameters(), model.parameters()): device = p_swa.device @@ -268,8 +277,6 @@ def update_parameters( n_averaged += 1 @staticmethod - def avg_fn( - averaged_model_parameter: Tensor, model_parameter: Tensor, num_averaged: torch.LongTensor - ) -> FloatTensor: + def avg_fn(averaged_model_parameter: Tensor, model_parameter: Tensor, num_averaged: Tensor) -> Tensor: """Adapted from https://github.com/pytorch/pytorch/blob/v1.7.1/torch/optim/swa_utils.py#L95-L97.""" return averaged_model_parameter + (model_parameter - averaged_model_parameter) / (num_averaged + 1) From 22b88b9a6488d3557475d3a3a44410381defab12 Mon Sep 17 00:00:00 2001 From: Naiyarah Hussain Date: Tue, 26 Jul 2022 16:17:56 +0400 Subject: [PATCH 010/230] Added new email to enforce the Code of Conduct (#13833) --- .github/CODE_OF_CONDUCT.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/CODE_OF_CONDUCT.md b/.github/CODE_OF_CONDUCT.md index 94333b0479e06..6a97713b6d6f4 100644 --- a/.github/CODE_OF_CONDUCT.md +++ b/.github/CODE_OF_CONDUCT.md @@ -55,7 +55,7 @@ further defined and clarified by project maintainers. 
## Enforcement Instances of abusive, harassing, or otherwise unacceptable behavior may be -reported by contacting the project team at waf2107@columbia.edu. All +reported by contacting the project team at community@lightning.ai. All complaints will be reviewed and investigated and will result in a response that is deemed necessary and appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. From 6a9d626e189038f09a0cf7157f61e75300ea99cc Mon Sep 17 00:00:00 2001 From: Cyprien Ricque <48893621+Cyprien-Ricque@users.noreply.github.com> Date: Tue, 26 Jul 2022 14:42:58 +0200 Subject: [PATCH 011/230] fix mypy typing errors in pytorch_lightning/strategies/strategy.py (#13519) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli Co-authored-by: otaj --- pyproject.toml | 1 - .../plugins/precision/apex_amp.py | 2 +- .../plugins/precision/deepspeed.py | 2 +- .../plugins/precision/ipu.py | 4 +- .../plugins/precision/native_amp.py | 2 +- .../plugins/precision/precision_plugin.py | 4 +- .../plugins/precision/tpu.py | 4 +- src/pytorch_lightning/strategies/bagua.py | 1 + .../strategies/fully_sharded_native.py | 2 + src/pytorch_lightning/strategies/strategy.py | 55 +++++++++++++------ src/pytorch_lightning/utilities/types.py | 36 ++++++++++++ 11 files changed, 85 insertions(+), 28 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8c64c4452ca15..6d973aa0dde51 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,7 +66,6 @@ module = [ "pytorch_lightning.strategies.ipu", "pytorch_lightning.strategies.sharded", "pytorch_lightning.strategies.sharded_spawn", - "pytorch_lightning.strategies.strategy", "pytorch_lightning.strategies.tpu_spawn", "pytorch_lightning.trainer.callback_hook", "pytorch_lightning.trainer.connectors.callback_connector", diff --git a/src/pytorch_lightning/plugins/precision/apex_amp.py b/src/pytorch_lightning/plugins/precision/apex_amp.py index c329aedcf6f00..15825dedd2ef6 100644 --- a/src/pytorch_lightning/plugins/precision/apex_amp.py +++ b/src/pytorch_lightning/plugins/precision/apex_amp.py @@ -75,7 +75,7 @@ def backward( def optimizer_step( self, - model: Union["pl.LightningModule", Module], + model: Optional[Union["pl.LightningModule", Module]], optimizer: Optimizer, optimizer_idx: int, closure: Callable[[], Any], diff --git a/src/pytorch_lightning/plugins/precision/deepspeed.py b/src/pytorch_lightning/plugins/precision/deepspeed.py index e3c2afe1ad337..4cc12de400ef4 100644 --- a/src/pytorch_lightning/plugins/precision/deepspeed.py +++ b/src/pytorch_lightning/plugins/precision/deepspeed.py @@ -88,7 +88,7 @@ def _run_backward( def optimizer_step( self, - model: Union["pl.LightningModule", Module], + model: Optional[Union["pl.LightningModule", Module]], optimizer: Optimizer, optimizer_idx: int, closure: Callable[[], Any], diff --git a/src/pytorch_lightning/plugins/precision/ipu.py b/src/pytorch_lightning/plugins/precision/ipu.py index a299be9a730a5..329a8b8978e50 100644 --- a/src/pytorch_lightning/plugins/precision/ipu.py +++ b/src/pytorch_lightning/plugins/precision/ipu.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Any, Callable, Union +from typing import Any, Callable, Optional, Union from torch.nn import Module from torch.optim import LBFGS, Optimizer @@ -53,7 +53,7 @@ def backward(self, model: "pl.LightningModule", *args: Any, **kwargs: Any) -> No def optimizer_step( self, - model: Union["pl.LightningModule", Module], + model: Optional[Union["pl.LightningModule", Module]], optimizer: Optimizer, optimizer_idx: int, closure: Callable[[], Any], diff --git a/src/pytorch_lightning/plugins/precision/native_amp.py b/src/pytorch_lightning/plugins/precision/native_amp.py index 07f8c03babd55..4df1b166ca8dd 100644 --- a/src/pytorch_lightning/plugins/precision/native_amp.py +++ b/src/pytorch_lightning/plugins/precision/native_amp.py @@ -69,7 +69,7 @@ def _run_backward(self, tensor: Tensor, model: Optional[Module], *args: Any, **k def optimizer_step( self, - model: Union["pl.LightningModule", Module], + model: Optional[Union["pl.LightningModule", Module]], optimizer: Optimizer, optimizer_idx: int, closure: Callable[[], Any], diff --git a/src/pytorch_lightning/plugins/precision/precision_plugin.py b/src/pytorch_lightning/plugins/precision/precision_plugin.py index a7b5bcb4e9f35..cbf18b8c4fa41 100644 --- a/src/pytorch_lightning/plugins/precision/precision_plugin.py +++ b/src/pytorch_lightning/plugins/precision/precision_plugin.py @@ -101,7 +101,7 @@ def _run_backward(self, tensor: Tensor, model: Optional[Module], *args: Any, **k tensor.backward(*args, **kwargs) def _after_closure( - self, model: Union["pl.LightningModule", Module], optimizer: Optimizer, optimizer_idx: int + self, model: Optional[Union["pl.LightningModule", Module]], optimizer: Optimizer, optimizer_idx: int ) -> None: """Utility to share some code after the closure has been run.""" if not isinstance(model, pl.LightningModule): @@ -140,7 +140,7 @@ def _wrap_closure( def optimizer_step( self, - model: Union["pl.LightningModule", Module], + model: Optional[Union["pl.LightningModule", Module]], optimizer: Optimizer, optimizer_idx: int, closure: Callable[[], Any], diff --git a/src/pytorch_lightning/plugins/precision/tpu.py b/src/pytorch_lightning/plugins/precision/tpu.py index a0ed9de0a4239..b393492a168bb 100644 --- a/src/pytorch_lightning/plugins/precision/tpu.py +++ b/src/pytorch_lightning/plugins/precision/tpu.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from functools import partial -from typing import Any, Callable, Union +from typing import Any, Callable, Optional, Union from torch.nn import Module from torch.optim import Optimizer @@ -31,7 +31,7 @@ class TPUPrecisionPlugin(PrecisionPlugin): def optimizer_step( self, - model: Union["pl.LightningModule", Module], + model: Optional[Union["pl.LightningModule", Module]], optimizer: Optimizer, optimizer_idx: int, closure: Callable[[], Any], diff --git a/src/pytorch_lightning/strategies/bagua.py b/src/pytorch_lightning/strategies/bagua.py index 35faa8ececfcd..d100d1aa97adc 100644 --- a/src/pytorch_lightning/strategies/bagua.py +++ b/src/pytorch_lightning/strategies/bagua.py @@ -157,6 +157,7 @@ def setup(self, trainer: "pl.Trainer") -> None: if self._should_run_deadlock_detection(): self._share_information_to_prevent_deadlock() + assert self.accelerator is not None self.accelerator.setup(trainer) # move the model to the correct device diff --git a/src/pytorch_lightning/strategies/fully_sharded_native.py b/src/pytorch_lightning/strategies/fully_sharded_native.py index 553f010763edb..4c351f26fa3b9 100644 --- a/src/pytorch_lightning/strategies/fully_sharded_native.py +++ b/src/pytorch_lightning/strategies/fully_sharded_native.py @@ -201,6 +201,7 @@ def _configure_launcher(self) -> None: self._rank_0_will_call_children_scripts = True def setup(self, trainer: "pl.Trainer") -> None: + assert self.accelerator is not None self.accelerator.setup(trainer) # share ddp pids to all processes self._rank_0_will_call_children_scripts = self.broadcast(self._rank_0_will_call_children_scripts) @@ -290,6 +291,7 @@ def teardown(self) -> None: self.model = self._layer_sync.revert(self.model) assert self.cluster_environment is not None + assert self.accelerator is not None self.cluster_environment.teardown() self.precision_plugin.teardown() self.accelerator.teardown() diff --git a/src/pytorch_lightning/strategies/strategy.py b/src/pytorch_lightning/strategies/strategy.py index 01b8f1b793791..3d45c61abb1c1 100644 --- a/src/pytorch_lightning/strategies/strategy.py +++ b/src/pytorch_lightning/strategies/strategy.py @@ -33,7 +33,15 @@ from pytorch_lightning.utilities.apply_func import move_data_to_device from pytorch_lightning.utilities.distributed import ReduceOp from pytorch_lightning.utilities.optimizer import optimizer_to_device, optimizers_to_device -from pytorch_lightning.utilities.types import _PATH, LRSchedulerConfig, STEP_OUTPUT +from pytorch_lightning.utilities.types import ( + _PATH, + LRSchedulerConfig, + PredictStep, + STEP_OUTPUT, + TestStep, + TrainingStep, + ValidationStep, +) TBroadcast = TypeVar("TBroadcast") TReduce = TypeVar("TReduce") @@ -50,11 +58,11 @@ def __init__( checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None, ) -> None: - self.accelerator = accelerator + self._accelerator: Optional["pl.accelerators.accelerator.Accelerator"] = accelerator + self._checkpoint_io: Optional[CheckpointIO] = checkpoint_io + self._precision_plugin: Optional[PrecisionPlugin] = precision_plugin self._launcher: Optional[_Launcher] = None self._model: Optional[Module] = None - self._checkpoint_io: Optional[CheckpointIO] = checkpoint_io - self.precision_plugin = precision_plugin self._optimizers: List[Optimizer] = [] self._lightning_optimizers: Dict[int, LightningOptimizer] = {} self.lr_scheduler_configs: List[LRSchedulerConfig] = [] @@ -65,7 +73,7 @@ def launcher(self) -> Optional[_Launcher]: return self._launcher @property - def accelerator(self) -> 
"pl.accelerators.accelerator.Accelerator": + def accelerator(self) -> Optional["pl.accelerators.accelerator.Accelerator"]: return self._accelerator @accelerator.setter @@ -106,7 +114,7 @@ def connect(self, model: Module) -> None: """Called by the accelerator to connect the accelerator and the model with this plugin.""" self.model = model - def _configure_launcher(self): + def _configure_launcher(self) -> None: """Attach the launcher based on Strategy.""" def setup_environment(self) -> None: @@ -115,6 +123,7 @@ def setup_environment(self) -> None: This is called before the LightningModule/DataModule setup hook which allows the user to access the accelerator environment before setup is complete. """ + assert self.accelerator is not None self.accelerator.setup_environment(self.root_device) def setup_optimizers(self, trainer: "pl.Trainer") -> None: @@ -125,6 +134,7 @@ def setup_optimizers(self, trainer: "pl.Trainer") -> None: """ if trainer.state.fn not in (TrainerFn.FITTING, TrainerFn.TUNING): return + assert self.lightning_module is not None self.optimizers, self.lr_scheduler_configs, self.optimizer_frequencies = _init_optimizers_and_lr_schedulers( self.lightning_module ) @@ -135,6 +145,7 @@ def setup(self, trainer: "pl.Trainer") -> None: Args: trainer: the trainer instance """ + assert self.accelerator is not None self.accelerator.setup(trainer) self.setup_optimizers(trainer) self.setup_precision_plugin() @@ -142,6 +153,7 @@ def setup(self, trainer: "pl.Trainer") -> None: def setup_precision_plugin(self) -> None: """Attaches the precision plugin to the accelerator.""" + assert self.model is not None model, optimizers, lr_scheduler_configs = self.precision_plugin.connect( self.model, self.optimizers, self.lr_scheduler_configs ) @@ -163,6 +175,7 @@ def backward(self, closure_loss: Tensor, *args: Any, **kwargs: Any) -> Tensor: closure_loss: a tensor holding the loss value to backpropagate """ self.pre_backward(closure_loss) + assert self.lightning_module is not None closure_loss = self.precision_plugin.pre_backward(self.lightning_module, closure_loss) self.precision_plugin.backward(self.lightning_module, closure_loss, *args, **kwargs) @@ -316,6 +329,7 @@ def load_checkpoint(self, checkpoint_path: _PATH) -> Dict[str, Any]: return self.checkpoint_io.load_checkpoint(checkpoint_path) def load_model_state_dict(self, checkpoint: Mapping[str, Any]) -> None: + assert self.lightning_module is not None self.lightning_module.load_state_dict(checkpoint["state_dict"]) def load_optimizer_state_dict(self, checkpoint: Mapping[str, Any]) -> None: @@ -324,48 +338,52 @@ def load_optimizer_state_dict(self, checkpoint: Mapping[str, Any]) -> None: optimizer.load_state_dict(opt_state) optimizer_to_device(optimizer, self.root_device) - def training_step(self, *args, **kwargs) -> STEP_OUTPUT: + def training_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: """The actual training step. See :meth:`~pytorch_lightning.core.module.LightningModule.training_step` for more details """ with self.precision_plugin.train_step_context(): + assert isinstance(self.model, TrainingStep) return self.model.training_step(*args, **kwargs) - def post_training_step(self): + def post_training_step(self) -> None: pass - def validation_step(self, *args, **kwargs) -> Optional[STEP_OUTPUT]: + def validation_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: """The actual validation step. 
See :meth:`~pytorch_lightning.core.module.LightningModule.validation_step` for more details """ with self.precision_plugin.val_step_context(): + assert isinstance(self.model, ValidationStep) return self.model.validation_step(*args, **kwargs) - def test_step(self, *args, **kwargs) -> Optional[STEP_OUTPUT]: + def test_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: """The actual test step. See :meth:`~pytorch_lightning.core.module.LightningModule.test_step` for more details """ with self.precision_plugin.test_step_context(): + assert isinstance(self.model, TestStep) return self.model.test_step(*args, **kwargs) - def predict_step(self, *args, **kwargs) -> STEP_OUTPUT: + def predict_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: """The actual predict step. See :meth:`~pytorch_lightning.core.module.LightningModule.predict_step` for more details """ with self.precision_plugin.predict_step_context(): + assert isinstance(self.model, PredictStep) return self.model.predict_step(*args, **kwargs) - def training_step_end(self, output): + def training_step_end(self, output: STEP_OUTPUT) -> STEP_OUTPUT: return output - def validation_step_end(self, output): + def validation_step_end(self, output: STEP_OUTPUT) -> STEP_OUTPUT: return output - def test_step_end(self, output): + def test_step_end(self, output: STEP_OUTPUT) -> STEP_OUTPUT: return output def process_dataloader(self, dataloader: DataLoader) -> DataLoader: @@ -401,8 +419,8 @@ def handles_gradient_accumulation(self) -> bool: def lightning_module_state_dict(self) -> Dict[str, Union[Any, Tensor]]: """Returns model state.""" - model = self.lightning_module - return model.state_dict() + assert self.lightning_module is not None + return self.lightning_module.state_dict() def save_checkpoint( self, checkpoint: Dict[str, Any], filepath: _PATH, storage_options: Optional[Any] = None @@ -447,10 +465,11 @@ def teardown(self) -> None: log.detail(f"{self.__class__.__name__}: moving model to CPU") self.lightning_module.cpu() self.precision_plugin.teardown() + assert self.accelerator is not None self.accelerator.teardown() @classmethod - def register_strategies(cls, strategy_registry) -> None: + def register_strategies(cls, strategy_registry: Dict[str, Any]) -> None: pass def on_train_start(self) -> None: @@ -481,7 +500,7 @@ def on_test_end(self) -> None: """Called when test end.""" pass - def on_predict_end(self): + def on_predict_end(self) -> None: """Called when predict ends.""" pass diff --git a/src/pytorch_lightning/utilities/types.py b/src/pytorch_lightning/utilities/types.py index 0fd804cb986ef..f6c14d366805f 100644 --- a/src/pytorch_lightning/utilities/types.py +++ b/src/pytorch_lightning/utilities/types.py @@ -51,6 +51,42 @@ _DEVICE = Union[torch.device, str, int] +@runtime_checkable +class TrainingStep(Protocol): + """This class is used to detect if an object implements the `training_step` hook using `isinstance(model, + TrainingStep)`.""" + + def training_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: + ... + + +@runtime_checkable +class ValidationStep(Protocol): + """This class is used to detect if an object implements the `validation_step` hook using `isinstance(model, + ValidationStep)`.""" + + def validation_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: + ... 
+ + +@runtime_checkable +class TestStep(Protocol): + """This class is used to detect if an object implements the `test_step` hook using `isinstance(model, + TestStep)`.""" + + def test_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: + ... + + +@runtime_checkable +class PredictStep(Protocol): + """This class is used to detect if an object implements the `predict_step` hook using `isinstance(model, + PredictStep)`.""" + + def predict_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: + ... + + @runtime_checkable class _Stateful(Protocol): """This class is used to detect if an object is stateful using `isinstance(obj, _Stateful)`.""" From 9c720c8adf9ea4fff1e5cf9a6d786ce747e0c76f Mon Sep 17 00:00:00 2001 From: Amin Setayesh Date: Tue, 26 Jul 2022 18:13:36 +0430 Subject: [PATCH 012/230] Fix wrong error message in ModelPruning (#13820) --- src/pytorch_lightning/callbacks/pruning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/callbacks/pruning.py b/src/pytorch_lightning/callbacks/pruning.py index 37e8caa32b669..63516028b129f 100644 --- a/src/pytorch_lightning/callbacks/pruning.py +++ b/src/pytorch_lightning/callbacks/pruning.py @@ -467,7 +467,7 @@ def sanitize_parameters_to_prune( if missing_modules or missing_parameters: raise MisconfigurationException( - "Some provided `parameters_to_tune` don't exist in the model." + "Some provided `parameters_to_prune` don't exist in the model." f" Found missing modules: {missing_modules} and missing parameters: {missing_parameters}" ) else: From faf7ff57c0937ea8f1d77f6492200c46d5ded23e Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Tue, 26 Jul 2022 21:13:19 +0530 Subject: [PATCH 013/230] Add support for async checkpointing (#13658) --- docs/source-pytorch/api_references.rst | 1 + .../common/checkpointing_expert.rst | 37 +++++++++ docs/source-pytorch/extensions/plugins.rst | 1 + src/pytorch_lightning/CHANGELOG.md | 3 + src/pytorch_lightning/plugins/__init__.py | 2 + src/pytorch_lightning/plugins/io/__init__.py | 11 ++- .../plugins/io/async_plugin.py | 62 +++++++++++++++ .../plugins/io/checkpoint_plugin.py | 8 +- .../plugins/io/torch_plugin.py | 2 +- src/pytorch_lightning/plugins/io/wrapper.py | 66 ++++++++++++++++ .../strategies/hpu_parallel.py | 4 + .../strategies/single_hpu.py | 4 + .../strategies/single_tpu.py | 4 + src/pytorch_lightning/strategies/strategy.py | 4 + src/pytorch_lightning/strategies/tpu_spawn.py | 4 + src/pytorch_lightning/utilities/cloud_io.py | 1 - .../plugins/test_checkpoint_io_plugin.py | 78 ++++++++++++++++++- 17 files changed, 282 insertions(+), 10 deletions(-) create mode 100644 src/pytorch_lightning/plugins/io/async_plugin.py create mode 100644 src/pytorch_lightning/plugins/io/wrapper.py diff --git a/docs/source-pytorch/api_references.rst b/docs/source-pytorch/api_references.rst index 96a061a941b57..db4fc1e2c4cf8 100644 --- a/docs/source-pytorch/api_references.rst +++ b/docs/source-pytorch/api_references.rst @@ -210,6 +210,7 @@ io :nosignatures: :template: classtemplate.rst + AsyncCheckpointIO CheckpointIO HPUCheckpointIO TorchCheckpointIO diff --git a/docs/source-pytorch/common/checkpointing_expert.rst b/docs/source-pytorch/common/checkpointing_expert.rst index c4a948a34cb9d..665acfeef548f 100644 --- a/docs/source-pytorch/common/checkpointing_expert.rst +++ b/docs/source-pytorch/common/checkpointing_expert.rst @@ -45,6 +45,10 @@ Built-in Checkpoint IO Plugins respectively, common for most use cases. 
* - :class:`~pytorch_lightning.plugins.io.XLACheckpointIO` - CheckpointIO that utilizes :func:`xm.save` to save checkpoints for TPU training strategies. + * - :class:`~pytorch_lightning.plugins.io.HPUCheckpointIO` + - CheckpointIO to save checkpoints for HPU training strategies. + * - :class:`~pytorch_lightning.plugins.io.AsyncCheckpointIO` + - ``AsyncCheckpointIO`` enables saving the checkpoints asynchronously in a thread. *************************** @@ -94,3 +98,36 @@ Custom Checkpoint IO Plugin .. note:: Some ``TrainingTypePlugins`` like ``DeepSpeedStrategy`` do not support custom ``CheckpointIO`` as checkpointing logic is not modifiable. + + +************************** +Asynchronous Checkpointing +************************** + +.. warning:: + + This is currently an experimental plugin/feature and API changes are to be expected. + +To enable saving the checkpoints asynchronously without blocking your training, you can configure +:class:`~pytorch_lightning.plugins.io.async_plugin.AsyncCheckpointIO` plugin to ``Trainer``. + +.. code-block:: python + + from pytorch_lightning.plugins.io import AsyncCheckpointIO + + + async_ckpt_io = AsyncCheckpointIO() + trainer = Trainer(plugins=[async_ckpt_io]) + + +It uses its base ``CheckpointIO`` plugin's saving logic to save the checkpoint but performs this operation asynchronously. +By default, this base ``CheckpointIO`` will be set-up for you and all you need to provide is the ``AsyncCheckpointIO`` instance to the ``Trainer``. +But if you want the plugin to use your own custom base ``CheckpointIO`` and want the base to behave asynchronously, pass it as an argument while initializing ``AsyncCheckpointIO``. + +.. code-block:: python + + from pytorch_lightning.plugins.io import AsyncCheckpointIO + + base_ckpt_io = MyCustomCheckpointIO() + async_ckpt_io = AsyncCheckpointIO(checkpoint_io=base_ckpt_io) + trainer = Trainer(plugins=[async_ckpt_io]) diff --git a/docs/source-pytorch/extensions/plugins.rst b/docs/source-pytorch/extensions/plugins.rst index 6ea8d42815f46..a0dbefd141464 100644 --- a/docs/source-pytorch/extensions/plugins.rst +++ b/docs/source-pytorch/extensions/plugins.rst @@ -87,6 +87,7 @@ Below is a list of built-in plugins for checkpointing. :nosignatures: :template: classtemplate.rst + AsyncCheckpointIO CheckpointIO HPUCheckpointIO TorchCheckpointIO diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 1c3a3b9d5a1be..327b03c3aac6c 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -108,6 +108,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added support for DDP Fork ([#13405](https://github.com/PyTorchLightning/pytorch-lightning/pull/13405)) +- Added support for async checkpointing ([#13658](https://github.com/PyTorchLightning/pytorch-lightning/pull/13658)) + + ### Changed - `accelerator="gpu"` now automatically selects an available GPU backend (CUDA and MPS currently) ([#13642](https://github.com/Lightning-AI/lightning/pull/13642)) diff --git a/src/pytorch_lightning/plugins/__init__.py b/src/pytorch_lightning/plugins/__init__.py index 0f1c4ca85ed5a..afd10c88c951d 100644 --- a/src/pytorch_lightning/plugins/__init__.py +++ b/src/pytorch_lightning/plugins/__init__.py @@ -1,6 +1,7 @@ from typing import Union from pytorch_lightning.plugins.environments import ClusterEnvironment +from pytorch_lightning.plugins.io.async_plugin import AsyncCheckpointIO from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.io.hpu_plugin import HPUCheckpointIO from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO @@ -38,6 +39,7 @@ PLUGIN_INPUT = Union[PLUGIN, str] __all__ = [ + "AsyncCheckpointIO", "CheckpointIO", "TorchCheckpointIO", "XLACheckpointIO", diff --git a/src/pytorch_lightning/plugins/io/__init__.py b/src/pytorch_lightning/plugins/io/__init__.py index abd196eb2b1e3..19a556bddf29c 100644 --- a/src/pytorch_lightning/plugins/io/__init__.py +++ b/src/pytorch_lightning/plugins/io/__init__.py @@ -11,7 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO # noqa: F401 -from pytorch_lightning.plugins.io.hpu_plugin import HPUCheckpointIO # noqa: F401 -from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO # noqa: F401 -from pytorch_lightning.plugins.io.xla_plugin import XLACheckpointIO # noqa: F401 +from pytorch_lightning.plugins.io.async_plugin import AsyncCheckpointIO +from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO +from pytorch_lightning.plugins.io.hpu_plugin import HPUCheckpointIO +from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO +from pytorch_lightning.plugins.io.xla_plugin import XLACheckpointIO + +__all__ = ["AsyncCheckpointIO", "CheckpointIO", "HPUCheckpointIO", "TorchCheckpointIO", "XLACheckpointIO"] diff --git a/src/pytorch_lightning/plugins/io/async_plugin.py b/src/pytorch_lightning/plugins/io/async_plugin.py new file mode 100644 index 0000000000000..1146bc373a4ac --- /dev/null +++ b/src/pytorch_lightning/plugins/io/async_plugin.py @@ -0,0 +1,62 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from concurrent.futures import ThreadPoolExecutor +from typing import Any, Optional + +from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO +from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO + + +class AsyncCheckpointIO(_WrappingCheckpointIO): + """``AsyncCheckpointIO`` enables saving the checkpoints asynchronously in a thread. + + .. warning:: + + This is currently an experimental plugin/feature and API changes are to be expected. + + Args: + checkpoint_io: A checkpoint IO plugin that is used as the basis for async checkpointing. + """ + + def __init__(self, checkpoint_io: Optional["CheckpointIO"] = None) -> None: + super().__init__(checkpoint_io) + + self._executor = ThreadPoolExecutor(max_workers=1) + self._error: Optional[BaseException] = None + + def save_checkpoint(self, *args: Any, **kwargs: Any) -> None: + """Uses the ``ThreadPoolExecutor`` to save the checkpoints using the base ``checkpoint_io``.""" + + def _save_checkpoint(*args: Any, **kwargs: Any) -> None: + try: + assert self.checkpoint_io is not None + self.checkpoint_io.save_checkpoint(*args, **kwargs) + except BaseException as e: + self._error = e + + self._executor.submit(_save_checkpoint, *args, **kwargs) + + # if an error was raised between the previous time `save_checkpoint`` was called and now, + # because `executor.submit` is not blocking + if self._error: + raise self._error + + def teardown(self) -> None: + """This method is called to close the threads.""" + self._executor.shutdown(wait=True) + + # if an error was raised anytime in any of the `executor.submit` calls + if self._error: + raise self._error diff --git a/src/pytorch_lightning/plugins/io/checkpoint_plugin.py b/src/pytorch_lightning/plugins/io/checkpoint_plugin.py index 1425a229963b7..7dcc85042425a 100644 --- a/src/pytorch_lightning/plugins/io/checkpoint_plugin.py +++ b/src/pytorch_lightning/plugins/io/checkpoint_plugin.py @@ -43,12 +43,13 @@ def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_optio """ @abstractmethod - def load_checkpoint(self, path: _PATH, storage_options: Optional[Any] = None) -> Dict[str, Any]: + def load_checkpoint(self, path: _PATH, map_location: Optional[Any] = None) -> Dict[str, Any]: """Load checkpoint from a path when resuming or loading ckpt for test/validate/predict stages. Args: path: Path to checkpoint - storage_options: Optional parameters when loading the model/training states. + map_location: a function, :class:`torch.device`, string or a dict specifying how to remap storage + locations. Returns: The loaded checkpoint. """ @@ -60,3 +61,6 @@ def remove_checkpoint(self, path: _PATH) -> None: Args: path: Path to checkpoint """ + + def teardown(self) -> None: + """This method is called to teardown the process.""" diff --git a/src/pytorch_lightning/plugins/io/torch_plugin.py b/src/pytorch_lightning/plugins/io/torch_plugin.py index 8791249e7d90c..0e5cba3837de3 100644 --- a/src/pytorch_lightning/plugins/io/torch_plugin.py +++ b/src/pytorch_lightning/plugins/io/torch_plugin.py @@ -69,7 +69,7 @@ def load_checkpoint( Args: path: Path to checkpoint map_location: a function, :class:`torch.device`, string or a dict specifying how to remap storage - locations. + locations. Returns: The loaded checkpoint. 
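For illustration only, a minimal sketch of a custom ``CheckpointIO`` written against the interface shown above; the class name, the plain ``torch.save``/``torch.load`` persistence, and the wrapping call at the end are assumptions for the example and are not part of this patch:

import os
from typing import Any, Dict, Optional

import torch

from pytorch_lightning.plugins.io import AsyncCheckpointIO, CheckpointIO


class SimpleTorchCheckpointIO(CheckpointIO):
    """Hypothetical plugin that persists checkpoints with plain torch.save/torch.load."""

    def save_checkpoint(self, checkpoint: Dict[str, Any], path, storage_options: Optional[Any] = None) -> None:
        # write the full checkpoint dict to the given path
        os.makedirs(os.path.dirname(str(path)) or ".", exist_ok=True)
        torch.save(checkpoint, path)

    def load_checkpoint(self, path, map_location: Optional[Any] = None) -> Dict[str, Any]:
        # honor the `map_location` argument of the updated interface
        return torch.load(path, map_location=map_location)

    def remove_checkpoint(self, path) -> None:
        if os.path.exists(path):
            os.remove(path)

    def teardown(self) -> None:
        # nothing to release in this sketch; AsyncCheckpointIO uses this hook to drain its worker thread
        pass


# saving then goes through the base plugin, but from a background thread
async_ckpt_io = AsyncCheckpointIO(checkpoint_io=SimpleTorchCheckpointIO())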
diff --git a/src/pytorch_lightning/plugins/io/wrapper.py b/src/pytorch_lightning/plugins/io/wrapper.py new file mode 100644 index 0000000000000..eb46990deffdf --- /dev/null +++ b/src/pytorch_lightning/plugins/io/wrapper.py @@ -0,0 +1,66 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any, Dict, Optional + +from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO + + +class _WrappingCheckpointIO(CheckpointIO): + """``_WrappingCheckpointIO`` is a wrapper checkpoint_io that uses a base checkpoint_io to handle checkpointing. + + Args: + checkpoint_io: A checkpoint IO plugin that is used as the basis. + """ + + def __init__(self, checkpoint_io: Optional["CheckpointIO"] = None) -> None: + super().__init__() + + self._checkpoint_io = checkpoint_io + self._base_checkpoint_io_configured: bool = False + + if checkpoint_io is not None: + if isinstance(checkpoint_io, _WrappingCheckpointIO): + self._base_checkpoint_io_configured = checkpoint_io._base_checkpoint_io_configured + else: + self._base_checkpoint_io_configured = True + + @property + def checkpoint_io(self) -> Optional["CheckpointIO"]: + return self._checkpoint_io + + @checkpoint_io.setter + def checkpoint_io(self, checkpoint_io: "CheckpointIO") -> None: + assert not isinstance(checkpoint_io, _WrappingCheckpointIO) + + if self._checkpoint_io is None: + self._base_checkpoint_io_configured = True + self._checkpoint_io = checkpoint_io + elif isinstance(self._checkpoint_io, _WrappingCheckpointIO) and not self._base_checkpoint_io_configured: + self._base_checkpoint_io_configured = True + self._checkpoint_io.checkpoint_io = checkpoint_io + + def save_checkpoint(self, *args: Any, **kwargs: Any) -> None: + """Uses the base ``checkpoint_io`` to save the checkpoint.""" + assert self.checkpoint_io is not None + self.checkpoint_io.save_checkpoint(*args, **kwargs) + + def remove_checkpoint(self, *args: Any, **kwargs: Any) -> None: + """Uses the base ``checkpoint_io`` to remove the checkpoint.""" + assert self.checkpoint_io is not None + self.checkpoint_io.remove_checkpoint(*args, **kwargs) + + def load_checkpoint(self, *args: Any, **kwargs: Any) -> Dict[str, Any]: + """Uses the base ``checkpoint_io`` to load the checkpoint.""" + assert self.checkpoint_io is not None + return self.checkpoint_io.load_checkpoint(*args, **kwargs) diff --git a/src/pytorch_lightning/strategies/hpu_parallel.py b/src/pytorch_lightning/strategies/hpu_parallel.py index 591664e93e782..3e6f8e932e7c2 100644 --- a/src/pytorch_lightning/strategies/hpu_parallel.py +++ b/src/pytorch_lightning/strategies/hpu_parallel.py @@ -23,6 +23,7 @@ from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.io.hpu_plugin import HPUCheckpointIO +from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from 
pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.utilities.distributed import group as _group @@ -78,6 +79,9 @@ def __init__( def checkpoint_io(self) -> CheckpointIO: if self._checkpoint_io is None: self._checkpoint_io = HPUCheckpointIO() + elif isinstance(self._checkpoint_io, _WrappingCheckpointIO): + self._checkpoint_io.checkpoint_io = HPUCheckpointIO() + return self._checkpoint_io @checkpoint_io.setter diff --git a/src/pytorch_lightning/strategies/single_hpu.py b/src/pytorch_lightning/strategies/single_hpu.py index bbba3904f6bc3..45eb8c58f2cd4 100644 --- a/src/pytorch_lightning/strategies/single_hpu.py +++ b/src/pytorch_lightning/strategies/single_hpu.py @@ -17,6 +17,7 @@ import pytorch_lightning as pl from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.io.hpu_plugin import HPUCheckpointIO +from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.single_device import SingleDeviceStrategy from pytorch_lightning.utilities import _HPU_AVAILABLE @@ -54,6 +55,9 @@ def __init__( def checkpoint_io(self) -> CheckpointIO: if self._checkpoint_io is None: self._checkpoint_io = HPUCheckpointIO() + elif isinstance(self._checkpoint_io, _WrappingCheckpointIO): + self._checkpoint_io.checkpoint_io = HPUCheckpointIO() + return self._checkpoint_io @checkpoint_io.setter diff --git a/src/pytorch_lightning/strategies/single_tpu.py b/src/pytorch_lightning/strategies/single_tpu.py index caf153ace0bb6..3084f17430338 100644 --- a/src/pytorch_lightning/strategies/single_tpu.py +++ b/src/pytorch_lightning/strategies/single_tpu.py @@ -16,6 +16,7 @@ import pytorch_lightning as pl from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO +from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO from pytorch_lightning.plugins.io.xla_plugin import XLACheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.single_device import SingleDeviceStrategy @@ -50,6 +51,9 @@ def __init__( def checkpoint_io(self) -> CheckpointIO: if self._checkpoint_io is None: self._checkpoint_io = XLACheckpointIO() + elif isinstance(self._checkpoint_io, _WrappingCheckpointIO): + self._checkpoint_io.checkpoint_io = XLACheckpointIO() + return self._checkpoint_io @checkpoint_io.setter diff --git a/src/pytorch_lightning/strategies/strategy.py b/src/pytorch_lightning/strategies/strategy.py index 3d45c61abb1c1..f47afc890bcbb 100644 --- a/src/pytorch_lightning/strategies/strategy.py +++ b/src/pytorch_lightning/strategies/strategy.py @@ -27,6 +27,7 @@ from pytorch_lightning.overrides.base import unwrap_lightning_module from pytorch_lightning.plugins import TorchCheckpointIO from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO +from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.launchers.base import _Launcher from pytorch_lightning.trainer.states import TrainerFn @@ -84,6 +85,8 @@ def accelerator(self, accelerator: "pl.accelerators.accelerator.Accelerator") -> def checkpoint_io(self) -> CheckpointIO: if self._checkpoint_io is None: self._checkpoint_io = TorchCheckpointIO() + elif isinstance(self._checkpoint_io, _WrappingCheckpointIO): + self._checkpoint_io.checkpoint_io = TorchCheckpointIO() return self._checkpoint_io @@ -467,6 +470,7 @@ def 
teardown(self) -> None: self.precision_plugin.teardown() assert self.accelerator is not None self.accelerator.teardown() + self.checkpoint_io.teardown() @classmethod def register_strategies(cls, strategy_registry: Dict[str, Any]) -> None: diff --git a/src/pytorch_lightning/strategies/tpu_spawn.py b/src/pytorch_lightning/strategies/tpu_spawn.py index 0c02c820840e2..f4953a9f64baa 100644 --- a/src/pytorch_lightning/strategies/tpu_spawn.py +++ b/src/pytorch_lightning/strategies/tpu_spawn.py @@ -24,6 +24,7 @@ from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.plugins.environments import XLAEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO +from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO from pytorch_lightning.plugins.io.xla_plugin import XLACheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy @@ -78,6 +79,9 @@ def __init__( def checkpoint_io(self) -> CheckpointIO: if self._checkpoint_io is None: self._checkpoint_io = XLACheckpointIO() + elif isinstance(self._checkpoint_io, _WrappingCheckpointIO): + self._checkpoint_io.checkpoint_io = XLACheckpointIO() + return self._checkpoint_io @checkpoint_io.setter diff --git a/src/pytorch_lightning/utilities/cloud_io.py b/src/pytorch_lightning/utilities/cloud_io.py index 9055ff50c8d0d..81482a8ab24f9 100644 --- a/src/pytorch_lightning/utilities/cloud_io.py +++ b/src/pytorch_lightning/utilities/cloud_io.py @@ -62,7 +62,6 @@ def atomic_save(checkpoint: Dict[str, Any], filepath: Union[str, Path]) -> None: filepath: The path to which the checkpoint will be saved. This points to the file that the checkpoint will be stored in. """ - bytesbuffer = io.BytesIO() torch.save(checkpoint, bytesbuffer) with fsspec.open(filepath, "wb") as f: diff --git a/tests/tests_pytorch/plugins/test_checkpoint_io_plugin.py b/tests/tests_pytorch/plugins/test_checkpoint_io_plugin.py index 651ab1cc4f49f..ae618ffa333dc 100644 --- a/tests/tests_pytorch/plugins/test_checkpoint_io_plugin.py +++ b/tests/tests_pytorch/plugins/test_checkpoint_io_plugin.py @@ -12,15 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import os +from pathlib import Path from typing import Any, Dict, Optional -from unittest.mock import MagicMock +from unittest.mock import MagicMock, Mock import torch from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.demos.boring_classes import BoringModel -from pytorch_lightning.plugins import CheckpointIO +from pytorch_lightning.plugins.io.async_plugin import AsyncCheckpointIO +from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO +from pytorch_lightning.plugins.io.torch_plugin import TorchCheckpointIO from pytorch_lightning.strategies import SingleDeviceStrategy from pytorch_lightning.utilities.types import _PATH @@ -49,9 +52,16 @@ def test_checkpoint_plugin_called(tmpdir): strategy=SingleDeviceStrategy("cpu", checkpoint_io=checkpoint_plugin), callbacks=ck, max_epochs=2, + limit_train_batches=1, + limit_val_batches=0, + limit_test_batches=1, ) trainer.fit(model) + ckpt_files = {fn.name for fn in Path(tmpdir).glob("*.ckpt")} + assert ckpt_files == {"epoch=1-step=2.ckpt", "last.ckpt"} + assert trainer.checkpoint_callback.best_model_path == tmpdir / "epoch=1-step=2.ckpt" + assert trainer.checkpoint_callback.last_model_path == tmpdir / "last.ckpt" assert checkpoint_plugin.save_checkpoint.call_count == 4 assert checkpoint_plugin.remove_checkpoint.call_count == 1 @@ -68,12 +78,76 @@ def test_checkpoint_plugin_called(tmpdir): plugins=[checkpoint_plugin], callbacks=ck, max_epochs=2, + limit_train_batches=1, + limit_val_batches=0, + limit_test_batches=1, ) trainer.fit(model) + ckpt_files = {fn.name for fn in Path(tmpdir).glob("*.ckpt")} + assert ckpt_files == {"epoch=1-step=2.ckpt", "last.ckpt", "epoch=1-step=2-v1.ckpt", "last-v1.ckpt"} + assert trainer.checkpoint_callback.best_model_path == tmpdir / "epoch=1-step=2-v1.ckpt" + assert trainer.checkpoint_callback.last_model_path == tmpdir / "last-v1.ckpt" assert checkpoint_plugin.save_checkpoint.call_count == 4 assert checkpoint_plugin.remove_checkpoint.call_count == 1 trainer.test(model, ckpt_path=ck.last_model_path) checkpoint_plugin.load_checkpoint.assert_called_once() checkpoint_plugin.load_checkpoint.assert_called_with(tmpdir / "last-v1.ckpt") + + +def test_async_checkpoint_plugin(tmpdir): + """Ensure that the custom checkpoint IO plugin and torch checkpoint IO plugin is called when async saving and + loading.""" + + checkpoint_plugin = AsyncCheckpointIO() + + checkpoint_plugin.save_checkpoint = Mock(wraps=checkpoint_plugin.save_checkpoint) + checkpoint_plugin.remove_checkpoint = Mock(wraps=checkpoint_plugin.remove_checkpoint) + + class CustomBoringModel(BoringModel): + def on_fit_start(self): + base_ckpt_io = self.trainer.strategy.checkpoint_io.checkpoint_io + base_ckpt_io.save_checkpoint = Mock(wraps=base_ckpt_io.save_checkpoint) + base_ckpt_io.remove_checkpoint = Mock(wraps=base_ckpt_io.remove_checkpoint) + + ck = ModelCheckpoint(dirpath=tmpdir, save_top_k=2, monitor="step", mode="max") + + model = CustomBoringModel() + trainer = Trainer( + default_root_dir=tmpdir, + plugins=[checkpoint_plugin], + callbacks=ck, + max_epochs=3, + limit_train_batches=1, + limit_val_batches=0, + enable_progress_bar=False, + enable_model_summary=False, + ) + trainer.fit(model) + + assert checkpoint_plugin.save_checkpoint.call_count == 3 + assert checkpoint_plugin.remove_checkpoint.call_count == 1 + + base_ckpt_io = trainer.strategy.checkpoint_io.checkpoint_io + assert base_ckpt_io.save_checkpoint.call_count == 3 + assert base_ckpt_io.remove_checkpoint.call_count == 1 + + +def 
test_multi_wrapped_checkpoint_io_initialization(): + base_ckpt_io = TorchCheckpointIO() + wrap_ckpt = AsyncCheckpointIO(base_ckpt_io) + ckpt_io = AsyncCheckpointIO(wrap_ckpt) + assert ckpt_io.checkpoint_io is wrap_ckpt + assert ckpt_io.checkpoint_io.checkpoint_io is base_ckpt_io + assert ckpt_io._base_checkpoint_io_configured is True + assert ckpt_io.checkpoint_io._base_checkpoint_io_configured is True + + wrap_ckpt = AsyncCheckpointIO() + ckpt_io = AsyncCheckpointIO(wrap_ckpt) + trainer = Trainer(accelerator="cpu", plugins=[ckpt_io]) + trainer.strategy.checkpoint_io + assert ckpt_io.checkpoint_io is wrap_ckpt + assert isinstance(ckpt_io.checkpoint_io.checkpoint_io, TorchCheckpointIO) + assert ckpt_io._base_checkpoint_io_configured is True + assert ckpt_io.checkpoint_io._base_checkpoint_io_configured is True From 0cbfe08a41a5f517a27fd432d4770feb7bef37af Mon Sep 17 00:00:00 2001 From: Luca Medeiros <67411094+luca-medeiros@users.noreply.github.com> Date: Wed, 27 Jul 2022 01:04:36 +0900 Subject: [PATCH 014/230] Add lightning apps to info_packages of collect_env_details (#13815) --- requirements/collect_env_details.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/requirements/collect_env_details.py b/requirements/collect_env_details.py index a39adf88cb253..1d65753a55553 100644 --- a/requirements/collect_env_details.py +++ b/requirements/collect_env_details.py @@ -27,6 +27,15 @@ sys.path += [os.path.abspath(".."), os.path.abspath("")] import pytorch_lightning # noqa: E402 +try: + import lightning +except ModuleNotFoundError: + pass +try: + import lightning_app +except ModuleNotFoundError: + pass + LEVEL_OFFSET = "\t" KEY_PADDING = 20 @@ -56,6 +65,8 @@ def info_packages(): "pyTorch_version": torch.__version__, "pyTorch_debug": torch.version.debug, "pytorch-lightning": pytorch_lightning.__version__, + "lightning": lightning.__version__ if "lightning" in sys.modules else None, + "lightning_app": lightning_app.__version__ if "lightning_app" in sys.modules else None, "tqdm": tqdm.__version__, } From a90ef3b751816fbcc2b7d45efcd38714a4f6c19b Mon Sep 17 00:00:00 2001 From: Akarsha Rao <94624926+raoakarsha@users.noreply.github.com> Date: Tue, 26 Jul 2022 10:54:01 -0700 Subject: [PATCH 015/230] CI: Correct test path to publish test results (#13862) --- .azure/hpu-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure/hpu-tests.yml b/.azure/hpu-tests.yml index f0b279bda3f60..bdfada907cac9 100644 --- a/.azure/hpu-tests.yml +++ b/.azure/hpu-tests.yml @@ -84,7 +84,7 @@ jobs: - task: PublishTestResults@2 inputs: - testResultsFiles: 'hpu*_test-results.xml' + testResultsFiles: 'tests/tests_pytorch/hpu*_test-results.xml' testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)' condition: succeededOrFailed() displayName: 'Publish test results' From c3911700d155cdda0c6735da19cbcc98724ef6f5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 27 Jul 2022 10:32:39 +0200 Subject: [PATCH 016/230] Fix error handling in learning rate finder (#13845) Co-authored-by: Rohit Gupta --- src/pytorch_lightning/CHANGELOG.md | 13 +++-- src/pytorch_lightning/tuner/lr_finder.py | 40 +++++++++------ tests/tests_pytorch/tuner/test_lr_finder.py | 54 +++++++++++++++++++++ 3 files changed, 89 insertions(+), 18 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 327b03c3aac6c..07266dd91a578 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -379,6 +379,16 
@@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed main progress bar counter when `val_check_interval=int` and `check_val_every_n_epoch=None` ([#12832](https://github.com/Lightning-AI/lightning/pull/12832) +- Used `global_step` while restoring logging step for old checkpoints ([#13645](https://github.com/Lightning-AI/lightning/pull/13645)) + + +- Fixed error handling in learning rate finder when not enough data points are available to give a good suggestion ([#13845](https://github.com/Lightning-AI/lightning/pull/13845)) + + +- Fixed an issue that caused the learning rate finder to set the model's learning rate to None when no suggestion was possible ([#13845](https://github.com/Lightning-AI/lightning/pull/13845)) + + + ## [1.6.5] - 2022-07-13 ### Fixed @@ -389,9 +399,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed the restoration of log step during restart ([#13467](https://github.com/PyTorchLightning/pytorch-lightning/pull/13467)) -- Used `global_step` while restoring logging step for old checkpoints ([#13645](https://github.com/PyTorchLightning/pytorch-lightning/pull/13645)) - - ## [1.6.4] - 2022-06-01 ### Added diff --git a/src/pytorch_lightning/tuner/lr_finder.py b/src/pytorch_lightning/tuner/lr_finder.py index 71d96ef428f35..186dfb5ea7416 100644 --- a/src/pytorch_lightning/tuner/lr_finder.py +++ b/src/pytorch_lightning/tuner/lr_finder.py @@ -174,24 +174,33 @@ def plot(self, suggest: bool = False, show: bool = False) -> Optional["plt.Figur return fig def suggestion(self, skip_begin: int = 10, skip_end: int = 1) -> Optional[float]: - """This will propose a suggestion for choice of initial learning rate as the point with the steepest + """This will propose a suggestion for an initial learning rate based on the point with the steepest negative gradient. + Args: + skip_begin: how many samples to skip in the beginning; helps to avoid too naive estimates + skip_end: how many samples to skip in the end; helps to avoid too optimistic estimates + Returns: - lr: suggested initial learning rate to use - skip_begin: how many samples to skip in the beginning. Prevent too naive estimates - skip_end: how many samples to skip in the end. Prevent too optimistic estimates + The suggested initial learning rate to use, or `None` if a suggestion is not possible due to too few + loss samples. """ - try: - loss = np.array(self.results["loss"][skip_begin:-skip_end]) - loss = loss[np.isfinite(loss)] - min_grad = np.gradient(loss).argmin() - self._optimal_idx = min_grad + skip_begin - return self.results["lr"][self._optimal_idx] - # todo: specify the possible exception - except Exception: - log.exception("Failed to compute suggesting for `lr`. There might not be enough points.") + losses = np.array(self.results["loss"][skip_begin:-skip_end]) + losses = losses[np.isfinite(losses)] + if len(losses) < 2: + # computing np.gradient requires at least 2 points + log.error( + "Failed to compute suggestion for learning rate because there are not enough points. Increase the loop" + " iteration limits or the size of your dataset/dataloader." 
+ ) self._optimal_idx = None + return None + + # TODO: When computing the argmin here, and some losses are non-finite, the expected indices could be + # incorrectly shifted by an offset + min_grad = np.gradient(losses).argmin() + self._optimal_idx = min_grad + skip_begin + return self.results["lr"][self._optimal_idx] def lr_find( @@ -252,8 +261,9 @@ def lr_find( lr = lr_finder.suggestion() # TODO: log lr.results to self.logger - lightning_setattr(model, lr_attr_name, lr) - log.info(f"Learning rate set to {lr}") + if lr is not None: + lightning_setattr(model, lr_attr_name, lr) + log.info(f"Learning rate set to {lr}") # Restore initial state of model trainer._checkpoint_connector.restore(ckpt_path) diff --git a/tests/tests_pytorch/tuner/test_lr_finder.py b/tests/tests_pytorch/tuner/test_lr_finder.py index 529ef1c4c08c1..9be115d2f8fda 100644 --- a/tests/tests_pytorch/tuner/test_lr_finder.py +++ b/tests/tests_pytorch/tuner/test_lr_finder.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import logging import os from copy import deepcopy @@ -19,6 +20,7 @@ from pytorch_lightning import seed_everything, Trainer from pytorch_lightning.demos.boring_classes import BoringModel +from pytorch_lightning.tuner.lr_finder import _LRFinder from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests_pytorch.helpers.datamodules import ClassifDataModule from tests_pytorch.helpers.simple_models import ClassificationModel @@ -359,3 +361,55 @@ def test_multiple_lr_find_calls_gives_same_results(tmpdir): for curr_lr_finder in all_res[1:] for k in all_res[0].keys() ) + + +@pytest.mark.parametrize( + "skip_begin,skip_end,losses,expected_error", + [ + (0, 0, [], True), + (10, 1, [], True), + (0, 2, [0, 1, 2], True), + (0, 1, [0, 1, 2], False), + (1, 1, [0, 1, 2], True), + (1, 1, [0, 1, 2, 3], False), + (0, 1, [float("nan"), float("nan"), 0, float("inf"), 1, 2, 3, float("inf"), 2, float("nan"), 1], False), + (4, 1, [float("nan"), float("nan"), 0, float("inf"), 1, 2, 3, float("inf"), 2, float("nan"), 1], False), + ], +) +def test_suggestion_not_enough_finite_points(losses, skip_begin, skip_end, expected_error, caplog): + """Tests the error handling when not enough finite points are available to make a suggestion.""" + caplog.clear() + lr_finder = _LRFinder( + mode="exponential", + lr_min=1e-8, + lr_max=1, + num_training=100, + ) + lrs = list(torch.arange(len(losses))) + lr_finder.results = { + "lr": lrs, + "loss": losses, + } + with caplog.at_level(logging.ERROR, logger="root.tuner.lr_finder"): + lr = lr_finder.suggestion(skip_begin=skip_begin, skip_end=skip_end) + + if expected_error: + assert lr is None + assert "Failed to compute suggestion for learning rate" in caplog.text + else: + assert lr is not None + + +def test_lr_attribute_when_suggestion_invalid(tmpdir): + """Tests learning rate finder ends before `num_training` steps.""" + + class TestModel(BoringModel): + def __init__(self): + super().__init__() + self.learning_rate = 0.123 + + model = TestModel() + trainer = Trainer(default_root_dir=tmpdir) + lr_finder = trainer.tuner.lr_find(model=model, update_attr=True, num_training=1) # force insufficient data points + assert lr_finder.suggestion() is None + assert model.learning_rate == 0.123 # must remain unchanged because suggestion is not possible From a37fc72b57d7636dff0f745572dea4e3dc6f287e Mon Sep 17 00:00:00 2001 From: donlapark 
<10988155+donlapark@users.noreply.github.com> Date: Wed, 27 Jul 2022 15:43:43 +0700 Subject: [PATCH 017/230] fixes typing in `stochastic_weight_avg.py` (follow-up of #13685) (#13860) --- .../callbacks/stochastic_weight_avg.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/pytorch_lightning/callbacks/stochastic_weight_avg.py b/src/pytorch_lightning/callbacks/stochastic_weight_avg.py index 093c8e47d07dd..8c141b28bc93d 100644 --- a/src/pytorch_lightning/callbacks/stochastic_weight_avg.py +++ b/src/pytorch_lightning/callbacks/stochastic_weight_avg.py @@ -169,7 +169,7 @@ def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningMo for lr, group in zip(self._swa_lrs, optimizer.param_groups): group["initial_lr"] = lr - self._swa_scheduler: _LRScheduler = cast( + self._swa_scheduler = cast( _LRScheduler, SWALR( optimizer, @@ -244,19 +244,22 @@ def reset_batch_norm_and_save_state(self, pl_module: "pl.LightningModule") -> No for module in pl_module.modules(): if not isinstance(module, nn.modules.batchnorm._BatchNorm): continue + assert module.running_mean is not None module.running_mean = torch.zeros_like( - module.running_mean, # type: ignore[arg-type] + module.running_mean, device=pl_module.device, - dtype=module.running_mean.dtype, # type: ignore[union-attr] + dtype=module.running_mean.dtype, ) + assert module.running_var is not None module.running_var = torch.ones_like( - module.running_var, # type: ignore[arg-type] + module.running_var, device=pl_module.device, - dtype=module.running_var.dtype, # type: ignore[union-attr] + dtype=module.running_var.dtype, ) self.momenta[module] = module.momentum - module.momentum = None # type: ignore[assignment] - module.num_batches_tracked *= 0 # type: ignore[assignment, operator] + module.momentum = float() + assert module.num_batches_tracked is not None + module.num_batches_tracked *= 0 def reset_momenta(self) -> None: """Adapted from https://github.com/pytorch/pytorch/blob/v1.7.1/torch/optim/swa_utils.py#L164-L165.""" From acdc1f137879883702b7a147163a04d3e3acb77e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 27 Jul 2022 08:51:52 +0000 Subject: [PATCH 018/230] Update torchtext requirement from <=0.12.0,>=0.10.* to >=0.10.0.a,<0.14.0 in /requirements (#13758) * Update torchtext requirement in /requirements Updates the requirements on [torchtext](https://github.com/pytorch/text) to permit the latest version. - [Release notes](https://github.com/pytorch/text/releases) - [Commits](https://github.com/pytorch/text/compare/v0.10.0-rc1...v0.13.0) --- updated-dependencies: - dependency-name: torchtext dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] * Update requirements/pytorch/extra.txt Co-authored-by: Akihiro Nitta Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> Co-authored-by: Akihiro Nitta --- requirements/pytorch/extra.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/pytorch/extra.txt b/requirements/pytorch/extra.txt index 90571dd8cab91..f956d021976a6 100644 --- a/requirements/pytorch/extra.txt +++ b/requirements/pytorch/extra.txt @@ -1,6 +1,6 @@ # extended list of package dependencies to reach full functionality matplotlib>3.1, <3.5.3 -torchtext>=0.10.*, <=0.12.0 +torchtext>=0.10.*, <0.14.0 omegaconf>=2.0.5, <2.3.0 hydra-core>=1.0.5, <1.3.0 jsonargparse[signatures]>=4.12.0, <=4.12.0 From acd8aa20d4834f5ee07a1b520aa94eeba89257e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 27 Jul 2022 11:02:58 +0200 Subject: [PATCH 019/230] Skip code formatters on _notebooks submodule (#13867) --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 6d973aa0dde51..49d3259ea8e94 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,10 +15,12 @@ profile = "black" line_length = 120 force_sort_within_sections = "False" order_by_type = "False" +skip = ["_notebooks"] [tool.black] line-length = 120 +exclude = '(_notebooks/.*)' [tool.mypy] From 8792c6b01863bf5585c41930e909846ea6a179eb Mon Sep 17 00:00:00 2001 From: Mansy Date: Wed, 27 Jul 2022 11:05:16 +0200 Subject: [PATCH 020/230] Update Github issues template (#13857) Co-authored-by: mansy --- .github/ISSUE_TEMPLATE/bug_report.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 2823aad4d6981..f08865180ba1d 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -46,7 +46,9 @@ python collect_env_details.py You can also fill out the list below manually. --> +- Lightning Component (e.g. Trainer, LightningModule, LightningApp, LightningWork, LightningFlow): - PyTorch Lightning Version (e.g., 1.5.0): +- Lightning App Version (e.g., 0.5.2): - PyTorch Version (e.g., 1.10): - Python version (e.g., 3.9): - OS (e.g., Linux): @@ -54,6 +56,7 @@ You can also fill out the list below manually. - GPU models and configuration: - How you installed PyTorch (`conda`, `pip`, source): - If compiling from source, the output of `torch.__config__.show()`: +- Running environment of LightningApp (e.g. 
local, cloud): - Any other relevant information: ### Additional context From 41f45b475e366037ef3cb3b48991f1228076fd47 Mon Sep 17 00:00:00 2001 From: Anton Shevtsov <32237302+MrShevan@users.noreply.github.com> Date: Wed, 27 Jul 2022 12:10:57 +0300 Subject: [PATCH 021/230] Check if the scheduler already has `reduce_on_plateau` (#13838) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Rohit Gupta Co-authored-by: Carlos Mocholí --- src/pytorch_lightning/CHANGELOG.md | 3 +++ src/pytorch_lightning/core/optimizer.py | 4 +++- .../trainer/optimization/test_optimizers.py | 12 ++++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 07266dd91a578..4af493b7f3209 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -379,6 +379,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed main progress bar counter when `val_check_interval=int` and `check_val_every_n_epoch=None` ([#12832](https://github.com/Lightning-AI/lightning/pull/12832) +- Improved support for custom `ReduceLROnPlateau` scheduler if `reduce_on_plateau` is set by the user in scheduler config ([#13838](https://github.com/Lightning-AI/lightning/pull/13838)) + + - Used `global_step` while restoring logging step for old checkpoints ([#13645](https://github.com/Lightning-AI/lightning/pull/13645)) diff --git a/src/pytorch_lightning/core/optimizer.py b/src/pytorch_lightning/core/optimizer.py index 7aa9baf794c5f..b96cfabd83b8b 100644 --- a/src/pytorch_lightning/core/optimizer.py +++ b/src/pytorch_lightning/core/optimizer.py @@ -284,7 +284,9 @@ def _configure_schedulers_automatic_opt(schedulers: list, monitor: Optional[str] 'The "interval" key in lr scheduler dict must be "step" or "epoch"' f' but is "{scheduler["interval"]}"' ) - scheduler["reduce_on_plateau"] = isinstance(scheduler["scheduler"], optim.lr_scheduler.ReduceLROnPlateau) + scheduler["reduce_on_plateau"] = scheduler.get( + "reduce_on_plateau", isinstance(scheduler["scheduler"], optim.lr_scheduler.ReduceLROnPlateau) + ) if scheduler["reduce_on_plateau"] and scheduler.get("monitor", None) is None: raise MisconfigurationException( "The lr scheduler dict must include a monitor when a `ReduceLROnPlateau` scheduler is used." 
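For illustration only, a minimal sketch of the scheduler configuration this change is meant to honor; the optimizer choice, the ``MyCustomReduceLROnPlateau`` class, and the monitored metric name are assumptions for the example and are not part of this patch:

import torch


# fragment of a hypothetical LightningModule; MyCustomReduceLROnPlateau is assumed to be a
# user-defined plateau-style scheduler that is not a torch.optim.lr_scheduler.ReduceLROnPlateau subclass
def configure_optimizers(self):
    optimizer = torch.optim.SGD(self.parameters(), lr=0.1)
    scheduler = MyCustomReduceLROnPlateau(optimizer)
    return {
        "optimizer": optimizer,
        "lr_scheduler": {
            "scheduler": scheduler,
            "reduce_on_plateau": True,  # kept as set by the user instead of being overwritten by the isinstance() check
            "monitor": "val_loss",  # still required whenever reduce_on_plateau is set
        },
    }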
diff --git a/tests/tests_pytorch/trainer/optimization/test_optimizers.py b/tests/tests_pytorch/trainer/optimization/test_optimizers.py index e7554fa2e6422..52fb6ba5028ae 100644 --- a/tests/tests_pytorch/trainer/optimization/test_optimizers.py +++ b/tests/tests_pytorch/trainer/optimization/test_optimizers.py @@ -115,6 +115,18 @@ def test_onecyclelr_with_epoch_interval_warns(): _configure_schedulers_automatic_opt([lr_scheduler], None) +def test_scheduler_initialized_with_custom_reduceonplateau(): + """Test for initialize custom scheduler with `reduce_on_plateau` argument.""" + + class CustomReduceLROnPlateau: + pass + + lr_scheduler = {"reduce_on_plateau": True, "scheduler": CustomReduceLROnPlateau(), "monitor": "my_loss"} + config = _configure_schedulers_automatic_opt([lr_scheduler], None) + assert isinstance(config[0].scheduler, CustomReduceLROnPlateau) + assert config[0].reduce_on_plateau + + def test_reducelronplateau_scheduling(tmpdir): class TestModel(BoringModel): def training_step(self, batch, batch_idx): From 56b1e1aaaace7855045609f8c5f039dda00d2330 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 27 Jul 2022 09:22:34 +0000 Subject: [PATCH 022/230] Update torchmetrics requirement from <0.9.2,>=0.7.0 to >=0.7.0,<0.9.3 in /requirements (#13528) Update torchmetrics requirement in /requirements Updates the requirements on [torchmetrics](https://github.com/Lightning-AI/metrics) to permit the latest version. - [Release notes](https://github.com/Lightning-AI/metrics/releases) - [Changelog](https://github.com/Lightning-AI/metrics/blob/master/CHANGELOG.md) - [Commits](https://github.com/Lightning-AI/metrics/compare/v0.7.0...v0.9.2) --- updated-dependencies: - dependency-name: torchmetrics dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements/pytorch/base.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index a0c1786362390..41a712c930cca 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -4,7 +4,7 @@ tqdm>=4.57.0, <=4.63.0 PyYAML>=5.4, <=6.0 fsspec[http]>=2021.05.0, !=2021.06.0, <2022.6.0 tensorboard>=2.9.1, <2.10.0 -torchmetrics>=0.7.0, <0.9.2 # needed for using fixed compare_version +torchmetrics>=0.7.0, <0.9.3 # needed for using fixed compare_version pyDeprecate>=0.3.1, <=0.3.2 packaging>=17.0, <=21.3 typing-extensions>=4.0.0, <4.3.1 From 4c7b9f0b1114a87ab9a0b2fd7b8952d0b22d5c40 Mon Sep 17 00:00:00 2001 From: otaj <6065855+otaj@users.noreply.github.com> Date: Wed, 27 Jul 2022 11:50:43 +0200 Subject: [PATCH 023/230] Disallow batch sampler with multiple IPU devices (#13854) Co-authored-by: Rohit Gupta --- .../source-pytorch/accelerators/ipu_basic.rst | 9 +-- src/pytorch_lightning/CHANGELOG.md | 2 +- src/pytorch_lightning/strategies/ipu.py | 6 +- src/pytorch_lightning/utilities/data.py | 68 ++++++++++++------- tests/tests_pytorch/accelerators/test_ipu.py | 6 +- tests/tests_pytorch/utilities/test_data.py | 20 +++++- 6 files changed, 78 insertions(+), 33 deletions(-) diff --git a/docs/source-pytorch/accelerators/ipu_basic.rst b/docs/source-pytorch/accelerators/ipu_basic.rst index 6ff0cb701d6e7..99a5c69a10417 100644 --- a/docs/source-pytorch/accelerators/ipu_basic.rst +++ b/docs/source-pytorch/accelerators/ipu_basic.rst @@ -62,7 +62,8 @@ Currently there are some known limitations that are being addressed in the near Please see the `MNIST example `__ which displays most of the limitations and how to overcome them till they are resolved. -* ``self.log`` is not supported in the ``training_step``, ``validation_step``, ``test_step`` or ``predict_step``. This is due to the step function being traced and sent to the IPU devices. We're actively working on fixing this -* Multiple optimizers are not supported. ``training_step`` only supports returning one loss from the ``training_step`` function as a result -* Since the step functions are traced, branching logic or any form of primitive values are traced into constants. Be mindful as this could lead to errors in your custom code -* Clipping gradients is not supported +* ``self.log`` is not supported in the ``training_step``, ``validation_step``, ``test_step`` or ``predict_step``. This is due to the step function being traced and sent to the IPU devices. We're actively working on fixing this. +* Multiple optimizers are not supported. ``training_step`` only supports returning one loss from the ``training_step`` function as a result. +* Since the step functions are traced, branching logic or any form of primitive values are traced into constants. Be mindful as this could lead to errors in your custom code. +* Clipping gradients is not supported. +* It is not possible to use :class:`torch.utils.data.BatchSampler` in your dataloaders if you are using multiple IPUs. diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 4af493b7f3209..f8341248b20e8 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -167,7 +167,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Updated Habana Accelerator's `auto_device_count`, `is_available` & `get_device_name` methods based on the latest torch habana package ([#13423](https://github.com/PyTorchLightning/pytorch-lightning/pull/13423)) -- +- Disallowed using `BatchSampler` when running on multiple IPUs ([#13854](https://github.com/PyTorchLightning/pytorch-lightning/pull/13854)) ### Deprecated diff --git a/src/pytorch_lightning/strategies/ipu.py b/src/pytorch_lightning/strategies/ipu.py index 5413756c15271..001ad77fbb5cc 100644 --- a/src/pytorch_lightning/strategies/ipu.py +++ b/src/pytorch_lightning/strategies/ipu.py @@ -162,6 +162,8 @@ def setup(self, trainer: "pl.Trainer") -> None: if self.lightning_module.trainer.enable_validation: model = poptorch.inferenceModel(model=model, options=inference_opts) self.poptorch_models[RunningStage.VALIDATING] = model + if self.lightning_module.trainer.num_sanity_val_steps > 0: + self.poptorch_models[RunningStage.SANITY_CHECKING] = model elif trainer_fn == TrainerFn.VALIDATING: model = poptorch.inferenceModel(model=model, options=self.inference_opts) self.poptorch_models[RunningStage.VALIDATING] = model @@ -228,7 +230,9 @@ def _convert_to_poptorch_loader( # the user is returning the `poptorch.DataLoader` directly, don't change anything. return dataloader - dl_args, dl_kwargs = _get_dataloader_init_args_and_kwargs(dataloader, sampler) + dl_args, dl_kwargs = _get_dataloader_init_args_and_kwargs( + dataloader, sampler, mode, self.replication_factor > 1 + ) opts = self.training_opts if mode == RunningStage.TRAINING else self.inference_opts dataloader = poptorch.DataLoader(opts, *dl_args, **dl_kwargs) return dataloader diff --git a/src/pytorch_lightning/utilities/data.py b/src/pytorch_lightning/utilities/data.py index 2de82ceff088e..e60c56f6c7a7e 100644 --- a/src/pytorch_lightning/utilities/data.py +++ b/src/pytorch_lightning/utilities/data.py @@ -186,7 +186,7 @@ def get_len(dataloader: DataLoader) -> Union[int, float]: def _update_dataloader( dataloader: DataLoader, sampler: Union[Sampler, Iterable], mode: Optional[RunningStage] = None ) -> DataLoader: - dl_args, dl_kwargs = _get_dataloader_init_args_and_kwargs(dataloader, sampler, mode=mode) + dl_args, dl_kwargs = _get_dataloader_init_args_and_kwargs(dataloader, sampler, mode) dl_cls = type(dataloader) try: dataloader = dl_cls(*dl_args, **dl_kwargs) @@ -212,7 +212,10 @@ def _update_dataloader( def _get_dataloader_init_args_and_kwargs( - dataloader: DataLoader, sampler: Optional[Sampler], mode: Optional[RunningStage] = None + dataloader: DataLoader, + sampler: Optional[Sampler], + mode: Optional[RunningStage] = None, + disallow_batch_sampler: bool = False, ) -> Tuple[Tuple[Any], Dict[str, Any]]: if not isinstance(dataloader, DataLoader): raise ValueError(f"The dataloader {dataloader} needs to subclass `torch.utils.data.DataLoader`") @@ -264,7 +267,7 @@ def _get_dataloader_init_args_and_kwargs( dl_kwargs["batch_sampler"] = None dl_kwargs["sampler"] = None else: - dl_kwargs.update(_dataloader_init_kwargs_resolve_sampler(dataloader, sampler, mode=mode)) + dl_kwargs.update(_dataloader_init_kwargs_resolve_sampler(dataloader, sampler, mode, disallow_batch_sampler)) required_args = { p.name @@ -309,7 +312,10 @@ def _get_dataloader_init_args_and_kwargs( def _dataloader_init_kwargs_resolve_sampler( - dataloader: DataLoader, sampler: Optional[Sampler], mode: Optional[RunningStage] = None + dataloader: DataLoader, + sampler: Optional[Sampler], + mode: Optional[RunningStage] = None, + disallow_batch_sampler: bool = False, ) -> Dict[str, 
Any]: """This function is used to handle the sampler, batch_sampler arguments associated within a DataLoader for its re-instantiation. @@ -321,27 +327,39 @@ def _dataloader_init_kwargs_resolve_sampler( fault_tolerant_mode = _FaultTolerantMode.detect_current_mode() batch_sampler = getattr(dataloader, "batch_sampler") is_predicting = mode == RunningStage.PREDICTING - # checking the batch sampler type is different than PyTorch default. - if batch_sampler is not None and (type(batch_sampler) is not BatchSampler or is_predicting): - batch_sampler = type(batch_sampler)( - sampler, - batch_size=batch_sampler.batch_size, - drop_last=(False if is_predicting else batch_sampler.drop_last), - ) - if is_predicting: - batch_sampler = IndexBatchSamplerWrapper(batch_sampler) - - if fault_tolerant_mode.is_automatic: - fast_forward_sampler = batch_sampler = FastForwardSampler(batch_sampler) - fast_forward_sampler.setup(dataloader_batch_size=1) - - return { - "sampler": None, - "shuffle": False, - "batch_sampler": batch_sampler, - "batch_size": 1, - "drop_last": False, - } + + if batch_sampler is not None: + if disallow_batch_sampler: + # Check that we don't have a PyTorch default batch sampler that was instantiated in DataLoader __init__ + if not ( + type(batch_sampler) is BatchSampler + and batch_sampler.sampler == sampler + and dataloader.batch_size == batch_sampler.batch_size + ): + raise MisconfigurationException( + "It is not possible to have a batch sampler in your dataloader, " + "when running on multiple IPU devices." + ) + elif type(batch_sampler) is not BatchSampler or is_predicting: + batch_sampler = type(batch_sampler)( + sampler, + batch_size=batch_sampler.batch_size, + drop_last=(False if is_predicting else batch_sampler.drop_last), + ) + if is_predicting: + batch_sampler = IndexBatchSamplerWrapper(batch_sampler) + + if fault_tolerant_mode.is_automatic: + fast_forward_sampler = batch_sampler = FastForwardSampler(batch_sampler) + fast_forward_sampler.setup(dataloader_batch_size=1) + + return { + "sampler": None, + "shuffle": False, + "batch_sampler": batch_sampler, + "batch_size": 1, + "drop_last": False, + } if fault_tolerant_mode.is_automatic: fast_forward_sampler = sampler = FastForwardSampler(sampler) diff --git a/tests/tests_pytorch/accelerators/test_ipu.py b/tests/tests_pytorch/accelerators/test_ipu.py index 97f374a40d6c3..589ec7b29dd5b 100644 --- a/tests/tests_pytorch/accelerators/test_ipu.py +++ b/tests/tests_pytorch/accelerators/test_ipu.py @@ -619,7 +619,11 @@ def test_poptorch_models_at_different_stages(tmpdir): trainer.optimizers = model.configure_optimizers()[0] trainer.state.fn = TrainerFn.FITTING trainer.strategy.setup(trainer) - assert list(trainer.strategy.poptorch_models) == [RunningStage.TRAINING, RunningStage.VALIDATING] + assert list(trainer.strategy.poptorch_models) == [ + RunningStage.TRAINING, + RunningStage.VALIDATING, + RunningStage.SANITY_CHECKING, + ] for fn, stage in ( (TrainerFn.VALIDATING, RunningStage.VALIDATING), diff --git a/tests/tests_pytorch/utilities/test_data.py b/tests/tests_pytorch/utilities/test_data.py index 7b1e596d50f8c..5f66d802ea939 100644 --- a/tests/tests_pytorch/utilities/test_data.py +++ b/tests/tests_pytorch/utilities/test_data.py @@ -3,12 +3,13 @@ import pytest import torch from torch import Tensor -from torch.utils.data.dataloader import DataLoader +from torch.utils.data import BatchSampler, DataLoader, SequentialSampler from pytorch_lightning import Trainer from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset from 
pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities.data import ( + _dataloader_init_kwargs_resolve_sampler, _get_dataloader_init_args_and_kwargs, _replace_dataloader_init_method, _update_dataloader, @@ -331,6 +332,23 @@ def test_replace_dataloader_init_method(cls, args, kwargs, arg_names, dataset, c assert getattr(dataloader, key) == value +def test_dataloader_disallow_batch_sampler(): + dataset = RandomDataset(5, 100) + dataloader = DataLoader(dataset, batch_size=10) + + # This should not raise + _dataloader_init_kwargs_resolve_sampler(dataloader, dataloader.sampler, disallow_batch_sampler=True) + + dataset = RandomDataset(5, 100) + sampler = SequentialSampler(dataset) + batch_sampler = BatchSampler(sampler, batch_size=10, drop_last=False) + dataloader = DataLoader(dataset, batch_sampler=batch_sampler) + + # this should raise - using batch sampler, that was not automatically instantiated by DataLoader + with pytest.raises(MisconfigurationException, match="when running on multiple IPU devices"): + _dataloader_init_kwargs_resolve_sampler(dataloader, dataloader.sampler, disallow_batch_sampler=True) + + @pytest.mark.parametrize("mode", [RunningStage.TRAINING, RunningStage.PREDICTING, RunningStage.TESTING]) def test_dataloader_kwargs_replacement_with_iterable_dataset(mode): """Test that DataLoader kwargs are not replaced when using Iterable Dataset.""" From b37e466f284c48d6e5e1759497f16b49876f7687 Mon Sep 17 00:00:00 2001 From: nitinramvelraj <98356761+nitinramvelraj@users.noreply.github.com> Date: Wed, 27 Jul 2022 03:37:29 -0700 Subject: [PATCH 024/230] Change tests/README.md to reflect repo structure change (#13437) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Jirka Borovec Co-authored-by: Carlos Mocholí --- tests/README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/README.md b/tests/README.md index 71c39e76b4b0a..3b40c32d755dd 100644 --- a/tests/README.md +++ b/tests/README.md @@ -9,10 +9,11 @@ To setup a local development environment, install both local and test dependenci ```bash # clone the repo -git clone https://github.com/PyTorchLightning/pytorch-lightning -cd pytorch-lightning +git clone https://github.com/Lightning-AI/lightning.git +cd lightning # install required depedencies +export PACKAGE_NAME=pytorch python -m pip install ".[dev, examples]" # install pre-commit (optional) python -m pip install pre-commit From 25de48802f5b407aee176138130ddf1a772f65e3 Mon Sep 17 00:00:00 2001 From: donlapark <10988155+donlapark@users.noreply.github.com> Date: Wed, 27 Jul 2022 18:19:29 +0700 Subject: [PATCH 025/230] Fixes various typing errors in `pytorch_lightning/strategies/deepspeed.py` (#13832) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: otaj Co-authored-by: Carlos Mocholí Co-authored-by: Rohit Gupta --- pyproject.toml | 1 - .../plugins/precision/deepspeed.py | 3 +- src/pytorch_lightning/strategies/deepspeed.py | 106 ++++++++++++------ 3 files changed, 71 insertions(+), 39 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 49d3259ea8e94..32cc6e8452d25 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,7 +63,6 @@ module = [ "pytorch_lightning.profilers.simple", "pytorch_lightning.strategies.ddp", "pytorch_lightning.strategies.ddp_spawn", - "pytorch_lightning.strategies.deepspeed", "pytorch_lightning.strategies.fully_sharded", "pytorch_lightning.strategies.ipu", 
"pytorch_lightning.strategies.sharded", diff --git a/src/pytorch_lightning/plugins/precision/deepspeed.py b/src/pytorch_lightning/plugins/precision/deepspeed.py index 4cc12de400ef4..96458487c7420 100644 --- a/src/pytorch_lightning/plugins/precision/deepspeed.py +++ b/src/pytorch_lightning/plugins/precision/deepspeed.py @@ -26,9 +26,10 @@ from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.warnings import WarningCache +_DEEPSPEED_AVAILABLE = _RequirementAvailable("deepspeed") _DEEPSPEED_GREATER_EQUAL_0_6 = _RequirementAvailable("deepspeed>=0.6.0") if TYPE_CHECKING: - if pl.strategies.deepspeed._DEEPSPEED_AVAILABLE: + if _DEEPSPEED_AVAILABLE: import deepspeed warning_cache = WarningCache() diff --git a/src/pytorch_lightning/strategies/deepspeed.py b/src/pytorch_lightning/strategies/deepspeed.py index ede42754aafc9..e7fbcf91967fc 100644 --- a/src/pytorch_lightning/strategies/deepspeed.py +++ b/src/pytorch_lightning/strategies/deepspeed.py @@ -19,7 +19,7 @@ import platform from collections import OrderedDict from pathlib import Path -from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple, Union +from typing import Any, cast, Dict, Generator, List, Mapping, Optional, Tuple, Union import torch from torch import Tensor @@ -48,12 +48,12 @@ from pytorch_lightning.utilities.optimizer import optimizers_to_device from pytorch_lightning.utilities.rank_zero import rank_zero_info from pytorch_lightning.utilities.seed import reset_seed -from pytorch_lightning.utilities.types import _PATH, LRSchedulerConfig, LRSchedulerTypeUnion, STEP_OUTPUT +from pytorch_lightning.utilities.types import _LRScheduler, _PATH, LRSchedulerConfig, ReduceLROnPlateau, STEP_OUTPUT from pytorch_lightning.utilities.warnings import rank_zero_warn, WarningCache warning_cache = WarningCache() -_DEEPSPEED_AVAILABLE: bool = _RequirementAvailable("deepspeed") +_DEEPSPEED_AVAILABLE = _RequirementAvailable("deepspeed") if _DEEPSPEED_AVAILABLE: import deepspeed @@ -76,7 +76,7 @@ def __init__( super().__init__(pl_module) self.precision = precision - def forward(self, *inputs, **kwargs): + def forward(self, *inputs: Any, **kwargs: Any) -> Any: inputs = apply_to_collection(inputs, Tensor, function=self._batch_to) return super().forward(*inputs, **kwargs) @@ -123,7 +123,7 @@ def __init__( reduce_bucket_size: int = 200_000_000, zero_allow_untested_optimizer: bool = True, logging_batch_size_per_gpu: Union[str, int] = "auto", - config: Optional[Union[Path, str, dict]] = None, + config: Optional[Union[_PATH, Dict[str, Any]]] = None, logging_level: int = logging.WARN, parallel_devices: Optional[List[torch.device]] = None, cluster_environment: Optional[ClusterEnvironment] = None, @@ -142,7 +142,7 @@ def __init__( ) -> None: """Provides capabilities to run training using the DeepSpeed library, with training optimizations for large billion parameter models. `For more information: https://pytorch- - lightning.readthedocs.io/en/latest/advanced/advanced_gpu.html#deepspeed`. + lightning.readthedocs.io/en/stable/advanced/model_parallel.html#deepspeed`. .. warning:: ``DeepSpeedStrategy`` is in beta and subject to change. 
@@ -331,7 +331,7 @@ def __init__( self.hysteresis = hysteresis self.min_loss_scale = min_loss_scale - def _load_config(self, config): + def _load_config(self, config: Optional[Union[_PATH, Dict[str, Any]]]) -> Optional[Dict[str, Any]]: if config is None and self.DEEPSPEED_ENV_VAR in os.environ: rank_zero_info(f"Loading DeepSpeed config from set {self.DEEPSPEED_ENV_VAR} environment variable") config = os.environ[self.DEEPSPEED_ENV_VAR] @@ -342,9 +342,10 @@ def _load_config(self, config): ) with open(config) as f: config = json.load(f) + assert isinstance(config, dict) or config is None return config - def setup_distributed(self): + def setup_distributed(self) -> None: reset_seed() # determine which process we are and world size @@ -357,8 +358,10 @@ def setup_distributed(self): self._config_initialized = True def setup(self, trainer: "pl.Trainer") -> None: + assert self.accelerator is not None self.accelerator.setup(trainer) # we set the device so that optimizers can be created with distributed comms. + assert self.lightning_module is not None self.lightning_module._device = self.root_device self.setup_optimizers(trainer) self.setup_precision_plugin() @@ -367,6 +370,7 @@ def setup(self, trainer: "pl.Trainer") -> None: self.barrier() def _init_deepspeed_distributed(self) -> None: + assert self.cluster_environment is not None if platform.system() != "Windows": # do not set env variables on windows, allow deepspeed to control setup self._set_node_environment_variables() @@ -378,7 +382,7 @@ def _init_deepspeed_distributed(self) -> None: self._process_group_backend = self._get_process_group_backend() deepspeed.init_distributed(self._process_group_backend, distributed_port=self.cluster_environment.main_port) - def _get_process_group_backend(self): + def _get_process_group_backend(self) -> str: return ( self._process_group_backend or _get_process_group_backend_from_env() @@ -386,6 +390,7 @@ def _get_process_group_backend(self): ) def _set_node_environment_variables(self) -> None: + assert self.cluster_environment is not None os.environ["MASTER_ADDR"] = self.cluster_environment.main_address os.environ["MASTER_PORT"] = str(self.cluster_environment.main_port) os.environ["RANK"] = str(self.global_rank) @@ -396,7 +401,9 @@ def _set_node_environment_variables(self) -> None: def restore_checkpoint_after_setup(self) -> bool: return True - def _setup_model_and_optimizers(self, model: Module, optimizers: List[Optimizer]) -> Tuple[Module, List[Optimizer]]: + def _setup_model_and_optimizers( + self, model: Module, optimizers: List[Optimizer] + ) -> Tuple["deepspeed.DeepSpeedEngine", List[Optimizer]]: """Setup a model and multiple optimizers together. Currently only a single optimizer is supported. 
@@ -414,14 +421,18 @@ def _setup_model_and_optimizers(self, model: Module, optimizers: List[Optimizer] # train_micro_batch_size_per_gpu is used for throughput logging purposes # normally we set this to the batch size, but it is not available here unless the user provides it # as part of the config + assert self.config is not None self.config.setdefault("train_micro_batch_size_per_gpu", 1) self.model, optimizer = self._setup_model_and_optimizer(model, optimizers[0]) self._set_deepspeed_activation_checkpointing() return self.model, [optimizer] def _setup_model_and_optimizer( - self, model: Module, optimizer: Optimizer, lr_scheduler: Optional[LRSchedulerTypeUnion] = None - ): + self, + model: Module, + optimizer: Optional[Optimizer], + lr_scheduler: Optional[Union[_LRScheduler, ReduceLROnPlateau]] = None, + ) -> Tuple["deepspeed.DeepSpeedEngine", Optimizer]: """Initialize one model and one optimizer with an optional learning rate scheduler. This calls :func:`deepspeed.initialize` internally. @@ -431,14 +442,15 @@ def _setup_model_and_optimizer( args=argparse.Namespace(device_rank=self.root_device.index), config=self.config, model=model, - model_parameters=model_parameters, # type: ignore + model_parameters=model_parameters, optimizer=optimizer, lr_scheduler=lr_scheduler, dist_init_required=False, ) return deepspeed_engine, deepspeed_optimizer - def init_deepspeed(self): + def init_deepspeed(self) -> None: + assert self.lightning_module is not None # deepspeed handles gradient clipping internally if is_overridden("configure_gradient_clipping", self.lightning_module, pl.LightningModule): rank_zero_warn( @@ -464,6 +476,7 @@ def init_deepspeed(self): "DeepSpeed currently does not support different `accumulate_grad_batches` at different epochs." ) + assert isinstance(self.model, (pl.LightningModule, _LightningPrecisionModuleWrapperBase)) model = LightningDeepSpeedModule(pl_module=self.model, precision=self.precision_plugin.precision) if self.lightning_module.trainer and self.lightning_module.trainer.training: @@ -472,6 +485,7 @@ def init_deepspeed(self): self._initialize_deepspeed_inference(model) def _init_optimizers(self) -> Tuple[Optimizer, Optional[LRSchedulerConfig], Optional[int]]: + assert self.lightning_module is not None optimizers, lr_schedulers, optimizer_frequencies = _init_optimizers_and_lr_schedulers(self.lightning_module) if len(optimizers) > 1 or len(lr_schedulers) > 1: raise MisconfigurationException( @@ -485,10 +499,13 @@ def _init_optimizers(self) -> Tuple[Optimizer, Optional[LRSchedulerConfig], Opti @property def zero_stage_3(self) -> bool: - return self.config.get("zero_optimization") and self.config.get("zero_optimization").get("stage") == 3 + assert isinstance(self.config, dict) + zero_optimization = self.config.get("zero_optimization") + return zero_optimization is not None and zero_optimization.get("stage") == 3 - def _initialize_deepspeed_train(self, model): + def _initialize_deepspeed_train(self, model: Module) -> None: optimizer, scheduler = None, None + assert isinstance(self.config, dict) if "optimizer" in self.config: rank_zero_info( "You have specified an optimizer and/or scheduler within the DeepSpeed config." 
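
[Editor's note, not part of the patch] The reworked `zero_stage_3` property above only inspects the `zero_optimization` section of the config. A minimal sketch of a config shape that makes it evaluate to `True`, assuming DeepSpeed is installed; the dict is a placeholder rather than a complete ZeRO stage-3 configuration:

    from pytorch_lightning.strategies import DeepSpeedStrategy

    ds_config = {"zero_optimization": {"stage": 3}}
    strategy = DeepSpeedStrategy(config=ds_config)
    assert strategy.zero_stage_3  # "zero_optimization" is present and its "stage" equals 3
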
@@ -538,7 +555,8 @@ def model_sharded_context(self) -> Generator[None, None, None]: with model_parallel_context: yield - def _set_deepspeed_activation_checkpointing(self): + def _set_deepspeed_activation_checkpointing(self) -> None: + assert isinstance(self.config, dict) if self.config.get("activation_checkpointing"): checkpoint_config = self.config["activation_checkpointing"] deepspeed.checkpointing.configure( @@ -549,8 +567,9 @@ def _set_deepspeed_activation_checkpointing(self): profile=checkpoint_config.get("profile"), ) - def _initialize_deepspeed_inference(self, model): + def _initialize_deepspeed_inference(self, model: Module) -> None: # todo: Currently DeepSpeed requires optimizers at inference to partition weights correctly + assert isinstance(self.config, dict) optimizer, scheduler = None, None if "optimizer" not in self.config: rank_zero_info( @@ -585,13 +604,15 @@ def _initialize_deepspeed_inference(self, model): self.model = model @property - def lightning_module(self): + def lightning_module(self) -> Optional["pl.LightningModule"]: # the model may not be wrapped with DeepEngine & LightningDeepSpeedModule if calling this too early module = getattr(self.model, "module", self.model) - return module.module if isinstance(module, LightningDeepSpeedModule) else module + module = module.module if isinstance(module, LightningDeepSpeedModule) else module + assert isinstance(module, pl.LightningModule) or module is None + return module @property - def distributed_sampler_kwargs(self): + def distributed_sampler_kwargs(self) -> Dict[str, int]: distributed_sampler_kwargs = dict(num_replicas=self.world_size, rank=self.global_rank) return distributed_sampler_kwargs @@ -616,17 +637,18 @@ def handles_gradient_accumulation(self) -> bool: """Whether the plugin handles gradient accumulation internally.""" return True - def _format_config(self): + def _format_config(self) -> None: if self.config is None: raise MisconfigurationException( "To use DeepSpeed you must pass in a DeepSpeed config dict, or a path to a JSON config." 
- " See: https://pytorch-lightning.readthedocs.io/en/latest/advanced/advanced_gpu.html#deepspeed" + " See: https://pytorch-lightning.readthedocs.io/en/stable/advanced/model_parallel.html#deepspeed" ) self._format_batch_size_and_grad_accum_config() self._format_precision_config() - def _format_batch_size_and_grad_accum_config(self): + def _format_batch_size_and_grad_accum_config(self) -> None: # todo: using lite, we do not support these variables within the config + assert isinstance(self.config, dict) if self.lightning_module is None: return @@ -642,16 +664,17 @@ def _format_batch_size_and_grad_accum_config(self): if "gradient_clipping" not in self.config: self.config["gradient_clipping"] = self.lightning_module.trainer.gradient_clip_val or 0.0 - def _auto_select_batch_size(self): + def _auto_select_batch_size(self) -> int: # train_micro_batch_size_per_gpu is used for throughput logging purposes # by default we try to use the batch size of the loader + assert self.lightning_module is not None batch_size = 1 train_dl_source = self.lightning_module.trainer._data_connector._train_dataloader_source if train_dl_source.is_defined(): try: train_dataloader = train_dl_source.dataloader() if hasattr(train_dataloader, "batch_sampler"): - batch_size = train_dataloader.batch_sampler.batch_size + batch_size = train_dataloader.batch_sampler.batch_size # type: ignore[union-attr] # broad exception on purpose as `source.dataloader()` will fail if the dataloader requires `setup` # to have been called before except Exception: @@ -664,6 +687,7 @@ def _auto_select_batch_size(self): return batch_size def _format_precision_config(self) -> None: + assert isinstance(self.config, dict) if self.precision_plugin.precision in (PrecisionType.HALF, PrecisionType.MIXED): if "fp16" not in self.config and self.precision_plugin.amp_type == AMPType.NATIVE: # FP16 is a DeepSpeed standalone AMP implementation @@ -707,7 +731,7 @@ def _create_default_config( single_submit: bool, overlap_events: bool, thread_count: int, - **zero_kwargs, + **zero_kwargs: Any, ) -> Dict: cfg = { "activation_checkpointing": { @@ -753,7 +777,7 @@ def _create_default_config( return cfg @property - def deepspeed_engine(self): + def deepspeed_engine(self) -> "deepspeed.DeepSpeedEngine": return self.model @property @@ -786,7 +810,7 @@ def save_checkpoint(self, checkpoint: Dict, filepath: _PATH, storage_options: Op "When saving the DeepSpeed Stage 3 checkpoint, " "each worker will save a shard of the checkpoint within a directory. " "If a single file is required after training, " - "see https://pytorch-lightning.readthedocs.io/en/latest/advanced/advanced_gpu.html#" + "see https://pytorch-lightning.readthedocs.io/en/stable/advanced/model_parallel.html#" "deepspeed-zero-stage-3-single-file for instructions." 
) # Use deepspeed's internal checkpointing function to handle partitioned weights across processes @@ -799,10 +823,12 @@ def load_checkpoint(self, checkpoint_path: _PATH) -> Dict[str, Any]: if self.load_full_weights and self.zero_stage_3: # Broadcast to ensure we load from the rank 0 checkpoint # This doesn't have to be the case when using deepspeed sharded checkpointing - checkpoint_path = self.broadcast(checkpoint_path) + checkpoint_path = cast(_PATH, self.broadcast(checkpoint_path)) return super().load_checkpoint(checkpoint_path) # Rely on deepspeed to load the checkpoint and necessary information + assert self.lightning_module is not None + from pytorch_lightning.trainer.states import TrainerFn is_fitting = self.lightning_module.trainer.state.fn == TrainerFn.FITTING @@ -818,6 +844,7 @@ def load_checkpoint(self, checkpoint_path: _PATH) -> Dict[str, Any]: @property def lightning_restore_optimizer(self) -> bool: + assert self.lightning_module is not None # managed by DeepSpeed if self.load_full_weights and self.zero_stage_3 and self.lightning_module.trainer.state.fn == TrainerFn.FITTING: rank_zero_warn( @@ -842,11 +869,13 @@ def _restore_zero_state(self, ckpt: Mapping[str, Any]) -> None: ckpt: The ckpt file. """ - def load(module: torch.nn.Module, prefix=""): + assert self.lightning_module is not None + + def load(module: torch.nn.Module, prefix: str = "") -> None: - missing_keys = [] - unexpected_keys = [] - error_msgs = [] + missing_keys: List[str] = [] + unexpected_keys: List[str] = [] + error_msgs: List[str] = [] state_dict = ckpt["state_dict"] # copy state_dict so _load_from_state_dict can modify it @@ -914,14 +943,17 @@ def register_strategies(cls, strategy_registry: Dict) -> None: offload_optimizer_device="nvme", ) - def validation_step(self, *args, **kwargs) -> Optional[STEP_OUTPUT]: + def validation_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: + assert self.model is not None with self.precision_plugin.val_step_context(): return self.model(*args, **kwargs) - def test_step(self, *args, **kwargs) -> Optional[STEP_OUTPUT]: + def test_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: + assert self.model is not None with self.precision_plugin.test_step_context(): return self.model(*args, **kwargs) - def predict_step(self, *args, **kwargs) -> STEP_OUTPUT: + def predict_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: + assert self.model is not None with self.precision_plugin.predict_step_context(): return self.model(*args, **kwargs) From 2a24b906ac9139b57ee6c0282f76c148b70b594d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 27 Jul 2022 14:36:22 +0200 Subject: [PATCH 026/230] Add batch size script argument for standalone tests (#13841) Co-authored-by: Jirka Borovec --- tests/tests_pytorch/run_standalone_tests.sh | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/tests_pytorch/run_standalone_tests.sh b/tests/tests_pytorch/run_standalone_tests.sh index 45264bb74a47d..5297cbd033347 100644 --- a/tests/tests_pytorch/run_standalone_tests.sh +++ b/tests/tests_pytorch/run_standalone_tests.sh @@ -15,6 +15,20 @@ set -e # THIS FILE ASSUMES IT IS RUN INSIDE THE tests/tests_pytorch DIRECTORY +# Batch size for testing: Determines how many standalone test invocations run in parallel +test_batch_size=6 + +while getopts "b:" opt; do + case $opt in + b) + test_batch_size=$OPTARG;; + *) + echo "Usage: $(basename $0) [-b batch_size]" + exit 1;; + esac +done +shift $((OPTIND-1)) + # this environment 
variable allows special tests to run export PL_RUN_STANDALONE_TESTS=1 # python arguments @@ -40,7 +54,6 @@ parametrizations_arr=($parametrizations) # tests to skip - space separated blocklist='profilers/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx utilities/test_warnings.py' report='' -test_batch_size=6 rm -f standalone_test_output.txt # in case it exists, remove it function show_batched_output { From c58d351e0113522ee44362cc85bc85cc96a91b2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 27 Jul 2022 16:45:53 +0200 Subject: [PATCH 027/230] Update version for rc0 release (#13877) --- src/pytorch_lightning/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/__version__.py b/src/pytorch_lightning/__version__.py index 88305e1188f37..6213a2ac0b758 100644 --- a/src/pytorch_lightning/__version__.py +++ b/src/pytorch_lightning/__version__.py @@ -1 +1 @@ -version = "1.7.0dev" +version = "1.7.0rc0" From 95f5f170f545e96d1b57b07144ce252b76beeae5 Mon Sep 17 00:00:00 2001 From: otaj <6065855+otaj@users.noreply.github.com> Date: Wed, 27 Jul 2022 17:32:50 +0200 Subject: [PATCH 028/230] Allowed custom `BatchSampler`s when instantiated in `*_dataloader` hook (#13640) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Rohit Gupta Co-authored-by: Adrian Wälchli --- src/pytorch_lightning/CHANGELOG.md | 2 + src/pytorch_lightning/lite/lite.py | 8 +- .../trainer/connectors/data_connector.py | 6 +- .../utilities/auto_restart.py | 14 +- src/pytorch_lightning/utilities/data.py | 160 +++++++++++---- tests/tests_pytorch/lite/test_lite.py | 7 +- .../utilities/test_auto_restart.py | 10 - tests/tests_pytorch/utilities/test_data.py | 191 ++++++++++++++++-- 8 files changed, 317 insertions(+), 81 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index f8341248b20e8..baf01371fb8bc 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -348,6 +348,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Improved support for custom `DataLoader`s when instantiated in `*_dataloader` hook ([#12981](https://github.com/PyTorchLightning/pytorch-lightning/pull/12981)) +- Allowed custom `BatchSampler`s when instantiated in `*_dataloader` hook [#13640](https://github.com/PyTorchLightning/pytorch-lightning/pull/13640)) + - Fixed an issue with unsupported torch.inference_mode() on hpu backends by making it use no_grad ([#13014](https://github.com/PyTorchLightning/pytorch-lightning/pull/13014)) diff --git a/src/pytorch_lightning/lite/lite.py b/src/pytorch_lightning/lite/lite.py index 0195e6852eb28..981eed30635f6 100644 --- a/src/pytorch_lightning/lite/lite.py +++ b/src/pytorch_lightning/lite/lite.py @@ -22,7 +22,7 @@ import torch.nn as nn from torch import Tensor from torch.optim import Optimizer -from torch.utils.data import DataLoader, DistributedSampler +from torch.utils.data import BatchSampler, DataLoader, DistributedSampler from pytorch_lightning.accelerators.accelerator import Accelerator from pytorch_lightning.lite.wrappers import _LiteDataLoader, _LiteModule, _LiteOptimizer @@ -35,7 +35,7 @@ from pytorch_lightning.utilities.apply_func import apply_to_collection, convert_to_tensors from pytorch_lightning.utilities.data import ( _auto_add_worker_init_fn, - _replace_dataloader_init_method, + _replace_init_method, _update_dataloader, has_iterable_dataset, ) @@ -403,7 +403,9 @@ def _run_impl(self, run_method: Callable, *args: Any, **kwargs: Any) -> Any: def _run_with_strategy_setup(self, run_method: Callable, *args: Any, **kwargs: Any) -> Any: self._strategy.setup_environment() - with self._strategy.model_sharded_context(), _replace_dataloader_init_method(): + with self._strategy.model_sharded_context(), _replace_init_method(DataLoader, "dataset"), _replace_init_method( + BatchSampler + ): return run_method(*args, **kwargs) def _move_model_to_device(self, model: nn.Module, optimizers: List[Optimizer]) -> nn.Module: diff --git a/src/pytorch_lightning/trainer/connectors/data_connector.py b/src/pytorch_lightning/trainer/connectors/data_connector.py index add62ceece65c..7831316a98ae1 100644 --- a/src/pytorch_lightning/trainer/connectors/data_connector.py +++ b/src/pytorch_lightning/trainer/connectors/data_connector.py @@ -17,7 +17,7 @@ from typing import Any, Callable, Collection, List, Optional, Tuple, Union from weakref import proxy -from torch.utils.data import DataLoader, Sampler, SequentialSampler +from torch.utils.data import BatchSampler, DataLoader, Sampler, SequentialSampler from torch.utils.data.distributed import DistributedSampler import pytorch_lightning as pl @@ -31,7 +31,7 @@ from pytorch_lightning.utilities.data import ( _auto_add_worker_init_fn, _is_dataloader_shuffled, - _replace_dataloader_init_method, + _replace_init_method, _update_dataloader, has_iterable_dataset, has_len_all_ranks, @@ -424,7 +424,7 @@ def _request_dataloader(self, stage: RunningStage) -> Union[DataLoader, List[Dat """ source = getattr(self, f"_{stage.dataloader_prefix}_dataloader_source") - with _replace_dataloader_init_method(): + with _replace_init_method(DataLoader, "dataset"), _replace_init_method(BatchSampler): # under this context manager, the arguments passed to `DataLoader.__init__` will be captured and saved as # attributes on the instance in case the dataloader needs to be re-instantiated later by Lightning dataloader = source.dataloader() diff --git a/src/pytorch_lightning/utilities/auto_restart.py b/src/pytorch_lightning/utilities/auto_restart.py index 0bd2942b17cca..3877a1ab3944c 100644 --- 
a/src/pytorch_lightning/utilities/auto_restart.py +++ b/src/pytorch_lightning/utilities/auto_restart.py @@ -16,15 +16,7 @@ from functools import partial, wraps from typing import Any, Callable, Dict, Generator, Iterable, Iterator, List, Optional, Tuple, Union -from torch.utils.data import ( - BatchSampler, - Dataset, - DistributedSampler, - get_worker_info, - RandomSampler, - Sampler, - SequentialSampler, -) +from torch.utils.data import Dataset, DistributedSampler, get_worker_info, RandomSampler, Sampler, SequentialSampler from torch.utils.data.dataloader import ( _BaseDataLoaderIter, _MultiProcessingDataLoaderIter, @@ -757,10 +749,6 @@ def _validate_map_dataset(dataloader: DataLoader) -> None: if sampler is not None and type(sampler) not in SUPPORTED_SAMPLERS: raise TypeError(f"Fault-tolerance supports only {SUPPORTED_SAMPLERS}.") - batch_sampler = getattr(dataloader, "batch_sampler", None) - if batch_sampler is not None and type(batch_sampler) is not BatchSampler: - raise TypeError("Fault-tolerance supports only a `BatchSampler`.") - if type(sampler) is DistributedSampler and sampler.shuffle: raise TypeError("A `DistributedSampler` sampler shuffle attribute is set to True.") elif type(sampler) is RandomSampler: diff --git a/src/pytorch_lightning/utilities/data.py b/src/pytorch_lightning/utilities/data.py index e60c56f6c7a7e..862c7f2de905b 100644 --- a/src/pytorch_lightning/utilities/data.py +++ b/src/pytorch_lightning/utilities/data.py @@ -14,6 +14,7 @@ import functools import inspect import os +from collections import OrderedDict from contextlib import contextmanager from dataclasses import fields from functools import partial @@ -220,11 +221,11 @@ def _get_dataloader_init_args_and_kwargs( if not isinstance(dataloader, DataLoader): raise ValueError(f"The dataloader {dataloader} needs to subclass `torch.utils.data.DataLoader`") - was_wrapped = hasattr(dataloader, "__pl_dl_args") + was_wrapped = hasattr(dataloader, "__pl_saved_args") if was_wrapped: - dl_args = dataloader.__pl_dl_args - dl_kwargs = dataloader.__pl_dl_kwargs - arg_names = dataloader.__pl_dl_arg_names + dl_args = dataloader.__pl_saved_args + dl_kwargs = dataloader.__pl_saved_kwargs + arg_names = dataloader.__pl_saved_arg_names original_dataset = dataloader.__dataset # we have this saved from _wrap_init else: # get the dataloader instance attributes @@ -323,6 +324,9 @@ def _dataloader_init_kwargs_resolve_sampler( If the dataloader is being used for prediction, the sampler will be wrapped into an `IndexBatchSamplerWrapper`, so Lightning can keep track of its indices. If fault tolerant training is enabled, the sampler will be wrapped into a `FastForwardSampler`. + + If there are multiple devices in IPU mode, it is necessary to disallow BatchSampler that isn't instantiated + automatically, since `poptorch.DataLoader` will try to increase the batch_size """ fault_tolerant_mode = _FaultTolerantMode.detect_current_mode() batch_sampler = getattr(dataloader, "batch_sampler") @@ -341,11 +345,59 @@ def _dataloader_init_kwargs_resolve_sampler( "when running on multiple IPU devices." 
) elif type(batch_sampler) is not BatchSampler or is_predicting: - batch_sampler = type(batch_sampler)( - sampler, - batch_size=batch_sampler.batch_size, - drop_last=(False if is_predicting else batch_sampler.drop_last), - ) + batch_sampler_cls = type(batch_sampler) + if hasattr(batch_sampler, "__pl_saved_args"): + args = batch_sampler.__pl_saved_args + kwargs = batch_sampler.__pl_saved_kwargs + default_kwargs = batch_sampler.__pl_saved_default_kwargs + arg_names = batch_sampler.__pl_saved_arg_names + + if is_predicting: + success, args, kwargs = _replace_value_in_saved_args( + "drop_last", False, args, kwargs, default_kwargs, arg_names + ) + if not success: + rank_zero_warn( + f"Trying to inject `drop_last=False` into batch sampler since you are predicting, however " + f"it seems the class `{batch_sampler_cls.__qualname__}` does not support it. " + "Your predictions might be incomplete. To mitigate this, expose `drop_last` in " + "the `__init__` method of your custom class." + ) + + success, args, kwargs = _replace_value_in_saved_args( + "sampler", sampler, args, kwargs, default_kwargs, arg_names + ) + if not success: + raise TypeError( + "Trying to inject a modified sampler into the batch sampler; however, it seems the class " + f"`{batch_sampler_cls.__qualname__}` does not have an argument called `sampler.` To mitigate " + "this, expose an argument `sampler` in the `__init__` method of your custom class." + ) + + batch_sampler = batch_sampler_cls(*args, **kwargs) + else: + try: + batch_sampler = batch_sampler_cls( + sampler, + batch_size=batch_sampler.batch_size, + drop_last=(False if is_predicting else batch_sampler.drop_last), + ) + except TypeError as e: + import re + + match = re.match(r".*__init__\(\) (got multiple values)|(missing \d required)", str(e)) + if not match: + # an unexpected `TypeError`, continue failure + raise + + # There could either be too few or too many arguments. Customizing the message based on this doesn't + # make much sense since our MisconfigurationException is going to be raised from the original one. + raise MisconfigurationException( + "We tried to re-instantiate your custom batch sampler and failed. " + "To mitigate this, either follow the API of `BatchSampler` or instantiate " + "your custom batch sampler inside `*_dataloader` hooks of your module." + ) from e + if is_predicting: batch_sampler = IndexBatchSamplerWrapper(batch_sampler) @@ -368,39 +420,73 @@ def _dataloader_init_kwargs_resolve_sampler( return {"sampler": sampler, "shuffle": False, "batch_sampler": None} +def _replace_value_in_saved_args( + replace_key: str, + replace_value: Any, + args: Tuple[Any, ...], + kwargs: Dict[str, Any], + default_kwargs: Dict[str, Any], + arg_names: Tuple[str, ...], +) -> Tuple[bool, Tuple[Any, ...], Dict[str, Any]]: + """Tries to replace an argument value in a saved list of args and kwargs. 
+ + Returns a tuple indicating success of the operation and modified saved args and kwargs + """ + + if replace_key in arg_names: + replace_index = arg_names.index(replace_key) + args = args[:replace_index] + (replace_value,) + args[replace_index + 1 :] + return True, args, kwargs + elif replace_key in kwargs or replace_key in default_kwargs: + kwargs[replace_key] = replace_value + return True, args, kwargs + + return False, args, kwargs + + def _auto_add_worker_init_fn(dataloader: DataLoader, rank: int) -> None: if int(os.environ.get("PL_SEED_WORKERS", 0)) and dataloader.worker_init_fn is None: dataloader.worker_init_fn = partial(pl_worker_init_function, rank=rank) -def _wrap_dataloader_init(init: Callable) -> Callable: - """Wraps the ``__init__`` method of :class:`~torch.utils.data.DataLoader` in order to enable re-instantiation - of custom subclasses.""" +def _wrap_init_method(init: Callable, store_explicit_arg: Optional[str] = None) -> Callable: + """Wraps the ``__init__`` method of classes (currently :class:`~torch.utils.data.DataLoader` and + :class:`~torch.utils.data.BatchSampler`) in order to enable re-instantiation of custom subclasses.""" @functools.wraps(init) - def wrapper(obj: DataLoader, *args: Any, **kwargs: Any) -> None: + def wrapper(obj: Any, *args: Any, **kwargs: Any) -> None: # We need to inspect `init`, as inspecting `obj.__init__` # can lead to inspecting the wrong function with multiple inheritance params = inspect.signature(init).parameters - param_names = tuple( - param.name + + parameters_defaults = OrderedDict( + (param.name, param.default) for param in params.values() if param.name != "self" and param.kind not in (param.VAR_POSITIONAL, param.VAR_KEYWORD) ) - param_names = param_names[: len(args)] - if not hasattr(obj, "__pl_dl_args"): - obj.__pl_dl_args = args - obj.__pl_dl_kwargs = kwargs - obj.__pl_dl_arg_names = param_names + param_names = tuple(parameters_defaults)[: len(args)] - # We want to use the latest possible value for dataset argument (i.e. ideally what gets passed to DataLoader) + default_kwargs = { + name: value + for name, value in parameters_defaults.items() + if name not in kwargs and name not in param_names and value != inspect.Parameter.empty + } + + if not hasattr(obj, "__pl_saved_args"): + obj.__pl_saved_args = args + obj.__pl_saved_kwargs = kwargs + obj.__pl_saved_arg_names = param_names + obj.__pl_saved_default_kwargs = default_kwargs + + # We want to use the latest possible value for explicit argument (i.e. ideally what gets passed to base class) # so that we can be sure, that it will not get changed anymore. # That is why we are setting this in every `__init__` - if "dataset" in param_names: - setattr(obj, "__dataset", args[param_names.index("dataset")]) - elif "dataset" in kwargs: - setattr(obj, "__dataset", kwargs["dataset"]) + if store_explicit_arg is not None: + if store_explicit_arg in param_names: + setattr(obj, f"__{store_explicit_arg}", args[param_names.index(store_explicit_arg)]) + elif store_explicit_arg in kwargs: + setattr(obj, f"__{store_explicit_arg}", kwargs[store_explicit_arg]) init(obj, *args, **kwargs) @@ -422,15 +508,17 @@ def recurse(cl: Type[Any]) -> None: @contextmanager -def _replace_dataloader_init_method() -> Generator[None, None, None]: - """This context manager is used to add support for re-instantiation of custom (subclasses) of - :class:`~torch.utils.data.DataLoader`. 
It patches the ``__init__`` method.""" - classes = _get_all_subclasses(DataLoader) | {DataLoader} +def _replace_init_method(base_cls: Type, store_explicit_arg: Optional[str] = None) -> Generator[None, None, None]: + """This context manager is used to add support for re-instantiation of custom (subclasses) of `base_cls`. + + It patches the ``__init__`` method. + """ + classes = _get_all_subclasses(base_cls) | {base_cls} wrapped = set() for cls in classes: if cls.__init__ not in wrapped: cls._old_init = cls.__init__ - cls.__init__ = _wrap_dataloader_init(cls.__init__) + cls.__init__ = _wrap_init_method(cls.__init__, store_explicit_arg) wrapped.add(cls.__init__) yield for cls in classes: @@ -475,13 +563,13 @@ def _apply_fault_tolerant_automatic_capture_dataset_wrapper( def _is_dataloader_shuffled(dataloader: object) -> bool: - if hasattr(dataloader, "__pl_dl_kwargs"): + if hasattr(dataloader, "__pl_saved_kwargs"): # this attribute is not part of PyTorch's DataLoader, but could have been set by - # our `_replace_dataloader_init_method` context manager - if "shuffle" in dataloader.__pl_dl_kwargs: - return dataloader.__pl_dl_kwargs["shuffle"] - if "shuffle" in dataloader.__pl_dl_arg_names: - return dataloader.__pl_dl_args[dataloader.__pl_dl_arg_names.index("shuffle")] + # our `_replace_init_method` context manager + if "shuffle" in dataloader.__pl_saved_kwargs: + return dataloader.__pl_saved_kwargs["shuffle"] + if "shuffle" in dataloader.__pl_saved_arg_names: + return dataloader.__pl_saved_args[dataloader.__pl_saved_arg_names.index("shuffle")] if isinstance(dataloader.dataset, IterableDataset): # shuffling is useless with iterable datasets return False diff --git a/tests/tests_pytorch/lite/test_lite.py b/tests/tests_pytorch/lite/test_lite.py index c0439854013a2..3652613526549 100644 --- a/tests/tests_pytorch/lite/test_lite.py +++ b/tests/tests_pytorch/lite/test_lite.py @@ -177,16 +177,17 @@ def test_setup_dataloaders_return_type(): assert lite_dataloader1.dataset is dataset1 -@mock.patch("pytorch_lightning.lite.lite._replace_dataloader_init_method") +@mock.patch("pytorch_lightning.lite.lite._replace_init_method") def test_setup_dataloaders_captures_dataloader_arguments(ctx_manager): """Test that Lite intercepts the DataLoader constructor arguments with a context manager in its run method.""" class Lite(LightningLite): def run(self): - ctx_manager().__enter__.assert_called_once() + # One for BatchSampler, another for DataLoader + assert ctx_manager().__enter__.call_count == 2 Lite().run() - ctx_manager().__exit__.assert_called_once() + assert ctx_manager().__exit__.call_count == 2 def test_setup_dataloaders_raises_for_unknown_custom_args(): diff --git a/tests/tests_pytorch/utilities/test_auto_restart.py b/tests/tests_pytorch/utilities/test_auto_restart.py index 5a5982ad009f9..8a888ce09c90a 100644 --- a/tests/tests_pytorch/utilities/test_auto_restart.py +++ b/tests/tests_pytorch/utilities/test_auto_restart.py @@ -34,7 +34,6 @@ from torch.utils.data._utils.worker import _generate_state, get_worker_info from torch.utils.data.dataloader import DataLoader, default_collate from torch.utils.data.dataset import Dataset, IterableDataset -from torch.utils.data.sampler import Sampler import tests_pytorch.helpers.utils as tutils from pytorch_lightning import Callback, LightningModule, seed_everything, Trainer @@ -1177,15 +1176,6 @@ class CustomRandomSampler(RandomSampler): with pytest.raises(TypeError, match="RandomSampler"): _validate_fault_tolerant_automatic(dl, RunningStage.TRAINING) - class 
CustomBatchSampler(BatchSampler): - pass - - sampler = Sampler(data()) - batch_sampler = CustomBatchSampler(sampler, 2, False) - dl = DataLoader(data(), batch_sampler=batch_sampler) - with pytest.raises(TypeError, match="BatchSampler"): - _validate_fault_tolerant_automatic(dl, RunningStage.TRAINING) - class CustomIterable(IterableDataset): pass diff --git a/tests/tests_pytorch/utilities/test_data.py b/tests/tests_pytorch/utilities/test_data.py index 5f66d802ea939..ffb898efaa815 100644 --- a/tests/tests_pytorch/utilities/test_data.py +++ b/tests/tests_pytorch/utilities/test_data.py @@ -3,15 +3,17 @@ import pytest import torch from torch import Tensor -from torch.utils.data import BatchSampler, DataLoader, SequentialSampler +from torch.utils.data import BatchSampler, DataLoader, RandomSampler, SequentialSampler from pytorch_lightning import Trainer from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset +from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities.data import ( _dataloader_init_kwargs_resolve_sampler, _get_dataloader_init_args_and_kwargs, - _replace_dataloader_init_method, + _replace_init_method, + _replace_value_in_saved_args, _update_dataloader, extract_batch_size, get_len, @@ -145,7 +147,7 @@ def __init__(self, foo, *args, **kwargs): with pytest.raises(MisconfigurationException, match="`DataLoader` implementation has an error.*`dataset`"): _update_dataloader(dataloader, dataloader.sampler) - with _replace_dataloader_init_method(): + with _replace_init_method(DataLoader, "dataset"): dataloader = BadStandaloneGoodHookImpl([1, 2, 3]) new_dataloader = _update_dataloader(dataloader, dataloader.sampler) assert isinstance(new_dataloader, BadStandaloneGoodHookImpl) @@ -296,13 +298,14 @@ def __init__(self, dataset, **kwargs): pytest.param(ChangingDataLoader, (range(5),), dict(), ("dataset",), list(range(10)), dict(), id="test9"), ], ) -def test_replace_dataloader_init_method(cls, args, kwargs, arg_names, dataset, checked_values): - with _replace_dataloader_init_method(): +def test_replace_init_method_dataloader(cls, args, kwargs, arg_names, dataset, checked_values): + with _replace_init_method(DataLoader, "dataset"): dataloader = cls(*args, **kwargs) - assert dataloader.__pl_dl_args == args - assert dataloader.__pl_dl_kwargs == kwargs - assert dataloader.__pl_dl_arg_names == arg_names + assert dataloader.__pl_saved_args == args + assert dataloader.__pl_saved_kwargs == kwargs + assert dataloader.__pl_saved_arg_names == arg_names + assert dataloader.__pl_saved_default_kwargs == {} assert dataloader.__dataset == dataset assert dataloader.dataset == dataset @@ -312,14 +315,15 @@ def test_replace_dataloader_init_method(cls, args, kwargs, arg_names, dataset, c if isinstance(dataloader_value, torch.Tensor): assert dataloader_value is value else: - assert getattr(dataloader, key) == value + assert dataloader_value == value dataloader = _update_dataloader(dataloader, dataloader.sampler) assert isinstance(dataloader, cls) - assert not hasattr(dataloader, "__pl_dl_kwargs") - assert not hasattr(dataloader, "__pl_dl_arg_names") - assert not hasattr(dataloader, "__pl_dl_args") + assert not hasattr(dataloader, "__pl_saved_kwargs") + assert not hasattr(dataloader, "__pl_saved_arg_names") + assert not hasattr(dataloader, "__pl_saved_args") + assert not hasattr(dataloader, "__pl_saved_default_kwargs") assert not hasattr(dataloader, "__dataset") assert dataloader.dataset 
== dataset @@ -329,7 +333,168 @@ def test_replace_dataloader_init_method(cls, args, kwargs, arg_names, dataset, c if isinstance(dataloader_value, torch.Tensor): assert dataloader_value is value else: - assert getattr(dataloader, key) == value + assert dataloader_value == value + + +def test_replace_init_method_extra_kwargs(): + class LoaderSubclass(DataLoader): + def __init__(self, dataset, *args, batch_size=10, **kwargs): + super().__init__(dataset, *args, batch_size=batch_size, **kwargs) + + with _replace_init_method(DataLoader, "dataset"): + dataloader = LoaderSubclass(range(10)) + + assert dataloader.__pl_saved_args == (range(10),) + assert dataloader.__pl_saved_kwargs == {} + assert dataloader.__pl_saved_arg_names == ("dataset",) + assert dataloader.__pl_saved_default_kwargs == {"batch_size": 10} + assert dataloader.__dataset == range(10) + + +@pytest.mark.parametrize("predicting", [True, False]) +def test_custom_batch_sampler(predicting): + """This test asserts, that custom `BatchSampler`, with all the arguments, that are required in order to + properly reinstantiate the class, is invoked properly. + + It also asserts, that during the reinstantiation, the wrapper of `__init__` method is not present anymore, therefore + not setting `__pl_saved_{args,arg_names,kwargs}` attributes. + """ + + class MyBatchSampler(BatchSampler): + # Custom Batch sampler with extra argument and default value + def __init__(self, sampler, extra_arg, drop_last=True): + self.extra_arg = extra_arg + super().__init__(sampler, 10, drop_last) + + sampler = RandomSampler(range(10)) + with _replace_init_method(BatchSampler): + # instantiate within `_replace_init_method` context manager, simulating `*_dataloader` hooks + batch_sampler = MyBatchSampler(sampler, "random_str") + + dataloader = DataLoader(range(10), batch_sampler=batch_sampler) + + # assert that passed information got saved + assert dataloader.batch_sampler.__pl_saved_args == (sampler, "random_str") + assert dataloader.batch_sampler.__pl_saved_kwargs == {} + assert dataloader.batch_sampler.__pl_saved_arg_names == ("sampler", "extra_arg") + assert dataloader.batch_sampler.__pl_saved_default_kwargs == {"drop_last": True} + + # updating dataloader, what happens on access of the dataloaders. + # This should not fail, and would fail before support for custom args. 
+ dataloader = _update_dataloader( + dataloader, dataloader.sampler, mode=RunningStage.PREDICTING if predicting else None + ) + + # Assert the `__init__` method is not replaced anymore and everything is instantiated to correct types + batch_sampler = dataloader.batch_sampler + + if predicting: + assert isinstance(batch_sampler, IndexBatchSamplerWrapper) + batch_sampler = batch_sampler._sampler + + assert isinstance(batch_sampler, MyBatchSampler) + assert batch_sampler.drop_last == (not predicting) + + assert batch_sampler.extra_arg == "random_str" + assert not hasattr(batch_sampler, "__pl_saved_kwargs") + assert not hasattr(batch_sampler, "__pl_saved_arg_names") + assert not hasattr(batch_sampler, "__pl_saved_args") + assert not hasattr(batch_sampler, "__pl_saved_default_kwargs") + + +def test_custom_batch_sampler_no_drop_last(): + """Tests whether appropriate warning is raised when the custom `BatchSampler` does not support `drop_last` and + we want to reset it.""" + + class MyBatchSampler(BatchSampler): + # Custom batch sampler with extra argument, but without `drop_last` + def __init__(self, sampler, extra_arg): + self.extra_arg = extra_arg + super().__init__(sampler, 10, False) + + sampler = RandomSampler(range(10)) + with _replace_init_method(BatchSampler): + # instantiate within `_replace_init_method` context manager, simulating `*_dataloader` hooks + batch_sampler = MyBatchSampler(sampler, "random_str") + + dataloader = DataLoader(range(10), batch_sampler=batch_sampler) + + # assert that passed information got saved + assert dataloader.batch_sampler.__pl_saved_args == (sampler, "random_str") + assert dataloader.batch_sampler.__pl_saved_kwargs == {} + assert dataloader.batch_sampler.__pl_saved_arg_names == ("sampler", "extra_arg") + assert dataloader.batch_sampler.__pl_saved_default_kwargs == {} + + # Assert that warning is raised + with pytest.warns(UserWarning, match="drop_last=False"): + dataloader = _update_dataloader(dataloader, dataloader.sampler, mode=RunningStage.PREDICTING) + + +def test_custom_batch_sampler_no_sampler(): + """Tests whether appropriate error is raised when the custom `BatchSampler` does not support sampler + argument.""" + + class MyBatchSampler(BatchSampler): + # Custom batch sampler, without sampler argument. 
+ def __init__(self, extra_arg): + self.extra_arg = extra_arg + super().__init__(RandomSampler(range(10)), 10, False) + + with _replace_init_method(BatchSampler): + # instantiate within `_replace_init_method` context manager, simulating `*_dataloader` hooks + batch_sampler = MyBatchSampler("random_str") + dataloader = DataLoader(range(10), batch_sampler=batch_sampler) + + # assert that passed information got saved + assert dataloader.batch_sampler.__pl_saved_args == ("random_str",) + assert dataloader.batch_sampler.__pl_saved_kwargs == {} + assert dataloader.batch_sampler.__pl_saved_arg_names == ("extra_arg",) + assert dataloader.batch_sampler.__pl_saved_default_kwargs == {} + + # Assert that error is raised + with pytest.raises(TypeError, match="sampler into the batch sampler"): + dataloader = _update_dataloader(dataloader, dataloader.sampler, mode=RunningStage.PREDICTING) + + +@pytest.mark.parametrize( + [ + "args", + "kwargs", + "default_kwargs", + "arg_names", + "replace_key", + "replace_value", + "expected_status", + "expected_args", + "expected_kwargs", + ], + [ + pytest.param((), {}, {}, [], "a", 1, False, (), {}, id="empty"), + pytest.param((1,), {}, {}, ["a"], "a", 2, True, (2,), {}, id="simple1"), + pytest.param((1, 2, 3), {}, {}, ["a", "b", "c"], "b", False, True, (1, False, 3), {}, id="simple2"), + pytest.param((1, 2, 3), {"a": 1}, {}, ["b", "c", "d"], "a", 2, True, (1, 2, 3), {"a": 2}, id="simple_kwargs"), + pytest.param( + (1, 2, 3), + {"a": 1}, + {"e": 5}, + ["b", "c", "d"], + "e", + 2, + True, + (1, 2, 3), + {"a": 1, "e": 2}, + id="default_kwargs", + ), + ], +) +def test_replace_value_in_args( + args, kwargs, default_kwargs, arg_names, replace_key, replace_value, expected_status, expected_args, expected_kwargs +): + assert _replace_value_in_saved_args(replace_key, replace_value, args, kwargs, default_kwargs, arg_names) == ( + expected_status, + expected_args, + expected_kwargs, + ) def test_dataloader_disallow_batch_sampler(): From fff62f0ae5af52bdcd7e65b82ecbc5f13bda7715 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 27 Jul 2022 17:40:40 +0200 Subject: [PATCH 029/230] Fix TPU testing and collect all tests (#11098) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Jirka Borovec Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> --- .azure/gpu-tests.yml | 9 ++ .circleci/config.yml | 10 +-- dockers/tpu-tests/tpu_test_cases.jsonnet | 15 ++-- .../plugins/training_type/single_tpu.py | 2 +- .../strategies/launchers/xla.py | 44 +++++++--- src/pytorch_lightning/strategies/tpu_spawn.py | 15 ++-- .../test_accelerator_connector.py | 2 +- tests/tests_pytorch/accelerators/test_ipu.py | 2 +- tests/tests_pytorch/accelerators/test_tpu.py | 21 ++--- .../callbacks/test_device_stats_monitor.py | 12 ++- tests/tests_pytorch/conftest.py | 1 + .../deprecated_api/test_remove_1-8.py | 6 +- tests/tests_pytorch/helpers/runif.py | 2 + tests/tests_pytorch/helpers/utils.py | 36 -------- tests/tests_pytorch/lite/test_lite.py | 34 ++++---- tests/tests_pytorch/models/test_horovod.py | 2 +- tests/tests_pytorch/models/test_tpu.py | 87 ++++++++----------- .../environments/test_xla_environment.py | 4 +- .../profilers/test_xla_profiler.py | 1 - tests/tests_pytorch/run_standalone_tasks.sh | 48 ++++++++++ tests/tests_pytorch/run_standalone_tests.sh | 26 +----- 
.../strategies/test_tpu_spawn.py | 9 +- .../test_estimated_stepping_batches.py | 28 ++++-- 23 files changed, 213 insertions(+), 203 deletions(-) create mode 100644 tests/tests_pytorch/run_standalone_tasks.sh diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index b5dbd9e3340c7..8e8e2edb91d85 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -116,6 +116,15 @@ jobs: timeoutInMinutes: "35" condition: eq(variables['continue'], '1') + - bash: bash run_standalone_tasks.sh + workingDirectory: tests/tests_pytorch + env: + PL_USE_MOCKED_MNIST: "1" + PL_RUN_CUDA_TESTS: "1" + displayName: 'Testing: PyTorch standalone tasks' + timeoutInMinutes: "10" + condition: eq(variables['continue'], '1') + - bash: | python -m coverage report python -m coverage xml diff --git a/.circleci/config.yml b/.circleci/config.yml index 91d57cd707b97..7ac10195c75a9 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -81,6 +81,8 @@ references: job_name=$(jsonnet -J ml-testing-accelerators/ dockers/tpu-tests/tpu_test_cases.jsonnet | kubectl create -f -) && \ job_name=${job_name#job.batch/} job_name=${job_name% created} + pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}') + echo "GKE pod name: $pod_name" echo "Waiting on kubernetes job: $job_name" i=0 && \ # N checks spaced 30s apart = 900s total. @@ -92,8 +94,6 @@ references: printf "Waiting for job to finish: " && \ while [ $i -lt $MAX_CHECKS ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else printf "."; fi; sleep $CHECK_SPEEP; done && \ echo "Done waiting. Job status code: $status_code" && \ - pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}') && \ - echo "GKE pod name: $pod_name" && \ kubectl logs -f $pod_name --container=train > /tmp/full_output.txt if grep -q '' /tmp/full_output.txt ; then csplit /tmp/full_output.txt '//'; else mv /tmp/full_output.txt xx00; fi && \ # First portion is the test logs. Print these to Github Action stdout. @@ -106,10 +106,6 @@ references: name: Statistics command: | mv ./xx01 coverage.xml - # TODO: add human readable report - cat coverage.xml - sudo pip install pycobertura - pycobertura show coverage.xml jobs: @@ -119,7 +115,7 @@ jobs: environment: - XLA_VER: 1.9 - PYTHON_VER: 3.7 - - MAX_CHECKS: 240 + - MAX_CHECKS: 1000 - CHECK_SPEEP: 5 steps: - checkout diff --git a/dockers/tpu-tests/tpu_test_cases.jsonnet b/dockers/tpu-tests/tpu_test_cases.jsonnet index e3f5f1d98802a..18a0c894c31a2 100644 --- a/dockers/tpu-tests/tpu_test_cases.jsonnet +++ b/dockers/tpu-tests/tpu_test_cases.jsonnet @@ -8,7 +8,7 @@ local tputests = base.BaseTest { mode: 'postsubmit', configMaps: [], - timeout: 1200, # 20 minutes, in seconds. + timeout: 6000, # 100 minutes, in seconds. 
image: 'pytorchlightning/pytorch_lightning', imageTag: 'base-xla-py{PYTHON_VERSION}-torch{PYTORCH_VERSION}', @@ -34,16 +34,11 @@ local tputests = base.BaseTest { pip install -e .[test] echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}" + export PL_RUN_TPU_TESTS=1 cd tests/tests_pytorch - echo $PWD - # TODO (@kaushikb11): Add device stats tests here - coverage run --source pytorch_lightning -m pytest -v --capture=no \ - strategies/test_tpu_spawn.py \ - profilers/test_xla_profiler.py \ - accelerators/test_tpu.py \ - models/test_tpu.py \ - plugins/environments/test_xla_environment.py \ - utilities/test_xla_device_utils.py + coverage run --source=pytorch_lightning -m pytest -vv --durations=0 ./ + echo "\n||| Running standalone tests |||\n" + bash run_standalone_tests.sh -b 1 test_exit_code=$? echo "\n||| END PYTEST LOGS |||\n" coverage xml diff --git a/src/pytorch_lightning/plugins/training_type/single_tpu.py b/src/pytorch_lightning/plugins/training_type/single_tpu.py index 51713fa4f0ee2..5d305a51c497a 100644 --- a/src/pytorch_lightning/plugins/training_type/single_tpu.py +++ b/src/pytorch_lightning/plugins/training_type/single_tpu.py @@ -18,7 +18,7 @@ class SingleTPUPlugin(SingleTPUStrategy): def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] rank_zero_deprecation( - "The `pl.plugins.training_type.single_tpu.SingleTPUPlugin` is deprecated in v1.6 and will be removed in." + "The `pl.plugins.training_type.single_tpu.SingleTPUPlugin` is deprecated in v1.6 and will be removed in" " v1.8. Use `pl.strategies.single_tpu.SingleTPUStrategy` instead." ) super().__init__(*args, **kwargs) diff --git a/src/pytorch_lightning/strategies/launchers/xla.py b/src/pytorch_lightning/strategies/launchers/xla.py index 699f92bed72e7..037ec027bfd7d 100644 --- a/src/pytorch_lightning/strategies/launchers/xla.py +++ b/src/pytorch_lightning/strategies/launchers/xla.py @@ -13,10 +13,12 @@ # limitations under the License. 
import os import time +from functools import wraps from multiprocessing.queues import SimpleQueue -from typing import Any, Callable, Optional, TYPE_CHECKING +from typing import Any, Callable, Optional, Tuple, TYPE_CHECKING import torch.multiprocessing as mp +from torch.multiprocessing import ProcessContext import pytorch_lightning as pl from pytorch_lightning.strategies.launchers.multiprocessing import _FakeQueue, _MultiProcessingLauncher, _WorkerOutput @@ -26,9 +28,10 @@ from pytorch_lightning.utilities.rank_zero import rank_zero_debug if _TPU_AVAILABLE: + import torch_xla.core.xla_model as xm import torch_xla.distributed.xla_multiprocessing as xmp else: - xm, xmp, MpDeviceLoader, rendezvous = [None] * 4 + xm, xmp = None, None if TYPE_CHECKING: from pytorch_lightning.strategies import Strategy @@ -72,7 +75,7 @@ def launch(self, function: Callable, *args: Any, trainer: Optional["pl.Trainer"] """ context = mp.get_context(self._start_method) return_queue = context.SimpleQueue() - xmp.spawn( + _save_spawn( self._wrapping_function, args=(trainer, function, args, kwargs, return_queue), nprocs=len(self._strategy.parallel_devices), @@ -103,14 +106,6 @@ def _wrapping_function( if self._strategy.local_rank == 0: return_queue.put(move_data_to_device(results, "cpu")) - # https://github.com/pytorch/xla/issues/1801#issuecomment-602799542 - self._strategy.barrier("end-process") - - # Ensure that the rank 0 process is the one exiting last - # https://github.com/pytorch/xla/issues/2190#issuecomment-641665358 - if self._strategy.local_rank == 0: - time.sleep(2) - def _collect_rank_zero_results(self, trainer: "pl.Trainer", results: Any) -> Optional["_WorkerOutput"]: rank_zero_debug("Collecting results from rank 0 process.") checkpoint_callback = trainer.checkpoint_callback @@ -138,3 +133,30 @@ def _collect_rank_zero_results(self, trainer: "pl.Trainer", results: Any) -> Opt self.add_to_queue(trainer, extra) return _WorkerOutput(best_model_path, weights_path, trainer.state, results, extra) + + +def _save_spawn( + fn: Callable, + args: Tuple = (), + nprocs: Optional[int] = None, + join: bool = True, + daemon: bool = False, + start_method: str = "spawn", +) -> Optional[ProcessContext]: + """Wraps the :func:`torch_xla.distributed.xla_multiprocessing.spawn` with added teardown logic for the worker + processes.""" + + @wraps(fn) + def wrapped(rank: int, *_args: Any) -> None: + fn(rank, *_args) + + # Make all processes wait for each other before joining + # https://github.com/pytorch/xla/issues/1801#issuecomment-602799542 + xm.rendezvous("end-process") + + # Ensure that the rank 0 process is the one exiting last + # https://github.com/pytorch/xla/issues/2190#issuecomment-641665358 + if rank == 0: + time.sleep(1) + + return xmp.spawn(wrapped, args=args, nprocs=nprocs, join=join, daemon=daemon, start_method=start_method) diff --git a/src/pytorch_lightning/strategies/tpu_spawn.py b/src/pytorch_lightning/strategies/tpu_spawn.py index f4953a9f64baa..2d474fafe51b1 100644 --- a/src/pytorch_lightning/strategies/tpu_spawn.py +++ b/src/pytorch_lightning/strategies/tpu_spawn.py @@ -74,6 +74,7 @@ def __init__( start_method="fork", ) self.debug = debug + self._launched = False @property def checkpoint_io(self) -> CheckpointIO: @@ -90,6 +91,8 @@ def checkpoint_io(self, io: Optional[CheckpointIO]) -> None: @property def root_device(self) -> torch.device: + if not self._launched: + raise RuntimeError("Accessing the XLA device before processes have spawned is not allowed.") return xm.xla_device() @staticmethod @@ -130,7 +133,7 @@ 
def setup(self, trainer: "pl.Trainer") -> None: self.accelerator.setup(trainer) if self.debug: - os.environ["PT_XLA_DEBUG"] = str(1) + os.environ["PT_XLA_DEBUG"] = "1" shared_params = find_shared_parameters(self.model) self.model_to_device() @@ -150,8 +153,8 @@ def distributed_sampler_kwargs(self) -> Dict[str, int]: @property def is_distributed(self) -> bool: - # HOST_WORLD_SIZE is None outside the xmp.spawn process - return os.getenv(xenv.HOST_WORLD_SIZE, None) and self.world_size != 1 + # HOST_WORLD_SIZE is not set outside the xmp.spawn process + return (xenv.HOST_WORLD_SIZE in os.environ) and self.world_size != 1 def process_dataloader(self, dataloader: DataLoader) -> MpDeviceLoader: TPUSpawnStrategy._validate_dataloader(dataloader) @@ -189,8 +192,9 @@ def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ invalid_reduce_op = isinstance(reduce_op, ReduceOp) and reduce_op != ReduceOp.SUM invalid_reduce_op_str = isinstance(reduce_op, str) and reduce_op.lower() not in ("sum", "mean", "avg") if invalid_reduce_op or invalid_reduce_op_str: - raise MisconfigurationException( - "Currently, TPUSpawn Strategy only support `sum`, `mean`, `avg` reduce operation." + raise ValueError( + "Currently, the TPUSpawnStrategy only supports `sum`, `mean`, `avg` for the reduce operation, got:" + f" {reduce_op}" ) output = xm.mesh_reduce("reduce", output, sum) @@ -201,6 +205,7 @@ def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ return output def _worker_setup(self, process_idx: int): + self._launched = True reset_seed() self.set_world_ranks(process_idx) rank_zero_only.rank = self.global_rank diff --git a/tests/tests_pytorch/accelerators/test_accelerator_connector.py b/tests/tests_pytorch/accelerators/test_accelerator_connector.py index 06f088e87ea4d..dc53fb5e36588 100644 --- a/tests/tests_pytorch/accelerators/test_accelerator_connector.py +++ b/tests/tests_pytorch/accelerators/test_accelerator_connector.py @@ -671,7 +671,7 @@ def test_devices_auto_choice_mps(): @pytest.mark.parametrize( ["parallel_devices", "accelerator"], - [([torch.device("cpu")], "cuda"), ([torch.device("cuda", i) for i in range(8)], ("tpu"))], + [([torch.device("cpu")], "cuda"), ([torch.device("cuda", i) for i in range(8)], "tpu")], ) def test_parallel_devices_in_strategy_confilict_with_accelerator(parallel_devices, accelerator): with pytest.raises(MisconfigurationException, match=r"parallel_devices set through"): diff --git a/tests/tests_pytorch/accelerators/test_ipu.py b/tests/tests_pytorch/accelerators/test_ipu.py index 589ec7b29dd5b..248ac0dbb1818 100644 --- a/tests/tests_pytorch/accelerators/test_ipu.py +++ b/tests/tests_pytorch/accelerators/test_ipu.py @@ -602,7 +602,7 @@ def test_strategy_choice_ipu_plugin(tmpdir): @RunIf(ipu=True) -def test_device_type_when_training_plugin_ipu_passed(tmpdir): +def test_device_type_when_ipu_strategy_passed(tmpdir): trainer = Trainer(strategy=IPUStrategy(), accelerator="ipu", devices=8) assert isinstance(trainer.strategy, IPUStrategy) assert isinstance(trainer.accelerator, IPUAccelerator) diff --git a/tests/tests_pytorch/accelerators/test_tpu.py b/tests/tests_pytorch/accelerators/test_tpu.py index 8e0eb52a9a424..bad6c2801f94f 100644 --- a/tests/tests_pytorch/accelerators/test_tpu.py +++ b/tests/tests_pytorch/accelerators/test_tpu.py @@ -28,7 +28,6 @@ from pytorch_lightning.strategies import DDPStrategy, TPUSpawnStrategy from pytorch_lightning.utilities import find_shared_parameters from tests_pytorch.helpers.runif import RunIf -from 
tests_pytorch.helpers.utils import pl_multi_process_test class WeightSharingModule(BoringModel): @@ -46,8 +45,7 @@ def forward(self, x): return x -@RunIf(tpu=True) -@pl_multi_process_test +@RunIf(tpu=True, standalone=True) def test_resume_training_on_cpu(tmpdir): """Checks if training can be resumed from a saved checkpoint on CPU.""" # Train a model on TPU @@ -65,11 +63,9 @@ def test_resume_training_on_cpu(tmpdir): # Verify that training is resumed on CPU trainer = Trainer(max_epochs=1, default_root_dir=tmpdir) trainer.fit(model, ckpt_path=model_path) - assert trainer.state.finished, f"Training failed with {trainer.state}" @RunIf(tpu=True) -@pl_multi_process_test def test_if_test_works_after_train(tmpdir): """Ensure that .test() works after .fit()""" @@ -293,12 +289,14 @@ def test_xla_checkpoint_plugin_being_default(): assert isinstance(trainer.strategy.checkpoint_io, XLACheckpointIO) -@RunIf(tpu=True) -@patch("pytorch_lightning.strategies.tpu_spawn.xm") -def test_mp_device_dataloader_attribute(_): +@patch("pytorch_lightning.strategies.tpu_spawn.MpDeviceLoader") +@patch("pytorch_lightning.strategies.tpu_spawn.TPUSpawnStrategy.root_device") +def test_mp_device_dataloader_attribute(root_device_mock, mp_loader_mock): dataset = RandomDataset(32, 64) - dataloader = TPUSpawnStrategy().process_dataloader(DataLoader(dataset)) - assert dataloader.dataset == dataset + dataloader = DataLoader(dataset) + processed_dataloader = TPUSpawnStrategy().process_dataloader(dataloader) + mp_loader_mock.assert_called_with(dataloader, root_device_mock) + assert processed_dataloader.dataset == processed_dataloader._loader.dataset @RunIf(tpu=True) @@ -307,8 +305,7 @@ def test_warning_if_tpus_not_used(): Trainer() -@RunIf(tpu=True) -@pl_multi_process_test +@RunIf(tpu=True, standalone=True) @pytest.mark.parametrize( ["devices", "expected_device_ids"], [ diff --git a/tests/tests_pytorch/callbacks/test_device_stats_monitor.py b/tests/tests_pytorch/callbacks/test_device_stats_monitor.py index 0da6e5c32b9c4..2a2bae8a2e5a4 100644 --- a/tests/tests_pytorch/callbacks/test_device_stats_monitor.py +++ b/tests/tests_pytorch/callbacks/test_device_stats_monitor.py @@ -96,7 +96,6 @@ def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> assert cpu_stats_mock.call_count == expected -@pytest.mark.skipif(True, reason="TODO (@kaushikb11): fix this test, timeout") @RunIf(tpu=True) def test_device_stats_monitor_tpu(tmpdir): """Test TPU stats are logged using a logger.""" @@ -106,24 +105,23 @@ def test_device_stats_monitor_tpu(tmpdir): class DebugLogger(CSVLogger): @rank_zero_only - def log_metrics(self, metrics: Dict[str, float], step: Optional[int] = None) -> None: + def log_metrics(self, metrics, step=None) -> None: fields = ["avg. free memory (MB)", "avg. 
peak memory (MB)"] for f in fields: assert any(f in h for h in metrics) trainer = Trainer( default_root_dir=tmpdir, - max_epochs=1, - limit_train_batches=2, + max_epochs=2, + limit_train_batches=5, accelerator="tpu", - devices=1, + devices=8, log_every_n_steps=1, callbacks=[device_stats], logger=DebugLogger(tmpdir), enable_checkpointing=False, enable_progress_bar=False, ) - trainer.fit(model) @@ -146,7 +144,7 @@ def test_device_stats_monitor_no_logger(tmpdir): trainer.fit(model) -def test_prefix_metric_keys(tmpdir): +def test_prefix_metric_keys(): """Test that metric key names are converted correctly.""" metrics = {"1": 1.0, "2": 2.0, "3": 3.0} prefix = "foo" diff --git a/tests/tests_pytorch/conftest.py b/tests/tests_pytorch/conftest.py index e41a236486a93..745067cc2f9f1 100644 --- a/tests/tests_pytorch/conftest.py +++ b/tests/tests_pytorch/conftest.py @@ -180,6 +180,7 @@ def pytest_collection_modifyitems(items: List[pytest.Function], config: pytest.C min_cuda_gpus="PL_RUN_CUDA_TESTS", slow="PL_RUN_SLOW_TESTS", ipu="PL_RUN_IPU_TESTS", + tpu="PL_RUN_TPU_TESTS", ) if os.getenv(options["standalone"], "0") == "1" and os.getenv(options["min_cuda_gpus"], "0") == "1": # special case: we don't have a CPU job for standalone tests, so we shouldn't run only cuda tests. diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index 12aca123eacc1..6da335383e11e 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -360,12 +360,10 @@ def test_v1_8_0_deprecated_single_device_plugin_class(): SingleDevicePlugin("cpu") -@RunIf(tpu=True) +@RunIf(tpu=True, standalone=True) def test_v1_8_0_deprecated_single_tpu_plugin_class(): with pytest.deprecated_call( - match=( - "SingleTPUPlugin` is deprecated in v1.6 and will be removed in v1.8." " Use `.*SingleTPUStrategy` instead." - ) + match="SingleTPUPlugin` is deprecated in v1.6 and will be removed in v1.8. Use `.*SingleTPUStrategy` instead." ): SingleTPUPlugin(0) diff --git a/tests/tests_pytorch/helpers/runif.py b/tests/tests_pytorch/helpers/runif.py index d8e38e7101fe0..abbca75f626ad 100644 --- a/tests/tests_pytorch/helpers/runif.py +++ b/tests/tests_pytorch/helpers/runif.py @@ -177,6 +177,8 @@ def __new__( if tpu: conditions.append(not _TPU_AVAILABLE) reasons.append("TPU") + # used in conftest.py::pytest_collection_modifyitems + kwargs["tpu"] = True if ipu: conditions.append(not _IPU_AVAILABLE) diff --git a/tests/tests_pytorch/helpers/utils.py b/tests/tests_pytorch/helpers/utils.py index 6da53e7b54b20..a9efd7f178f2b 100644 --- a/tests/tests_pytorch/helpers/utils.py +++ b/tests/tests_pytorch/helpers/utils.py @@ -11,10 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import functools import os import re -import traceback from contextlib import contextmanager from typing import Optional, Type @@ -80,40 +78,6 @@ def init_checkpoint_callback(logger): return checkpoint -def pl_multi_process_test(func): - """Wrapper for running multi-processing tests_pytorch.""" - - @functools.wraps(func) - def wrapper(*args, **kwargs): - - from multiprocessing import Process, Queue - - queue = Queue() - - def inner_f(queue, **kwargs): - try: - func(**kwargs) - queue.put(1) - except Exception: - _trace = traceback.format_exc() - print(_trace) - # code 17 means RuntimeError: tensorflow/compiler/xla/xla_client/mesh_service.cc:364 : - # Failed to meet rendezvous 'torch_xla.core.xla_model.save': Socket closed (14) - if "terminated with exit code 17" in _trace: - queue.put(1) - else: - queue.put(-1) - - proc = Process(target=inner_f, args=(queue,), kwargs=kwargs) - proc.start() - proc.join() - - result = queue.get() - assert result == 1, "expected 1, but returned %s" % result - - return wrapper - - @contextmanager def no_warning_call(expected_warning: Type[Warning] = UserWarning, match: Optional[str] = None): with pytest.warns(None) as record: diff --git a/tests/tests_pytorch/lite/test_lite.py b/tests/tests_pytorch/lite/test_lite.py index 3652613526549..ca2b06b6d695b 100644 --- a/tests/tests_pytorch/lite/test_lite.py +++ b/tests/tests_pytorch/lite/test_lite.py @@ -318,31 +318,35 @@ def test_setup_dataloaders_replace_standard_sampler(shuffle, strategy): ("cpu", "cpu"), pytest.param("cuda", "cuda:0", marks=RunIf(min_cuda_gpus=1)), pytest.param("gpu", "cuda:0", marks=RunIf(min_cuda_gpus=1)), - pytest.param("tpu", "xla:0", marks=RunIf(tpu=True)), + pytest.param("tpu", "xla:0", marks=RunIf(tpu=True, standalone=True)), pytest.param("mps", "mps:0", marks=RunIf(mps=True)), pytest.param("gpu", "mps:0", marks=RunIf(mps=True)), ], ) def test_to_device(accelerator, expected): """Test that the to_device method can move various objects to the device determined by the accelerator.""" - lite = EmptyLite(accelerator=accelerator, devices=1) - expected_device = torch.device(expected) + class Lite(LightningLite): + def run(self): + expected_device = torch.device(expected) + + # module + module = torch.nn.Linear(2, 3) + module = lite.to_device(module) + assert all(param.device == expected_device for param in module.parameters()) - # module - module = torch.nn.Linear(2, 3) - module = lite.to_device(module) - assert all(param.device == expected_device for param in module.parameters()) + # tensor + tensor = torch.rand(2, 2) + tensor = lite.to_device(tensor) + assert tensor.device == expected_device - # tensor - tensor = torch.rand(2, 2) - tensor = lite.to_device(tensor) - assert tensor.device == expected_device + # collection + collection = {"data": torch.rand(2, 2), "int": 1} + collection = lite.to_device(collection) + assert collection["data"].device == expected_device - # collection - collection = {"data": torch.rand(2, 2), "int": 1} - collection = lite.to_device(collection) - assert collection["data"].device == expected_device + lite = Lite(accelerator=accelerator, devices=1) + lite.run() def test_rank_properties(): diff --git a/tests/tests_pytorch/models/test_horovod.py b/tests/tests_pytorch/models/test_horovod.py index 244a3e3d885c3..6cd354ef22cfe 100644 --- a/tests/tests_pytorch/models/test_horovod.py +++ b/tests/tests_pytorch/models/test_horovod.py @@ -390,7 +390,7 @@ def _compute_batch(): trainer = Trainer(fast_dev_run=True, strategy="horovod", logger=False) assert 
isinstance(trainer.accelerator, CPUAccelerator) - # TODO: test that we selected the correct training_type_plugin based on horovod flags + # TODO: test that we selected the correct strategy based on horovod flags metric = Accuracy( compute_on_step=True, diff --git a/tests/tests_pytorch/models/test_tpu.py b/tests/tests_pytorch/models/test_tpu.py index b6829d444701d..a41ba7429c0e9 100644 --- a/tests/tests_pytorch/models/test_tpu.py +++ b/tests/tests_pytorch/models/test_tpu.py @@ -26,18 +26,15 @@ from pytorch_lightning.callbacks import EarlyStopping from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset from pytorch_lightning.strategies import TPUSpawnStrategy +from pytorch_lightning.strategies.launchers.xla import _save_spawn from pytorch_lightning.trainer.connectors.logger_connector.result import _Sync from pytorch_lightning.utilities import _TPU_AVAILABLE from pytorch_lightning.utilities.distributed import ReduceOp from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests_pytorch.helpers.runif import RunIf -from tests_pytorch.helpers.utils import pl_multi_process_test if _TPU_AVAILABLE: import torch_xla - import torch_xla.distributed.xla_multiprocessing as xmp - - SERIAL_EXEC = xmp.MpSerialExecutor() class SerialLoaderBoringModel(BoringModel): @@ -48,8 +45,7 @@ def val_dataloader(self): return DataLoader(RandomDataset(32, 2000), batch_size=32) -@RunIf(tpu=True) -@pl_multi_process_test +@RunIf(tpu=True, standalone=True) def test_model_tpu_devices_1(tmpdir): """Make sure model trains on TPU.""" tutils.reset_seed() @@ -68,8 +64,7 @@ def test_model_tpu_devices_1(tmpdir): @pytest.mark.parametrize("tpu_core", [1, 5]) -@RunIf(tpu=True) -@pl_multi_process_test +@RunIf(tpu=True, standalone=True) def test_model_tpu_index(tmpdir, tpu_core): """Make sure model trains on TPU.""" tutils.reset_seed() @@ -89,7 +84,6 @@ def test_model_tpu_index(tmpdir, tpu_core): @RunIf(tpu=True) -@pl_multi_process_test def test_model_tpu_devices_8(tmpdir): """Make sure model trains on TPU.""" tutils.reset_seed() @@ -108,8 +102,7 @@ def test_model_tpu_devices_8(tmpdir): tpipes.run_model_test(trainer_options, model, with_hpc=False, min_acc=0.05) -@RunIf(tpu=True) -@pl_multi_process_test +@RunIf(tpu=True, standalone=True) def test_model_16bit_tpu_devices_1(tmpdir): """Make sure model trains on TPU.""" tutils.reset_seed() @@ -129,8 +122,7 @@ def test_model_16bit_tpu_devices_1(tmpdir): @pytest.mark.parametrize("tpu_core", [1, 5]) -@RunIf(tpu=True) -@pl_multi_process_test +@RunIf(tpu=True, standalone=True) def test_model_16bit_tpu_index(tmpdir, tpu_core): """Make sure model trains on TPU.""" tutils.reset_seed() @@ -151,7 +143,6 @@ def test_model_16bit_tpu_index(tmpdir, tpu_core): @RunIf(tpu=True) -@pl_multi_process_test def test_model_16bit_tpu_devices_8(tmpdir): """Make sure model trains on TPU.""" tutils.reset_seed() @@ -172,7 +163,6 @@ def test_model_16bit_tpu_devices_8(tmpdir): @RunIf(tpu=True) -@pl_multi_process_test def test_model_tpu_early_stop(tmpdir): """Test if single TPU core training works.""" @@ -198,8 +188,7 @@ def validation_step(self, *args, **kwargs): trainer.test(dataloaders=DataLoader(RandomDataset(32, 2000), batch_size=32)) -@RunIf(tpu=True) -@pl_multi_process_test +@RunIf(tpu=True, standalone=True) def test_tpu_grad_norm(tmpdir): """Test if grad_norm works on TPU.""" tutils.reset_seed() @@ -218,8 +207,7 @@ def test_tpu_grad_norm(tmpdir): tpipes.run_model_test(trainer_options, model, with_hpc=False) -@RunIf(tpu=True) -@pl_multi_process_test 
+@RunIf(tpu=True, standalone=True) def test_tpu_clip_grad_by_value(tmpdir): """Test if clip_gradients by value works on TPU.""" tutils.reset_seed() @@ -240,7 +228,6 @@ def test_tpu_clip_grad_by_value(tmpdir): @RunIf(tpu=True) -@pl_multi_process_test def test_dataloaders_passed_to_fit(tmpdir): """Test if dataloaders passed to trainer works on TPU.""" tutils.reset_seed() @@ -248,7 +235,6 @@ def test_dataloaders_passed_to_fit(tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, accelerator="tpu", devices=8) trainer.fit(model, train_dataloaders=model.train_dataloader(), val_dataloaders=model.val_dataloader()) - assert trainer.state.finished, f"Training failed with {trainer.state}" @RunIf(tpu=True) @@ -267,14 +253,13 @@ def test_exception_when_no_tpu_found(): @pytest.mark.parametrize("tpu_cores", [1, 8, [1]]) -@RunIf(tpu=True) +@RunIf(tpu=True, standalone=True) def test_accelerator_set_when_using_tpu(tpu_cores): """Test if the accelerator is set to `tpu` when tpu_cores is not None.""" assert isinstance(Trainer(accelerator="tpu", devices=tpu_cores).accelerator, TPUAccelerator) @RunIf(tpu=True) -@pl_multi_process_test def test_broadcast_on_tpu(): """Checks if an object from the main process is broadcasted to other processes correctly.""" @@ -282,19 +267,19 @@ def test_broadcast(rank): trainer = Trainer(accelerator="tpu", devices=8) assert isinstance(trainer.accelerator, TPUAccelerator) assert isinstance(trainer.strategy, TPUSpawnStrategy) + trainer.strategy._launched = True obj = ("ver_0.5", "logger_name", rank) result = trainer.strategy.broadcast(obj) assert result == ("ver_0.5", "logger_name", 0) - xmp.spawn(test_broadcast, nprocs=8, start_method="fork") + _save_spawn(test_broadcast, nprocs=8, start_method="fork") @pytest.mark.parametrize( ["cli_args", "expected"], [("--tpu_cores=8", {"tpu_cores": 8}), ("--tpu_cores=1,", {"tpu_cores": "1,"})], ) -@RunIf(tpu=True) -@pl_multi_process_test +@RunIf(tpu=True, standalone=True) def test_tpu_cores_with_argparse(cli_args, expected): """Test passing tpu_cores in command line.""" cli_args = cli_args.split(" ") if cli_args else [] @@ -310,30 +295,31 @@ def test_tpu_cores_with_argparse(cli_args, expected): @RunIf(tpu=True) -@pl_multi_process_test def test_tpu_reduce(): """Test tpu spawn reduce operation.""" def test_reduce(rank): trainer = Trainer(accelerator="tpu", devices=8) - # faster this way - reduce_ops = ["mean", "AVG", "undefined", "sum", ReduceOp.SUM, ReduceOp.MAX] - for reduce_op in reduce_ops: - if reduce_op == "undefined" or reduce_op == ReduceOp.MAX: - with pytest.raises(MisconfigurationException, match="TPUSpawn Strategy only support"): - result = trainer.strategy.reduce(1, reduce_op) - else: - result = trainer.strategy.reduce(1, reduce_op) + trainer.strategy._launched = True + + with pytest.raises(ValueError, match="TPUSpawnStrategy only supports"): + trainer.strategy.reduce(1, reduce_op="undefined") + + with pytest.raises(ValueError, match="TPUSpawnStrategy only supports"): + trainer.strategy.reduce(1, reduce_op=ReduceOp.MAX) + + # it is faster to loop over here than to parameterize the test + for reduce_op in ("mean", "AVG", "sum", ReduceOp.SUM): + result = trainer.strategy.reduce(1, reduce_op=reduce_op) if isinstance(reduce_op, str) and reduce_op.lower() in ("mean", "avg"): assert result.item() == 1 else: assert result.item() == 8 - xmp.spawn(test_reduce, nprocs=8, start_method="fork") + _save_spawn(test_reduce, nprocs=8, start_method="fork") -@RunIf(tpu=True) -@pl_multi_process_test +@RunIf(tpu=True, standalone=True) 
@pytest.mark.parametrize("clip_val", [10]) @mock.patch("torch.nn.utils.clip_grad_norm_") def test_tpu_precision_16_clip_gradients(mock_clip_grad_norm, clip_val, tmpdir): @@ -363,7 +349,6 @@ def test_tpu_precision_16_clip_gradients(mock_clip_grad_norm, clip_val, tmpdir): @RunIf(tpu=True) -@pl_multi_process_test def test_if_test_works_with_checkpoint_false(tmpdir): """Ensure that model trains properly when `enable_checkpointing` is set to False.""" @@ -382,21 +367,22 @@ def test_if_test_works_with_checkpoint_false(tmpdir): @RunIf(tpu=True) -@pl_multi_process_test def test_tpu_sync_dist(): """Test tpu spawn sync dist operation.""" - def test_sync_dist(_): - sync = _Sync(TPUSpawnStrategy().reduce, should=True, _op=torch.distributed.ReduceOp.SUM) + def test_sync_dist(rank): + trainer = Trainer(accelerator="tpu", devices=8) + trainer.strategy._launched = True + + sync = _Sync(trainer.strategy.reduce, _should=True, _op=torch.distributed.ReduceOp.SUM) value = torch.tensor([1.0]) - value = (sync(value),) + value = sync(value) assert value.item() == 8 - xmp.spawn(test_sync_dist, nprocs=8, start_method="fork") + _save_spawn(test_sync_dist, nprocs=8, start_method="fork") @RunIf(tpu=True) -@pl_multi_process_test def test_tpu_debug_mode(tmpdir): """Test if debug mode works on TPU.""" @@ -424,7 +410,6 @@ def teardown(self, stage): @RunIf(tpu=True) -@pl_multi_process_test def test_tpu_host_world_size(tmpdir): """Test Host World size env setup on TPU.""" @@ -432,9 +417,6 @@ class DebugModel(BoringModel): def on_train_start(self): assert os.environ.get("XRT_HOST_WORLD_SIZE") == str(1) - def teardown(self, stage): - assert "XRT_HOST_WORLD_SIZE" not in os.environ - tutils.reset_seed() trainer_options = dict( default_root_dir=tmpdir, @@ -447,12 +429,13 @@ def teardown(self, stage): ) model = DebugModel() + assert "XRT_HOST_WORLD_SIZE" not in os.environ tpipes.run_model_test(trainer_options, model, with_hpc=False) + assert "XRT_HOST_WORLD_SIZE" not in os.environ @RunIf(tpu=True) -@pl_multi_process_test -def test_device_type_when_training_plugin_tpu_passed(tmpdir): - trainer = Trainer(strategy=TPUSpawnStrategy(), accelerator="tpu", devices=8) +def test_device_type_when_tpu_strategy_passed(tmpdir): + trainer = Trainer(default_root_dir=tmpdir, strategy=TPUSpawnStrategy(), accelerator="tpu", devices=8) assert isinstance(trainer.strategy, TPUSpawnStrategy) assert isinstance(trainer.accelerator, TPUAccelerator) diff --git a/tests/tests_pytorch/plugins/environments/test_xla_environment.py b/tests/tests_pytorch/plugins/environments/test_xla_environment.py index 8c6bae204ed17..ac1f17bc2dde0 100644 --- a/tests/tests_pytorch/plugins/environments/test_xla_environment.py +++ b/tests/tests_pytorch/plugins/environments/test_xla_environment.py @@ -15,6 +15,7 @@ from unittest import mock import pytest +import torch import pytorch_lightning as pl from pytorch_lightning.plugins.environments import XLAEnvironment @@ -23,7 +24,8 @@ @RunIf(tpu=True) @mock.patch.dict(os.environ, {}, clear=True) -def test_default_attributes(): +@mock.patch("torch_xla._XLAC._xla_get_default_device", return_value=torch.device("xla:0")) +def test_default_attributes(*_): """Test the default attributes when no environment variables are set.""" env = XLAEnvironment() assert not env.creates_processes_externally diff --git a/tests/tests_pytorch/profilers/test_xla_profiler.py b/tests/tests_pytorch/profilers/test_xla_profiler.py index 7f5b0ecdd7740..694d978905177 100644 --- a/tests/tests_pytorch/profilers/test_xla_profiler.py +++ 
b/tests/tests_pytorch/profilers/test_xla_profiler.py @@ -35,7 +35,6 @@ def test_xla_profiler_instance(tmpdir): assert isinstance(trainer.profiler, XLAProfiler) trainer.fit(model) - assert trainer.state.finished, f"Training failed with {trainer.state}" @pytest.mark.skipif(True, reason="XLA Profiler doesn't support Prog. capture yet") diff --git a/tests/tests_pytorch/run_standalone_tasks.sh b/tests/tests_pytorch/run_standalone_tasks.sh new file mode 100644 index 0000000000000..4d433399e5736 --- /dev/null +++ b/tests/tests_pytorch/run_standalone_tasks.sh @@ -0,0 +1,48 @@ +#!/bin/bash +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e +# THIS FILE ASSUMES IT IS RUN INSIDE THE tests/tests_pytorch DIRECTORY + +report='' + +if nvcc --version; then + nvprof --profile-from-start off -o trace_name.prof -- python -m coverage run --source pytorch_lightning --append -m pytest --no-header profilers/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx +fi + +# needs to run outside of `pytest` +python utilities/test_warnings.py +if [ $? -eq 0 ]; then + report+="Ran\tutilities/test_warnings.py\n" +fi + +# test deadlock is properly handled with TorchElastic. +LOGS=$(PL_RUN_STANDALONE_TESTS=1 PL_RECONCILE_PROCESS=1 python -m torch.distributed.run --nproc_per_node=2 --max_restarts 0 -m coverage run --source pytorch_lightning -a plugins/environments/torch_elastic_deadlock.py | grep "SUCCEEDED") +if [ -z "$LOGS" ]; then + exit 1 +fi +report+="Ran\tplugins/environments/torch_elastic_deadlock.py\n" + +# test that a user can manually launch individual processes +export PYTHONPATH="${PYTHONPATH}:$(pwd)" +args="--trainer.accelerator gpu --trainer.devices 2 --trainer.strategy ddp --trainer.max_epochs=1 --trainer.limit_train_batches=1 --trainer.limit_val_batches=1 --trainer.limit_test_batches=1" +MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=1 python ../../examples/convert_from_pt_to_pl/image_classifier_5_lightning_datamodule.py ${args} & +MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=0 python ../../examples/convert_from_pt_to_pl/image_classifier_5_lightning_datamodule.py ${args} +report+="Ran\tmanual ddp launch test\n" + +# echo test report +printf '=%.s' {1..80} +printf "\n$report" +printf '=%.s' {1..80} +printf '\n' diff --git a/tests/tests_pytorch/run_standalone_tests.sh b/tests/tests_pytorch/run_standalone_tests.sh index 5297cbd033347..55a0d330f6188 100644 --- a/tests/tests_pytorch/run_standalone_tests.sh +++ b/tests/tests_pytorch/run_standalone_tests.sh @@ -93,31 +93,7 @@ done # wait for leftover tests for pid in ${pids[*]}; do wait $pid; done show_batched_output -echo "Batched mode finished. Continuing with the rest of standalone tests." - -if nvcc --version; then - nvprof --profile-from-start off -o trace_name.prof -- python ${defaults} profilers/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx -fi - -# needs to run outside of `pytest` -python utilities/test_warnings.py -if [ $? 
-eq 0 ]; then - report+="Ran\tutilities/test_warnings.py\n" -fi - -# test deadlock is properly handled with TorchElastic. -LOGS=$(PL_RUN_STANDALONE_TESTS=1 PL_RECONCILE_PROCESS=1 python -m torch.distributed.run --nproc_per_node=2 --max_restarts 0 -m coverage run --source pytorch_lightning -a plugins/environments/torch_elastic_deadlock.py | grep "SUCCEEDED") -if [ -z "$LOGS" ]; then - exit 1 -fi -report+="Ran\tplugins/environments/torch_elastic_deadlock.py\n" - -# test that a user can manually launch individual processes -export PYTHONPATH="${PYTHONPATH}:$(pwd)" -args="--trainer.accelerator gpu --trainer.devices 2 --trainer.strategy ddp --trainer.max_epochs=1 --trainer.limit_train_batches=1 --trainer.limit_val_batches=1 --trainer.limit_test_batches=1" -MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=1 python ../../examples/convert_from_pt_to_pl/image_classifier_5_lightning_datamodule.py ${args} & -MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=0 python ../../examples/convert_from_pt_to_pl/image_classifier_5_lightning_datamodule.py ${args} -report+="Ran\tmanual ddp launch test\n" +echo "Batched mode finished. End of standalone tests." # echo test report printf '=%.s' {1..80} diff --git a/tests/tests_pytorch/strategies/test_tpu_spawn.py b/tests/tests_pytorch/strategies/test_tpu_spawn.py index 246df92c45e46..967e44a42c9de 100644 --- a/tests/tests_pytorch/strategies/test_tpu_spawn.py +++ b/tests/tests_pytorch/strategies/test_tpu_spawn.py @@ -25,7 +25,6 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests_pytorch.helpers.dataloaders import CustomNotImplementedErrorDataloader from tests_pytorch.helpers.runif import RunIf -from tests_pytorch.helpers.utils import pl_multi_process_test class BoringModelNoDataloaders(BoringModel): @@ -85,18 +84,16 @@ def test_error_process_iterable_dataloader(_): class BoringModelTPU(BoringModel): def on_train_start(self) -> None: + # assert strategy attributes for device setting assert self.device == torch.device("xla", index=1) assert os.environ.get("PT_XLA_DEBUG") == "1" -@RunIf(tpu=True) -@pl_multi_process_test +@RunIf(tpu=True, standalone=True) def test_model_tpu_one_core(): """Tests if device/debug flag is set correctly when training and after teardown for TPUSpawnStrategy.""" + model = BoringModelTPU() trainer = Trainer(accelerator="tpu", devices=1, fast_dev_run=True, strategy=TPUSpawnStrategy(debug=True)) - # assert training strategy attributes for device setting assert isinstance(trainer.strategy, TPUSpawnStrategy) - assert trainer.strategy.root_device == torch.device("xla", index=1) - model = BoringModelTPU() trainer.fit(model) assert "PT_XLA_DEBUG" not in os.environ diff --git a/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py b/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py index 35a8a0a8d5789..8d8fb7f3a8c21 100644 --- a/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py +++ b/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py @@ -14,8 +14,10 @@ import logging from unittest import mock +from unittest.mock import PropertyMock import pytest +import torch from torch.utils.data import DataLoader from pytorch_lightning import Trainer @@ -26,7 +28,6 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests_pytorch.helpers.datasets import RandomIterableDataset from tests_pytorch.helpers.runif import RunIf -from tests_pytorch.helpers.utils import pl_multi_process_test def 
test_num_stepping_batches_basic(): @@ -135,16 +136,29 @@ def test_num_stepping_batches_gpu(trainer_kwargs, estimated_steps, monkeypatch): assert trainer.estimated_stepping_batches == estimated_steps +@RunIf(tpu=True, standalone=True) +def test_num_stepping_batches_with_tpu_single(): + """Test stepping batches with the single-core TPU strategy.""" + trainer = Trainer(accelerator="tpu", devices=1, max_epochs=1) + model = BoringModel() + trainer._data_connector.attach_data(model) + trainer.strategy.connect(model) + assert trainer.estimated_stepping_batches == len(model.train_dataloader()) + + @RunIf(tpu=True) -@pl_multi_process_test -@pytest.mark.parametrize("devices,estimated_steps", [([1], 64), (8, 8)]) -def test_num_stepping_batches_with_tpu(devices, estimated_steps): - """Test stepping batches with TPU training which acts like DDP.""" - trainer = Trainer(accelerator="tpu", devices=devices, max_epochs=1) +@mock.patch( + "pytorch_lightning.strategies.tpu_spawn.TPUSpawnStrategy.root_device", + new_callable=PropertyMock, + return_value=torch.device("xla:0"), +) +def test_num_stepping_batches_with_tpu_multi(_): + """Test stepping batches with the TPU strategy across multiple devices.""" + trainer = Trainer(accelerator="tpu", devices=8, max_epochs=1) model = BoringModel() trainer._data_connector.attach_data(model) trainer.strategy.connect(model) - assert trainer.estimated_stepping_batches == estimated_steps + assert trainer.estimated_stepping_batches == len(model.train_dataloader()) // 8 @mock.patch("pytorch_lightning.accelerators.ipu.IPUAccelerator.is_available", return_value=True) From 511875e5675b0543d89e2aae3950a7834b35238e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 27 Jul 2022 18:57:52 +0200 Subject: [PATCH 030/230] Support DeepSpeed >=0.6.0, <0.6.5 (#13863) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- .azure/gpu-tests.yml | 1 + requirements/pytorch/strategies.txt | 2 +- .../plugins/precision/apex_amp.py | 3 ++- .../plugins/precision/deepspeed.py | 22 +++++++++---------- .../plugins/precision/ipu.py | 2 +- .../plugins/precision/precision_plugin.py | 3 ++- src/pytorch_lightning/strategies/strategy.py | 11 ++++++++-- tests/tests_pytorch/lite/test_lite.py | 18 +++++++++++---- .../precision/test_deepspeed_precision.py | 9 -------- .../strategies/test_deepspeed_strategy.py | 10 +-------- 10 files changed, 42 insertions(+), 39 deletions(-) diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index 8e8e2edb91d85..74c1df4553fe0 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -73,6 +73,7 @@ jobs: CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))") pip install "bagua-cuda$CUDA_VERSION_MM>=0.9.0" pip install -e .[strategies] + pip install deepspeed==0.6.4 # TODO: remove when docker images are upgraded pip install --requirement requirements/pytorch/devel.txt pip list env: diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt index 2b69c8ba76b81..db29ce556e839 100644 --- a/requirements/pytorch/strategies.txt +++ b/requirements/pytorch/strategies.txt @@ -1,5 +1,5 @@ fairscale>=0.4.5, <=0.4.6 -deepspeed<0.6.0 +deepspeed>=0.6.0, <0.6.5 # no need to install with [pytorch] as pytorch is already installed horovod>=0.21.2, !=0.24.0, <0.25.1 hivemind>=1.0.1, <=1.0.1; sys_platform == 'linux' diff --git a/src/pytorch_lightning/plugins/precision/apex_amp.py 
b/src/pytorch_lightning/plugins/precision/apex_amp.py index 15825dedd2ef6..e18f82dc27f6e 100644 --- a/src/pytorch_lightning/plugins/precision/apex_amp.py +++ b/src/pytorch_lightning/plugins/precision/apex_amp.py @@ -59,6 +59,7 @@ def backward( model: "pl.LightningModule", closure_loss: Tensor, optimizer: Optional[Optimizer], + optimizer_idx: Optional[int], *args: Any, **kwargs: Any, ) -> None: @@ -71,7 +72,7 @@ def backward( """ opt = optimizer or model.trainer.optimizers with amp.scale_loss(closure_loss, opt) as closure_loss: - super().backward(model, closure_loss, optimizer, *args, **kwargs) + super().backward(model, closure_loss, optimizer, optimizer_idx, *args, **kwargs) def optimizer_step( self, diff --git a/src/pytorch_lightning/plugins/precision/deepspeed.py b/src/pytorch_lightning/plugins/precision/deepspeed.py index 96458487c7420..fa948520e1fd6 100644 --- a/src/pytorch_lightning/plugins/precision/deepspeed.py +++ b/src/pytorch_lightning/plugins/precision/deepspeed.py @@ -27,10 +27,8 @@ from pytorch_lightning.utilities.warnings import WarningCache _DEEPSPEED_AVAILABLE = _RequirementAvailable("deepspeed") -_DEEPSPEED_GREATER_EQUAL_0_6 = _RequirementAvailable("deepspeed>=0.6.0") -if TYPE_CHECKING: - if _DEEPSPEED_AVAILABLE: - import deepspeed +if TYPE_CHECKING and _DEEPSPEED_AVAILABLE: + import deepspeed warning_cache = WarningCache() @@ -53,12 +51,6 @@ class DeepSpeedPrecisionPlugin(PrecisionPlugin): """ def __init__(self, precision: Union[str, int], amp_type: str, amp_level: Optional[str] = None) -> None: - if precision == PrecisionType.BFLOAT and not _DEEPSPEED_GREATER_EQUAL_0_6: - raise MisconfigurationException( - f"`Trainer(strategy='deepspeed', precision={precision!r})` is not supported" - " with `deepspeed < v0.6`. Please upgrade it using `pip install -U deepspeed`." 
- ) - supported_precision = (PrecisionType.HALF, PrecisionType.FLOAT, PrecisionType.BFLOAT, PrecisionType.MIXED) if precision not in supported_precision: raise ValueError( @@ -71,7 +63,15 @@ def __init__(self, precision: Union[str, int], amp_type: str, amp_level: Optiona self.amp_type = amp_type self.amp_level = amp_level - def backward(self, model: "pl.LightningModule", closure_loss: Tensor, *args: Any, **kwargs: Any) -> None: + def backward( + self, + model: "pl.LightningModule", + closure_loss: Tensor, + optimizer: Optional[Optimizer], + optimizer_idx: Optional[int], + *args: Any, + **kwargs: Any, + ) -> None: if is_overridden("backward", model): warning_cache.warn( "You have overridden the `LightningModule.backward` hook but it will be ignored since DeepSpeed handles" diff --git a/src/pytorch_lightning/plugins/precision/ipu.py b/src/pytorch_lightning/plugins/precision/ipu.py index 329a8b8978e50..89f544575f63f 100644 --- a/src/pytorch_lightning/plugins/precision/ipu.py +++ b/src/pytorch_lightning/plugins/precision/ipu.py @@ -44,7 +44,7 @@ def __init__(self, precision: int) -> None: super().__init__() self.precision = precision - def backward(self, model: "pl.LightningModule", *args: Any, **kwargs: Any) -> None: + def backward(self, model: "pl.LightningModule", *_: Any, **__: Any) -> None: if is_overridden("backward", model): warning_cache.warn( "You have overridden the `LightningModule.backward` hook but it will be ignored since IPUs handle" diff --git a/src/pytorch_lightning/plugins/precision/precision_plugin.py b/src/pytorch_lightning/plugins/precision/precision_plugin.py index cbf18b8c4fa41..02d343a0876b4 100644 --- a/src/pytorch_lightning/plugins/precision/precision_plugin.py +++ b/src/pytorch_lightning/plugins/precision/precision_plugin.py @@ -64,6 +64,7 @@ def backward( model: "pl.LightningModule", closure_loss: Tensor, optimizer: Optional[Optimizer], + optimizer_idx: Optional[int], *args: Any, **kwargs: Any, ) -> None: @@ -76,7 +77,7 @@ def backward( """ # do backward pass if model is not None and isinstance(model, pl.LightningModule): - model.backward(closure_loss, optimizer, *args, **kwargs) + model.backward(closure_loss, optimizer, optimizer_idx, *args, **kwargs) else: self._run_backward(closure_loss, *args, **kwargs) diff --git a/src/pytorch_lightning/strategies/strategy.py b/src/pytorch_lightning/strategies/strategy.py index f47afc890bcbb..0de904ccbd283 100644 --- a/src/pytorch_lightning/strategies/strategy.py +++ b/src/pytorch_lightning/strategies/strategy.py @@ -171,7 +171,14 @@ def optimizer_state(self, optimizer: Optimizer) -> Dict[str, Tensor]: """ return optimizer.state_dict() - def backward(self, closure_loss: Tensor, *args: Any, **kwargs: Any) -> Tensor: + def backward( + self, + closure_loss: Tensor, + optimizer: Optional[Optimizer], + optimizer_idx: Optional[int], + *args: Any, + **kwargs: Any, + ) -> Tensor: """Forwards backward-calls to the precision plugin. 
Args: @@ -181,7 +188,7 @@ def backward(self, closure_loss: Tensor, *args: Any, **kwargs: Any) -> Tensor: assert self.lightning_module is not None closure_loss = self.precision_plugin.pre_backward(self.lightning_module, closure_loss) - self.precision_plugin.backward(self.lightning_module, closure_loss, *args, **kwargs) + self.precision_plugin.backward(self.lightning_module, closure_loss, optimizer, optimizer_idx, *args, **kwargs) closure_loss = self.precision_plugin.post_backward(self.lightning_module, closure_loss) self.post_backward(closure_loss) diff --git a/tests/tests_pytorch/lite/test_lite.py b/tests/tests_pytorch/lite/test_lite.py index ca2b06b6d695b..86a0a5a82195a 100644 --- a/tests/tests_pytorch/lite/test_lite.py +++ b/tests/tests_pytorch/lite/test_lite.py @@ -412,15 +412,21 @@ def run(self): model = BoringModel() optimizer = torch.optim.SGD(model.parameters(), lr=0.0001) model, optimizer = self.setup(model, optimizer) - state_dict = deepcopy(model.state_dict()) - for _ in range(2): + for i in range(2): optimizer.zero_grad() x = model(torch.randn(1, 32).to(self.device)) loss = x.sum() + if i == 0: + # the weights are not initialized with stage 3 until backward is run once + assert all(w.nelement() == 0 for w in model.state_dict().values()) self.backward(loss, model=model) + if i == 0: + # save for later to check that the weights were updated + state_dict = deepcopy(model.state_dict()) optimizer.step() + # check that the model trained, the weights from step 1 do not match the weights from step 2 for mw_b, mw_a in zip(state_dict.values(), model.state_dict().values()): assert not torch.allclose(mw_b, mw_a) @@ -438,6 +444,7 @@ def run(self): model_1, optimizer_1 = self.setup(model_1, optimizer_1) model_2, optimizer_2 = self.setup(model_2, optimizer_2) + # train model_1 first self.seed_everything(42) data_list = [] for _ in range(2): @@ -449,9 +456,11 @@ def run(self): self.backward(loss, model=model_1) optimizer_1.step() - for mw_1, mw_2 in zip(model_1.state_dict().values(), model_2.state_dict().values()): - assert not torch.allclose(mw_1, mw_2) + # the weights do not match + assert all(w.nelement() > 1 for w in model_1.state_dict().values()) + assert all(w.nelement() == 0 for w in model_2.state_dict().values()) + # now train model_2 with the same data for data in data_list: optimizer_2.zero_grad() x = model_2(data) @@ -459,6 +468,7 @@ def run(self): self.backward(loss, model=model_2) optimizer_2.step() + # the weights should match for mw_1, mw_2 in zip(model_1.state_dict().values(), model_2.state_dict().values()): assert torch.allclose(mw_1, mw_2) diff --git a/tests/tests_pytorch/plugins/precision/test_deepspeed_precision.py b/tests/tests_pytorch/plugins/precision/test_deepspeed_precision.py index 8a68f7c73209b..a4698e7c19c97 100644 --- a/tests/tests_pytorch/plugins/precision/test_deepspeed_precision.py +++ b/tests/tests_pytorch/plugins/precision/test_deepspeed_precision.py @@ -11,20 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from unittest import mock - import pytest from pytorch_lightning.plugins.precision.deepspeed import DeepSpeedPrecisionPlugin -from pytorch_lightning.utilities.exceptions import MisconfigurationException def test_invalid_precision_with_deepspeed_precision(): with pytest.raises(ValueError, match="is not supported. 
`precision` must be one of"): DeepSpeedPrecisionPlugin(precision=64, amp_type="native") - - -@mock.patch("pytorch_lightning.plugins.precision.deepspeed._DEEPSPEED_GREATER_EQUAL_0_6", False) -def test_incompatible_bfloat16_raises_error_with_deepspeed_version(): - with pytest.raises(MisconfigurationException, match="is not supported with `deepspeed < v0.6`"): - DeepSpeedPrecisionPlugin(precision="bf16", amp_type="native") diff --git a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py index 79562134f9ccb..6a2a2fc5d9111 100644 --- a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py +++ b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py @@ -30,11 +30,9 @@ from pytorch_lightning.callbacks import Callback, LearningRateMonitor, ModelCheckpoint from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset from pytorch_lightning.plugins import DeepSpeedPrecisionPlugin -from pytorch_lightning.plugins.precision.deepspeed import _DEEPSPEED_GREATER_EQUAL_0_6 from pytorch_lightning.strategies import DeepSpeedStrategy from pytorch_lightning.strategies.deepspeed import _DEEPSPEED_AVAILABLE, LightningDeepSpeedModule from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _RequirementAvailable from pytorch_lightning.utilities.meta import init_meta_context from tests_pytorch.helpers.datamodules import ClassifDataModule from tests_pytorch.helpers.datasets import RandomIterableDataset @@ -42,14 +40,9 @@ if _DEEPSPEED_AVAILABLE: import deepspeed + from deepspeed.runtime.zero.stage_1_and_2 import DeepSpeedZeroOptimizer from deepspeed.utils.zero_to_fp32 import convert_zero_checkpoint_to_fp32_state_dict - _DEEPSPEED_GREATER_EQUAL_0_5_9 = _RequirementAvailable("deepspeed>=0.5.9") - if _DEEPSPEED_GREATER_EQUAL_0_5_9: - from deepspeed.runtime.zero.stage_1_and_2 import DeepSpeedZeroOptimizer - else: - from deepspeed.runtime.zero.stage2 import FP16_DeepSpeedZeroOptimizer as DeepSpeedZeroOptimizer - class ModelParallelBoringModel(BoringModel): def __init__(self): @@ -1294,7 +1287,6 @@ def training_step(self, *args, **kwargs): @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True) -@pytest.mark.skipif(not _DEEPSPEED_GREATER_EQUAL_0_6, reason="requires deepspeed >= 0.6") def test_deepspeed_with_bfloat16_precision(tmpdir): """Test that deepspeed works with bfloat16 precision.""" model = BoringModel() From a58a406952e0ec9eb474a1eae8b3ab4a6533abfe Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 28 Jul 2022 14:09:12 +0200 Subject: [PATCH 031/230] add UI for install all (#13732) * add UI for install all --- .github/actions/pkg-check/action.yml | 19 +++++++++++++++++++ .github/actions/pkg-install/action.yml | 23 +++++++++++++++++------ .github/workflows/ci_pkg-install.yml | 7 ------- src/lightning/__setup__.py | 17 +++++++++-------- 4 files changed, 45 insertions(+), 21 deletions(-) diff --git a/.github/actions/pkg-check/action.yml b/.github/actions/pkg-check/action.yml index dc6031a1b769c..aa0ecd3db4968 100644 --- a/.github/actions/pkg-check/action.yml +++ b/.github/actions/pkg-check/action.yml @@ -33,3 +33,22 @@ runs: - name: copy/export pkg run: cp dist/* pypi/ shell: bash + + - name: Unzip packages + if: ${{ inputs.pkg-name != '' }} + working-directory: dist + run: for file in `ls *.gz`; do tar -xzf $file; done + shell: bash + + - name: Check single pkg/folder + if: ${{ inputs.pkg-name != '' }} + working-directory: dist + run: | + import os, glob, 
pathlib, shutil + # list folders without ending .egg-info + dirs = [d for d in glob.glob(os.path.join("*", "src", "*")) if not d.endswith(".egg-info")] + print(dirs) + assert len(dirs) == 1 + # cleaning + shutil.rmtree(pathlib.Path(dirs[0]).parent.parent) + shell: python diff --git a/.github/actions/pkg-install/action.yml b/.github/actions/pkg-install/action.yml index b5253cd1779c2..652a82d76155b 100644 --- a/.github/actions/pkg-install/action.yml +++ b/.github/actions/pkg-install/action.yml @@ -4,7 +4,8 @@ description: installing and validationg the package inputs: pkg-name: description: package name for import - required: true + required: false + default: "" pip-flags: description: additional pil install flags required: false @@ -13,14 +14,24 @@ inputs: runs: using: "composite" steps: + - name: Determine package name + if: ${{ inputs.pkg-import == '' }} + working-directory: ./dist + run: python -c "import glob ; ls = glob.glob('*.tar.gz') ; name = '_'.join(ls[0].split('-')[:-1]) ; print(f'PKG_NAME={name}')" >> $GITHUB_ENV + shell: bash + + - name: Pass package name + if: ${{ inputs.pkg-import != '' }} + run: echo "PKG_NAME=${{ inputs.pkg-name }}" >> $GITHUB_ENV + shell: bash - name: Install | Uninstall package - archive working-directory: ./dist run: | - pip install *.tar.gz ${{ inputs.pip-flags }} + pip install *.tar.gz ${PKG_NAME} pip list | grep lightning - python -c "import ${{ inputs.pkg-name }} ; print(${{ inputs.pkg-name }}.__version__)" - pip uninstall -y ${{ inputs.pkg-name }} + python -c "import ${PKG_NAME} ; print(${PKG_NAME}.__version__)" + pip uninstall -y ${PKG_NAME} shell: bash - name: Install | Uninstall package - wheel @@ -28,6 +39,6 @@ runs: run: | pip install *.whl ${{ inputs.pip-flags }} pip list | grep lightning - python -c "import ${{ inputs.pkg-name }} ; print(${{ inputs.pkg-name }}.__version__)" - pip uninstall -y ${{ inputs.pkg-name }} + python -c "import ${PKG_NAME} ; print(${PKG_NAME}.__version__)" + pip uninstall -y ${PKG_NAME} shell: bash diff --git a/.github/workflows/ci_pkg-install.yml b/.github/workflows/ci_pkg-install.yml index 54bca37531b28..5d09047663c58 100644 --- a/.github/workflows/ci_pkg-install.yml +++ b/.github/workflows/ci_pkg-install.yml @@ -58,14 +58,7 @@ jobs: name: ci-packages-${{ github.sha }} path: pypi - - name: Determine package name - if: ${{ inputs.pkg-import == '' }} - working-directory: ./dist - run: python -c "import glob ; ls = glob.glob('*.tar.gz') ; name = '_'.join(ls[0].split('-')[:-1]) ; print(f'PKG_NAME={name}')" >> $GITHUB_ENV - - uses: ./.github/actions/pkg-install - with: - pkg-name: ${{ env.PKG_NAME }} install-meta-src: needs: install-standalone diff --git a/src/lightning/__setup__.py b/src/lightning/__setup__.py index 83f060b9b2e23..6ab3118b3174d 100644 --- a/src/lightning/__setup__.py +++ b/src/lightning/__setup__.py @@ -27,22 +27,23 @@ def _adjust_manifest(**kwargs: Any) -> None: manifest_path = os.path.join(_PROJECT_ROOT, "MANIFEST.in") assert os.path.isfile(manifest_path) with open(manifest_path) as fp: - lines = fp.readlines() + lines = [ln.rstrip() for ln in fp.readlines()] if kwargs["pkg_name"] == "lightning": lines += [ - "recursive-include src/lightning *.md" + os.linesep, + "recursive-include src/lightning *.md", # fixme: this is strange, this shall work with setup find package - include - "prune src/lightning_app" + os.linesep, - "prune src/pytorch_lightning" + os.linesep, + "prune src/lightning_app", + "prune src/pytorch_lightning", ] else: lines += [ - "recursive-include src *.md" + os.linesep, - 
"recursive-include requirements *.txt" + os.linesep, - "recursive-include src/lightning_app/cli/*-template *" + os.linesep, # Add templates + "recursive-include src *.md", + "recursive-include requirements *.txt", + "recursive-include src/lightning_app/ui *", + "recursive-include src/lightning_app/cli/*-template *", # Add templates as build-in ] with open(manifest_path, "w") as fp: - fp.writelines(lines) + fp.writelines([ln + os.linesep for ln in lines]) def _setup_args(**kwargs: Any) -> Dict[str, Any]: From 1299e4f984d56e013029192fbbfd4713c937c26e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 28 Jul 2022 16:07:57 +0200 Subject: [PATCH 032/230] Run GPU tests with PyTorch 1.12 (#13716) Co-authored-by: Jirka --- .azure/gpu-benchmark.yml | 2 +- .azure/gpu-tests.yml | 6 ++-- .github/workflows/README.md | 20 ++++++------ .github/workflows/cicd-pytorch_dockers.yml | 5 ++- dockers/base-cuda/Dockerfile | 2 +- requirements/pytorch/examples.txt | 2 +- .../tests_pytorch/plugins/test_amp_plugins.py | 10 ++++-- .../test_ddp_fully_sharded_native.py | 3 +- .../strategies/test_deepspeed_strategy.py | 2 +- .../strategies/test_sharded_strategy.py | 31 +++++-------------- .../optimization/test_manual_optimization.py | 2 +- tests/tests_pytorch/utilities/test_meta.py | 4 +-- 12 files changed, 40 insertions(+), 49 deletions(-) diff --git a/.azure/gpu-benchmark.yml b/.azure/gpu-benchmark.yml index 3108f9e78c3c3..ac5ca6f60a6b4 100644 --- a/.azure/gpu-benchmark.yml +++ b/.azure/gpu-benchmark.yml @@ -28,7 +28,7 @@ jobs: cancelTimeoutInMinutes: "2" pool: azure-jirka-spot container: - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12" options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g" workspace: clean: all diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index 74c1df4553fe0..33d03688171b1 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -26,7 +26,7 @@ jobs: strategy: matrix: 'PyTorch - stable': - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12" # how long to run the job before automatically cancelling timeoutInMinutes: "80" # how much time to give 'run always even if cancelled tasks' before stopping them @@ -44,7 +44,7 @@ jobs: - bash: | CHANGED_FILES=$(git diff --name-status origin/master -- . 
| awk '{print $2}') - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' + FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.azure/*' echo $CHANGED_FILES > changed_files.txt MATCHES=$(cat changed_files.txt | grep -E $FILTER) echo $MATCHES @@ -69,7 +69,9 @@ jobs: condition: eq(variables['continue'], '1') - bash: | + set -e python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)" + python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'bagua' not in line] ; open(fname, 'w').writelines(lines)" CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))") pip install "bagua-cuda$CUDA_VERSION_MM>=0.9.0" pip install -e .[strategies] diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 196d64f7e7f1c..d39d096b26f0f 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -4,16 +4,16 @@ ## Unit and Integration Testing -| workflow name | workflow file | action | accelerator\* | (Python, PyTorch) | OS | -| --------------------------- | ----------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | ------------------------------------------------ | ------------------- | -| Test full | .github/workflows/ci_test-full.yml | Run all tests except for accelerator-specific, standalone and slow tests. | CPU | (3.7, 1.8), (3.7, 1.11), (3.9, 1.8), (3.9, 1.12) | linux, mac, windows | -| Test with Conda | .github/workflows/ci_test-conda.yml | Same as ci_test-full.yml but with dependencies installed with conda. | CPU | (3.8, 1.8), (3.8, 1.9), (3.8, 1.10), (3.9, 1.12) | linux | -| Test slow | .github/workflows/ci_test-slow.yml | Run only slow tests. Slow tests usually need to spawn threads and cannot be speed up or simplified. | CPU | (3.7, 1.8) | linux, mac, windows | -| PL.pytorch-lightning (IPUs) | .azure-pipelines/ipu-tests.yml | Run only IPU-specific tests. | IPU | (3.8, 1.9) | linux | -| PL.pytorch-lightning (HPUs) | .azure-pipelines/hpu-tests.yml | Run only HPU-specific tests. | HPU | (3.8, 1.10) | linux | -| PL.pytorch-lightning (GPUs) | .azure-pipelines/gpu-tests.yml | Run all CPU and GPU-specific tests, standalone and examples. Each standalone test needs to be run in separate processes to avoid unwanted interactions between test cases. | GPU | (3.7, 1.8) | linux | -| PyTorchLightning.Benchmark | .azure-pipelines/gpu-benchmark.yml | Run speed/memory benchmarks for parity with pure PyTorch. | GPU | (3.7, 1.8) | linux | -| test-on-tpus | .circleci/config.yml | Run only TPU-specific tests. | TPU | (3.7, 1.9) | linux | +| workflow name | workflow file | action | accelerator\* | (Python, PyTorch) | OS | +| -------------------------- | ----------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | ------------------------------------------------ | ------------------- | +| Test full | .github/workflows/ci_test-full.yml | Run all tests except for accelerator-specific, standalone and slow tests. 
| CPU | (3.7, 1.8), (3.7, 1.11), (3.9, 1.8), (3.9, 1.12) | linux, mac, windows | +| Test with Conda | .github/workflows/ci_test-conda.yml | Same as ci_test-full.yml but with dependencies installed with conda. | CPU | (3.8, 1.8), (3.8, 1.9), (3.8, 1.10), (3.9, 1.12) | linux | +| Test slow | .github/workflows/ci_test-slow.yml | Run only slow tests. Slow tests usually need to spawn threads and cannot be speed up or simplified. | CPU | (3.7, 1.8) | linux, mac, windows | +| pytorch-lightning (IPUs) | .azure-pipelines/ipu-tests.yml | Run only IPU-specific tests. | IPU | (3.8, 1.9) | linux | +| pytorch-lightning (HPUs) | .azure-pipelines/hpu-tests.yml | Run only HPU-specific tests. | HPU | (3.8, 1.10) | linux | +| pytorch-lightning (GPUs) | .azure-pipelines/gpu-tests.yml | Run all CPU and GPU-specific tests, standalone, and examples. Each standalone test needs to be run in separate processes to avoid unwanted interactions between test cases. | GPU | (3.9, 1.12) | linux | +| PyTorchLightning.Benchmark | .azure-pipelines/gpu-benchmark.yml | Run speed/memory benchmarks for parity with pure PyTorch. | GPU | (3.9, 1.12) | linux | +| test-on-tpus | .circleci/config.yml | Run only TPU-specific tests. | TPU | (3.7, 1.9) | linux | - \*Accelerators used in CI - GPU: 2 x NVIDIA Tesla V100 diff --git a/.github/workflows/cicd-pytorch_dockers.yml b/.github/workflows/cicd-pytorch_dockers.yml index 4742f3579c274..b037c798bc8ee 100644 --- a/.github/workflows/cicd-pytorch_dockers.yml +++ b/.github/workflows/cicd-pytorch_dockers.yml @@ -31,7 +31,7 @@ jobs: matrix: # the config used in '.azure-pipelines/gpu-tests.yml' since the Dockerfile uses the cuda image python_version: ["3.9"] - pytorch_version: ["1.10", "1.11"] + pytorch_version: ["1.12"] steps: - uses: actions/checkout@v2 - uses: docker/setup-buildx-action@v2 @@ -86,8 +86,7 @@ jobs: matrix: include: # the config used in '.azure-pipelines/gpu-tests.yml' - - {python_version: "3.7", pytorch_version: "1.10", cuda_version: "11.1", ubuntu_version: "20.04"} - - {python_version: "3.7", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1", ubuntu_version: "20.04"} # latest (used in Tutorials) - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1", ubuntu_version: "20.04"} - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.1", ubuntu_version: "20.04"} diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 743ab95cc3ab2..01372574e4618 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -18,7 +18,7 @@ ARG CUDA_VERSION=11.3.1 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION} ARG PYTHON_VERSION=3.9 -ARG PYTORCH_VERSION=1.9 +ARG PYTORCH_VERSION=1.12 SHELL ["/bin/bash", "-c"] # https://techoverflow.net/2019/05/18/how-to-fix-configuring-tzdata-interactive-input-when-building-docker-images/ diff --git a/requirements/pytorch/examples.txt b/requirements/pytorch/examples.txt index 114c509ad72e3..223a9e0117299 100644 --- a/requirements/pytorch/examples.txt +++ b/requirements/pytorch/examples.txt @@ -1,3 +1,3 @@ -torchvision>=0.10.*, <=0.12.0 +torchvision>=0.10.*, <=0.13.0 gym[classic_control]>=0.17.0, <0.24.2 ipython[all] <=8.1.1 diff --git a/tests/tests_pytorch/plugins/test_amp_plugins.py b/tests/tests_pytorch/plugins/test_amp_plugins.py index 132d13c054926..b02e3e29e9539 100644 --- a/tests/tests_pytorch/plugins/test_amp_plugins.py +++ b/tests/tests_pytorch/plugins/test_amp_plugins.py @@ 
-22,8 +22,14 @@ from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.plugins import ApexMixedPrecisionPlugin, NativeMixedPrecisionPlugin from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12 from tests_pytorch.helpers.runif import RunIf +if _TORCH_GREATER_EQUAL_1_12: + torch_test_assert_close = torch.testing.assert_close +else: + torch_test_assert_close = torch.testing.assert_allclose + class MyNativeAMP(NativeMixedPrecisionPlugin): pass @@ -98,13 +104,13 @@ def check_grads_unscaled(self, optimizer=None): grads = [p.grad for p in self.parameters()] assert len(grads) == len(self.original_grads) for actual, expected in zip(grads, self.original_grads): - torch.testing.assert_allclose(actual, expected) + torch_test_assert_close(actual, expected, equal_nan=True) def check_grads_clipped(self): parameters = list(self.parameters()) assert len(parameters) == len(self.clipped_parameters) for actual, expected in zip(parameters, self.clipped_parameters): - torch.testing.assert_allclose(actual.grad, expected.grad) + torch_test_assert_close(actual.grad, expected.grad, equal_nan=True) def on_before_optimizer_step(self, optimizer, *_): self.check_grads_unscaled(optimizer) diff --git a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py index 1ac7ad0b6660b..74f9534c47ce3 100644 --- a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py +++ b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py @@ -128,10 +128,9 @@ def test_fully_sharded_native_strategy_sync_batchnorm(tmpdir): @RunIf(min_cuda_gpus=1, skip_windows=True, standalone=True, min_torch="1.12") -@pytest.mark.parametrize("precision", [16, "bf16"]) +@pytest.mark.parametrize("precision", (16, pytest.param("bf16", marks=RunIf(bf16_cuda=True)))) def test_fully_sharded_native_strategy_checkpoint(tmpdir, precision): """Test to ensure that checkpoint is saved correctly when using a single GPU, and all stages can be run.""" - model = TestFSDPModel() trainer = Trainer( default_root_dir=tmpdir, accelerator="gpu", devices=1, strategy="fsdp_native", precision=precision, max_epochs=1 diff --git a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py index 6a2a2fc5d9111..1f955a2520faa 100644 --- a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py +++ b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py @@ -1232,7 +1232,7 @@ def on_test_batch_start( trainer.test(model) -@RunIf(min_cuda_gpus=2, min_torch="1.10.0", standalone=True, deepspeed=True) +@RunIf(min_cuda_gpus=2, min_torch="1.10.0", max_torch="1.12.0", standalone=True, deepspeed=True) def test_deepspeed_with_meta_device(tmpdir): with init_meta_context(): model = BoringModel() diff --git a/tests/tests_pytorch/strategies/test_sharded_strategy.py b/tests/tests_pytorch/strategies/test_sharded_strategy.py index bfd2a3abfc411..a047a10df32e3 100644 --- a/tests/tests_pytorch/strategies/test_sharded_strategy.py +++ b/tests/tests_pytorch/strategies/test_sharded_strategy.py @@ -41,7 +41,7 @@ def test_ddp_sharded_precision_16_clip_gradients(mock_oss_clip_grad_norm, clip_v @pytest.mark.parametrize( "strategy,expected", [("ddp_sharded", DDPShardedStrategy), ("ddp_sharded_spawn", DDPSpawnShardedStrategy)] ) -def test_sharded_ddp_choice(tmpdir, strategy, expected): +def test_sharded_ddp_choice(strategy, expected): 
"""Test to ensure that strategy is correctly chosen.""" trainer = Trainer(fast_dev_run=True, strategy=strategy) assert isinstance(trainer.strategy, expected) @@ -51,7 +51,7 @@ def test_sharded_ddp_choice(tmpdir, strategy, expected): @pytest.mark.parametrize( "strategy,expected", [("ddp_sharded", DDPShardedStrategy), ("ddp_sharded_spawn", DDPSpawnShardedStrategy)] ) -def test_ddp_choice_sharded_amp(tmpdir, strategy, expected): +def test_ddp_choice_sharded_amp(strategy, expected): """Test to ensure that plugin native amp plugin is correctly chosen when using sharded.""" trainer = Trainer(fast_dev_run=True, accelerator="gpu", devices=1, precision=16, strategy=strategy) assert isinstance(trainer.strategy, expected) @@ -201,27 +201,12 @@ def training_step(self, batch, batch_idx): @RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True, fairscale=True) -def test_ddp_sharded_strategy_manual_optimization_spawn(tmpdir): - # todo (sean): this test has been split out as running both tests using parametrize causes "Address in use" +@pytest.mark.parametrize("strategy", ("ddp_sharded", "ddp_sharded_spawn")) +def test_ddp_sharded_strategy_manual_optimization(tmpdir, strategy): model = ManualBoringModel() trainer = Trainer( default_root_dir=tmpdir, - strategy="ddp_sharded_spawn", - fast_dev_run=2, - accelerator="gpu", - devices=2, - enable_progress_bar=False, - enable_model_summary=False, - ) - trainer.fit(model) - - -@RunIf(min_cuda_gpus=2, skip_windows=True, standalone=True, fairscale=True) -def test_ddp_sharded_strategy_manual_optimization(tmpdir): - model = ManualBoringModel() - trainer = Trainer( - default_root_dir=tmpdir, - strategy="ddp_sharded", + strategy=strategy, fast_dev_run=2, accelerator="gpu", devices=2, @@ -268,7 +253,7 @@ def test_configure_ddp(tmpdir): @RunIf(skip_windows=True, fairscale=True) @mock.patch("pytorch_lightning.strategies.DDPShardedStrategy._wrap_optimizers", autospec=True) @pytest.mark.parametrize("cls", [DDPShardedStrategy, DDPSpawnShardedStrategy]) -def test_custom_kwargs_sharded(tmpdir, cls): +def test_custom_kwargs_sharded(_, cls): """Tests to ensure that if custom kwargs are passed, they are set correctly.""" strategy = cls(reduce_fp16=True) strategy.model = Mock(spec=LightningModule) @@ -287,7 +272,7 @@ def test_custom_kwargs_sharded(tmpdir, cls): @mock.patch("pytorch_lightning.strategies.DDPShardedStrategy._wrap_optimizers", autospec=True) @pytest.mark.parametrize(["params", "expected_buffer_size"], [(dict(), 0), (dict(reduce_buffer_size=128), 128)]) @pytest.mark.parametrize("num_nodes", [1, 2]) -def test_custom_kwargs_sharded_reduce_buffer_size(tmpdir, params, expected_buffer_size, num_nodes): +def test_custom_kwargs_sharded_reduce_buffer_size(_, params, expected_buffer_size, num_nodes): """Tests to ensure that ``reduce_buffer_size`` is correctly set based on user kwargs.""" strategy = DDPShardedStrategy(**params) strategy.num_nodes = num_nodes @@ -308,7 +293,7 @@ def test_custom_kwargs_sharded_reduce_buffer_size(tmpdir, params, expected_buffe @RunIf(skip_windows=True, fairscale=True) -def test_block_backward_sync(tmpdir): +def test_block_backward_sync(): strategy = DDPShardedStrategy() model = mock.MagicMock(spec=ShardedDataParallel) with mock.patch.object(strategy, "_model", model): diff --git a/tests/tests_pytorch/trainer/optimization/test_manual_optimization.py b/tests/tests_pytorch/trainer/optimization/test_manual_optimization.py index 43edff94b171a..a1a4dfca8666d 100644 --- a/tests/tests_pytorch/trainer/optimization/test_manual_optimization.py +++ 
b/tests/tests_pytorch/trainer/optimization/test_manual_optimization.py @@ -457,7 +457,7 @@ def check_grads_unscaled(self, optimizer=None): grads = [p.grad for p in self.parameters()] assert len(grads) == len(self.original_grads) for actual, expected in zip(grads, self.original_grads): - torch.testing.assert_allclose(actual, expected) + torch_test_assert_close(actual, expected) def on_before_optimizer_step(self, optimizer, *_): self.check_grads_unscaled(optimizer) diff --git a/tests/tests_pytorch/utilities/test_meta.py b/tests/tests_pytorch/utilities/test_meta.py index b19483e29bbe2..f7fcce4cb835e 100644 --- a/tests/tests_pytorch/utilities/test_meta.py +++ b/tests/tests_pytorch/utilities/test_meta.py @@ -33,7 +33,7 @@ def __init__(self, num_layers: int): self.layer = nn.Sequential(*[nn.Linear(1, 1) for _ in range(self.hparams.num_layers)]) -@RunIf(min_torch="1.10.0", standalone=True) +@RunIf(min_torch="1.10.0", max_torch="1.12.0", standalone=True) def test_init_meta_context(): with init_meta_context(): @@ -72,7 +72,7 @@ def test_init_meta_context(): assert m.weight.device.type == "cpu" -@RunIf(min_torch="1.10.0", standalone=True) +@RunIf(min_torch="1.10.0", max_torch="1.12.0", standalone=True) def test_materialize_module_recursive_child(): """Test materialize_module doesn't set a child recursively to a model instantiated within init_meta_context.""" with init_meta_context(): From 2a659344b3d1cc9c26e86e3f59bc0d44dbe5f783 Mon Sep 17 00:00:00 2001 From: Dhruv Nair Date: Thu, 28 Jul 2022 19:57:58 +0530 Subject: [PATCH 033/230] Comet logger documentation improvements (#13847) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- src/pytorch_lightning/loggers/comet.py | 82 +++++++++++++++++++++++++- 1 file changed, 81 insertions(+), 1 deletion(-) diff --git a/src/pytorch_lightning/loggers/comet.py b/src/pytorch_lightning/loggers/comet.py index 363d47c1166e6..ed4fb2f8f2c1a 100644 --- a/src/pytorch_lightning/loggers/comet.py +++ b/src/pytorch_lightning/loggers/comet.py @@ -53,7 +53,8 @@ class CometLogger(Logger): r""" - Log using `Comet.ml `_. + Track your parameters, metrics, source code and more using + `Comet `_. Install it with pip: @@ -99,6 +100,85 @@ class CometLogger(Logger): ) trainer = Trainer(logger=comet_logger) + **Log Hyperparameters:** + + Log parameters used to initialize a :class:`~pytorch_lightning.core.module.LightningModule`: + + .. code-block:: python + + class LitModule(LightningModule): + def __init__(self, *args, **kwarg): + self.save_hyperparameters() + + Log other Experiment Parameters + + .. code-block:: python + + # log a single parameter + logger.log_hyperparams({"batch_size": 16}) + + # log multiple parameters + logger.log_hyperparams({"batch_size": 16, "learning_rate": 0.001}) + + **Log Metrics:** + + .. code-block:: python + + # log a single metric + logger.log_metrics({"train/loss": 0.001}) + + # add multiple metrics + logger.log_metrics({"train/loss": 0.001, "val/loss": 0.002}) + + **Access the Comet Experiment object:** + + You can gain access to the underlying Comet + `Experiment `__ object + and its methods through the :obj:`logger.experiment` property. This will let you use + the additional logging features provided by the Comet SDK. + + Some examples of data you can log through the Experiment object: + + Log Image data: + + .. code-block:: python + + img = PIL.Image.open("") + logger.experiment.log_image(img, file_name="my_image.png") + + Log Text data: + + .. 
code-block:: python + + text = "Lightning is awesome!" + logger.experiment.log_text(text) + + Log Audio data: + + .. code-block:: python + + audio = "" + logger.experiment.log_audio(audio, file_name="my_audio.wav") + + Log arbitary data assets: + + You can log any type of data to Comet as an asset. These can be model + checkpoints, datasets, debug logs, etc. + + .. code-block:: python + + logger.experiment.log_asset("", file_name="my_data.pkl") + + Log Models to Comet's Model Registry: + + .. code-block:: python + + logger.experiment.log_model(name="my-model", "") + + See Also: + - `Demo in Google Colab `__ + - `Comet Documentation `__ + Args: api_key: Required in online mode. API key, found on Comet.ml. If not given, this will be loaded from the environment variable COMET_API_KEY or ~/.comet.config From 406cea7146d5d056d0e2fd2d3356a7d4f7eb0d8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 28 Jul 2022 16:38:51 +0200 Subject: [PATCH 034/230] Support DeepSpeed <0.7.0 (#13859) Co-authored-by: awaelchli --- .azure/gpu-tests.yml | 2 +- requirements/pytorch/strategies.txt | 2 +- src/pytorch_lightning/lite/lite.py | 15 +++++++++++++++ .../utilities/deepspeed_model_summary.py | 8 +++++++- tests/tests_pytorch/lite/test_lite.py | 13 ++++++++++++- 5 files changed, 36 insertions(+), 4 deletions(-) diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index 33d03688171b1..f37c17613affc 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -75,7 +75,7 @@ jobs: CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))") pip install "bagua-cuda$CUDA_VERSION_MM>=0.9.0" pip install -e .[strategies] - pip install deepspeed==0.6.4 # TODO: remove when docker images are upgraded + pip install deepspeed>0.6.4 # TODO: remove when docker images are upgraded pip install --requirement requirements/pytorch/devel.txt pip list env: diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt index db29ce556e839..4eafac99b8c66 100644 --- a/requirements/pytorch/strategies.txt +++ b/requirements/pytorch/strategies.txt @@ -1,5 +1,5 @@ fairscale>=0.4.5, <=0.4.6 -deepspeed>=0.6.0, <0.6.5 +deepspeed>=0.6.0, <0.7.0 # no need to install with [pytorch] as pytorch is already installed horovod>=0.21.2, !=0.24.0, <0.25.1 hivemind>=1.0.1, <=1.0.1; sys_platform == 'linux' diff --git a/src/pytorch_lightning/lite/lite.py b/src/pytorch_lightning/lite/lite.py index 981eed30635f6..5125bf4486a9d 100644 --- a/src/pytorch_lightning/lite/lite.py +++ b/src/pytorch_lightning/lite/lite.py @@ -40,6 +40,7 @@ has_iterable_dataset, ) from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.imports import _RequirementAvailable from pytorch_lightning.utilities.seed import seed_everything @@ -105,6 +106,8 @@ def __init__( self._precision_plugin = self._strategy.precision_plugin self._models_setup: int = 0 + self._check_deepspeed_support() + # wrap the run method so we can inject setup logic or spawn processes for the user setattr(self, "run", partial(self._run_impl, self.run)) @@ -456,6 +459,18 @@ def _check_strategy_support(self, strategy: Optional[Union[str, Strategy]]) -> N f" Choose one of {supported} or pass in a `Strategy` instance." 
) + def _check_deepspeed_support(self) -> None: + if ( + isinstance(self._strategy, DeepSpeedStrategy) + and self._strategy.zero_stage_3 + and _RequirementAvailable("deepspeed>=0.6.5") + ): + # https://github.com/microsoft/DeepSpeed/issues/2139 + raise RuntimeError( + "DeepSpeed ZeRO-3 is not supported with this version of Lightning Lite and `deepspeed>=0.6.5`." + " Please downgrade deepspeed to 0.6.4 or check if a newer version of Lightning is available." + ) + @staticmethod def _supported_device_types() -> Sequence[_AcceleratorType]: return ( diff --git a/src/pytorch_lightning/utilities/deepspeed_model_summary.py b/src/pytorch_lightning/utilities/deepspeed_model_summary.py index 89dd6a9f9a25f..45d55392df51d 100644 --- a/src/pytorch_lightning/utilities/deepspeed_model_summary.py +++ b/src/pytorch_lightning/utilities/deepspeed_model_summary.py @@ -17,7 +17,9 @@ from typing import Dict, List, Tuple import torch +from torch.nn import Parameter +from pytorch_lightning.utilities.imports import _RequirementAvailable from pytorch_lightning.utilities.model_summary import ( _is_lazy_weight_tensor, get_human_readable_count, @@ -40,7 +42,11 @@ def num_parameters(self) -> int: @property def average_shard_parameters(self) -> int: """Returns the number of parameters in this module.""" - return sum(p.partitioned_size() if not _is_lazy_weight_tensor(p) else 0 for p in self._module.parameters()) + + def partitioned_size(p: Parameter) -> int: + return p.partitioned_size() if _RequirementAvailable("deepspeed<0.6.6") else p.partition_numel() + + return sum(partitioned_size(p) if not _is_lazy_weight_tensor(p) else 0 for p in self._module.parameters()) class DeepSpeedSummary(ModelSummary): diff --git a/tests/tests_pytorch/lite/test_lite.py b/tests/tests_pytorch/lite/test_lite.py index 86a0a5a82195a..2215ab3129780 100644 --- a/tests/tests_pytorch/lite/test_lite.py +++ b/tests/tests_pytorch/lite/test_lite.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import contextlib import os from copy import deepcopy from unittest import mock @@ -29,6 +30,7 @@ from pytorch_lightning.strategies import DeepSpeedStrategy, Strategy from pytorch_lightning.utilities import _StrategyType from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.imports import _RequirementAvailable from pytorch_lightning.utilities.seed import pl_worker_init_function from tests_pytorch.helpers.runif import RunIf @@ -478,4 +480,13 @@ def run(self): assert self.broadcast(True) assert self.is_global_zero == (self.local_rank == 0) - Lite(strategy=DeepSpeedStrategy(stage=3, logging_batch_size_per_gpu=1), devices=2, accelerator="gpu").run() + if _RequirementAvailable("deepspeed>=0.6.5"): + # https://github.com/microsoft/DeepSpeed/issues/2139 + raise_if_deepspeed_incompatible = pytest.raises( + RuntimeError, match="DeepSpeed ZeRO-3 is not supported with this version of Lightning Lite" + ) + else: + raise_if_deepspeed_incompatible = contextlib.suppress() + + with raise_if_deepspeed_incompatible: + Lite(strategy=DeepSpeedStrategy(stage=3, logging_batch_size_per_gpu=1), devices=2, accelerator="gpu").run() From 3f74ec71cee9703f411aa6b8d789fef8de809899 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Thu, 28 Jul 2022 22:31:24 +0530 Subject: [PATCH 035/230] Update MPS availability to include check for ARM processors (#13896) Update MPS availability to include check for ARM chips --- src/pytorch_lightning/accelerators/mps.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/pytorch_lightning/accelerators/mps.py b/src/pytorch_lightning/accelerators/mps.py index 5c35b618b55fc..3a7178f0623c2 100644 --- a/src/pytorch_lightning/accelerators/mps.py +++ b/src/pytorch_lightning/accelerators/mps.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import platform from typing import Any, Dict, List, Optional, Union import torch @@ -21,7 +22,9 @@ from pytorch_lightning.utilities.imports import _PSUTIL_AVAILABLE, _TORCH_GREATER_EQUAL_1_12 from pytorch_lightning.utilities.types import _DEVICE -_MPS_AVAILABLE = _TORCH_GREATER_EQUAL_1_12 and torch.backends.mps.is_available() +# For using the `MPSAccelerator`, user's machine should have `torch>=1.12`, Metal programming framework and +# the ARM-based Apple Silicon processors. 
+_MPS_AVAILABLE = _TORCH_GREATER_EQUAL_1_12 and torch.backends.mps.is_available() and platform.platform() == "arm" class MPSAccelerator(Accelerator): From 25203d4c81eb9c269e44fd5faf50930e0404e7bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 28 Jul 2022 19:23:29 +0200 Subject: [PATCH 036/230] Organize model summary utilities (#13893) --- .../callbacks/model_summary.py | 4 ++-- .../utilities/model_summary/__init__.py | 22 +++++++++++++++++++ .../{ => model_summary}/model_summary.py | 0 .../model_summary_deepspeed.py} | 2 +- .../utilities/test_deepspeed_model_summary.py | 2 +- .../utilities/test_model_summary.py | 2 +- 6 files changed, 27 insertions(+), 5 deletions(-) create mode 100644 src/pytorch_lightning/utilities/model_summary/__init__.py rename src/pytorch_lightning/utilities/{ => model_summary}/model_summary.py (100%) rename src/pytorch_lightning/utilities/{deepspeed_model_summary.py => model_summary/model_summary_deepspeed.py} (98%) diff --git a/src/pytorch_lightning/callbacks/model_summary.py b/src/pytorch_lightning/callbacks/model_summary.py index 1b3082a6a9d72..5b7c1be91e5b7 100644 --- a/src/pytorch_lightning/callbacks/model_summary.py +++ b/src/pytorch_lightning/callbacks/model_summary.py @@ -26,10 +26,10 @@ import pytorch_lightning as pl from pytorch_lightning.callbacks.callback import Callback -from pytorch_lightning.utilities.deepspeed_model_summary import DeepSpeedSummary -from pytorch_lightning.utilities.model_summary import _format_summary_table +from pytorch_lightning.utilities.model_summary import DeepSpeedSummary from pytorch_lightning.utilities.model_summary import ModelSummary as Summary from pytorch_lightning.utilities.model_summary import summarize +from pytorch_lightning.utilities.model_summary.model_summary import _format_summary_table log = logging.getLogger(__name__) diff --git a/src/pytorch_lightning/utilities/model_summary/__init__.py b/src/pytorch_lightning/utilities/model_summary/__init__.py new file mode 100644 index 0000000000000..a78154ef222bc --- /dev/null +++ b/src/pytorch_lightning/utilities/model_summary/__init__.py @@ -0,0 +1,22 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from pytorch_lightning.utilities.model_summary.model_summary import ( # noqa: F401 + get_formatted_model_size, + get_human_readable_count, + LayerSummary, + ModelSummary, + parse_batch_shape, + summarize, +) +from pytorch_lightning.utilities.model_summary.model_summary_deepspeed import DeepSpeedSummary # noqa: F401 diff --git a/src/pytorch_lightning/utilities/model_summary.py b/src/pytorch_lightning/utilities/model_summary/model_summary.py similarity index 100% rename from src/pytorch_lightning/utilities/model_summary.py rename to src/pytorch_lightning/utilities/model_summary/model_summary.py diff --git a/src/pytorch_lightning/utilities/deepspeed_model_summary.py b/src/pytorch_lightning/utilities/model_summary/model_summary_deepspeed.py similarity index 98% rename from src/pytorch_lightning/utilities/deepspeed_model_summary.py rename to src/pytorch_lightning/utilities/model_summary/model_summary_deepspeed.py index 45d55392df51d..5fc189ccf99b2 100644 --- a/src/pytorch_lightning/utilities/deepspeed_model_summary.py +++ b/src/pytorch_lightning/utilities/model_summary/model_summary_deepspeed.py @@ -20,7 +20,7 @@ from torch.nn import Parameter from pytorch_lightning.utilities.imports import _RequirementAvailable -from pytorch_lightning.utilities.model_summary import ( +from pytorch_lightning.utilities.model_summary.model_summary import ( _is_lazy_weight_tensor, get_human_readable_count, LayerSummary, diff --git a/tests/tests_pytorch/utilities/test_deepspeed_model_summary.py b/tests/tests_pytorch/utilities/test_deepspeed_model_summary.py index 59733325a741c..d0457699f9357 100644 --- a/tests/tests_pytorch/utilities/test_deepspeed_model_summary.py +++ b/tests/tests_pytorch/utilities/test_deepspeed_model_summary.py @@ -16,7 +16,7 @@ from pytorch_lightning import Callback, Trainer from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.strategies import DeepSpeedStrategy -from pytorch_lightning.utilities.deepspeed_model_summary import DeepSpeedSummary +from pytorch_lightning.utilities.model_summary import DeepSpeedSummary from tests_pytorch.helpers.runif import RunIf diff --git a/tests/tests_pytorch/utilities/test_model_summary.py b/tests/tests_pytorch/utilities/test_model_summary.py index daaf929fd07ff..5445f6dfdff70 100644 --- a/tests/tests_pytorch/utilities/test_model_summary.py +++ b/tests/tests_pytorch/utilities/test_model_summary.py @@ -19,7 +19,7 @@ from pytorch_lightning import LightningModule, Trainer from pytorch_lightning.demos.boring_classes import BoringModel -from pytorch_lightning.utilities.model_summary import ModelSummary, summarize, UNKNOWN_SIZE +from pytorch_lightning.utilities.model_summary.model_summary import ModelSummary, summarize, UNKNOWN_SIZE from tests_pytorch.helpers.advanced_models import ParityModuleRNN from tests_pytorch.helpers.runif import RunIf From 233b36b185ae71b480ddd5c87ff5d12cd70c88fe Mon Sep 17 00:00:00 2001 From: donlapark <10988155+donlapark@users.noreply.github.com> Date: Fri, 29 Jul 2022 01:49:24 +0700 Subject: [PATCH 037/230] Fix mypy errors in strategies/ddp_spawn.py (#13865) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- pyproject.toml | 1 - src/pytorch_lightning/strategies/ddp_spawn.py | 73 +++++++++++++------ 2 files changed, 49 insertions(+), 25 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 32cc6e8452d25..05eba62c50402 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -62,7 +62,6 @@ module = [ 
"pytorch_lightning.profilers.pytorch", "pytorch_lightning.profilers.simple", "pytorch_lightning.strategies.ddp", - "pytorch_lightning.strategies.ddp_spawn", "pytorch_lightning.strategies.fully_sharded", "pytorch_lightning.strategies.ipu", "pytorch_lightning.strategies.sharded", diff --git a/src/pytorch_lightning/strategies/ddp_spawn.py b/src/pytorch_lightning/strategies/ddp_spawn.py index fdb0a7d851169..6a3460febbf07 100644 --- a/src/pytorch_lightning/strategies/ddp_spawn.py +++ b/src/pytorch_lightning/strategies/ddp_spawn.py @@ -14,7 +14,7 @@ import logging import os from datetime import timedelta -from typing import Any, Dict, List, Optional, Union +from typing import Any, Callable, Dict, List, Optional, Union import torch import torch.distributed @@ -26,12 +26,14 @@ import pytorch_lightning as pl from pytorch_lightning.overrides import LightningDistributedModule +from pytorch_lightning.overrides.base import _LightningPrecisionModuleWrapperBase from pytorch_lightning.overrides.distributed import prepare_for_backward from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.launchers.multiprocessing import _MultiProcessingLauncher from pytorch_lightning.strategies.parallel import ParallelStrategy +from pytorch_lightning.strategies.strategy import TBroadcast from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.distributed import ( _get_process_group_backend_from_env, @@ -49,7 +51,7 @@ from pytorch_lightning.utilities.optimizer import optimizers_to_device from pytorch_lightning.utilities.rank_zero import rank_zero_info, rank_zero_only from pytorch_lightning.utilities.seed import reset_seed -from pytorch_lightning.utilities.types import STEP_OUTPUT +from pytorch_lightning.utilities.types import PredictStep, STEP_OUTPUT, TestStep, ValidationStep log = logging.getLogger(__name__) @@ -75,8 +77,8 @@ def __init__( checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None, ddp_comm_state: Optional[object] = None, - ddp_comm_hook: Optional[callable] = None, - ddp_comm_wrapper: Optional[callable] = None, + ddp_comm_hook: Optional[Callable] = None, + ddp_comm_wrapper: Optional[Callable] = None, process_group_backend: Optional[str] = None, timeout: Optional[timedelta] = default_pg_timeout, start_method: Literal["spawn", "fork", "forkserver"] = "spawn", @@ -113,32 +115,36 @@ def local_rank(self) -> int: return self._local_rank @property - def root_device(self): + def root_device(self) -> torch.device: + assert self.parallel_devices is not None return self.parallel_devices[self.local_rank] @property - def num_processes(self): + def num_processes(self) -> int: return len(self.parallel_devices) if self.parallel_devices is not None else 0 @property - def distributed_sampler_kwargs(self): + def distributed_sampler_kwargs(self) -> Dict[str, int]: distributed_sampler_kwargs = dict(num_replicas=(self.num_nodes * self.num_processes), rank=self.global_rank) return distributed_sampler_kwargs @property - def _is_single_process_single_device(self): + def _is_single_process_single_device(self) -> bool: return True @property def process_group_backend(self) -> Optional[str]: return self._process_group_backend - def _configure_launcher(self): + def _configure_launcher(self) -> None: self._launcher = _MultiProcessingLauncher(self, 
start_method=self._start_method) def setup(self, trainer: "pl.Trainer") -> None: + + assert self.cluster_environment is not None os.environ["MASTER_PORT"] = str(self.cluster_environment.main_port) + assert self.accelerator is not None self.accelerator.setup(trainer) # move the model to the correct device @@ -148,6 +154,7 @@ def setup(self, trainer: "pl.Trainer") -> None: trainer_fn = trainer.state.fn if trainer_fn == TrainerFn.FITTING: if self._layer_sync: + assert self.model is not None self.model = self._layer_sync.apply(self.model) self.setup_precision_plugin() @@ -167,11 +174,12 @@ def set_world_ranks(self, process_idx: int = 0) -> None: self.cluster_environment.set_world_size(self.num_nodes * self.num_processes) rank_zero_only.rank = self.cluster_environment.global_rank() - def _worker_setup(self, process_idx: int): + def _worker_setup(self, process_idx: int) -> None: reset_seed() self.set_world_ranks(process_idx) rank_zero_only.rank = self.global_rank self._process_group_backend = self._get_process_group_backend() + assert self.cluster_environment is not None init_dist_connection( self.cluster_environment, self._process_group_backend, @@ -187,7 +195,7 @@ def _get_process_group_backend(self) -> str: or get_default_process_group_backend_for_device(self.root_device) ) - def pre_configure_ddp(self): + def pre_configure_ddp(self) -> None: # if unset, default `find_unused_parameters` `True` # Many models require setting this parameter to True, as there are corner cases # when not all parameter backward hooks are fired by the autograd engine even if require_grad is set to True. @@ -198,6 +206,7 @@ def _register_ddp_hooks(self) -> None: # currently, DDP communication hooks only work with NCCL backend and SPSD (single process single device) mode # https://github.com/pytorch/pytorch/blob/v1.8.0/torch/nn/parallel/distributed.py#L1080-L1084 if self.root_device.type == "cuda" and self._is_single_process_single_device: + assert isinstance(self.model, DistributedDataParallel) register_ddp_comm_hook( model=self.model, ddp_comm_state=self._ddp_comm_state, @@ -207,19 +216,21 @@ def _register_ddp_hooks(self) -> None: def configure_ddp(self) -> None: self.pre_configure_ddp() + assert isinstance(self.model, (pl.LightningModule, _LightningPrecisionModuleWrapperBase)) self.model = self._setup_model(LightningDistributedModule(self.model)) self._register_ddp_hooks() # set up optimizers after the wrapped module has been moved to the device + assert self.lightning_module is not None self.setup_optimizers(self.lightning_module.trainer) optimizers_to_device(self.optimizers, self.root_device) - def determine_ddp_device_ids(self): + def determine_ddp_device_ids(self) -> Optional[List[int]]: if self.root_device.type == "cpu": return None return [self.root_device.index] - def barrier(self, *args, **kwargs) -> None: + def barrier(self, *args: Any, **kwargs: Any) -> None: if not distributed_available(): return if torch.distributed.get_backend() == "nccl": @@ -227,27 +238,32 @@ def barrier(self, *args, **kwargs) -> None: else: torch.distributed.barrier() - def broadcast(self, obj: object, src: int = 0) -> object: + def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast: if not distributed_available(): return obj obj = [obj] if self.global_rank != src: - obj = [None] + obj = [None] # type: ignore[list-item] torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD) return obj[0] - def model_to_device(self): + def model_to_device(self) -> None: if self.root_device.type == "cuda": # set the device on 
the spawned subprocesses torch.cuda.set_device(self.root_device) + assert self.model is not None self.model.to(self.root_device) def pre_backward(self, closure_loss: Tensor) -> None: """Run before precision plugin executes backward.""" + assert self.lightning_module is not None if not self.lightning_module.automatic_optimization: + assert isinstance(self.model, DistributedDataParallel) prepare_for_backward(self.model, closure_loss) - def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Union[ReduceOp, str] = "mean") -> Tensor: + def reduce( + self, tensor: Tensor, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = "mean" + ) -> Tensor: """Reduces a tensor from several distributed processes to one aggregated tensor. Args: @@ -263,30 +279,38 @@ def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Union[ReduceOp, tensor = sync_ddp_if_available(tensor, group, reduce_op=reduce_op) return tensor - def training_step(self, *args, **kwargs) -> STEP_OUTPUT: + def training_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: + assert self.model is not None with self.precision_plugin.train_step_context(): return self.model(*args, **kwargs) - def validation_step(self, *args, **kwargs) -> Optional[STEP_OUTPUT]: + def validation_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: with self.precision_plugin.val_step_context(): + assert self.lightning_module is not None + assert self.model is not None if self.lightning_module.trainer.state.fn == TrainerFn.FITTING: # used when calling `trainer.fit` return self.model(*args, **kwargs) else: # used when calling `trainer.validate` + assert isinstance(self.model, ValidationStep) return self.model.validation_step(*args, **kwargs) - def test_step(self, *args, **kwargs) -> Optional[STEP_OUTPUT]: + def test_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: with self.precision_plugin.test_step_context(): + assert isinstance(self.model, TestStep) return self.model.test_step(*args, **kwargs) - def predict_step(self, *args, **kwargs) -> STEP_OUTPUT: + def predict_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: with self.precision_plugin.predict_step_context(): + assert isinstance(self.model, PredictStep) return self.model.predict_step(*args, **kwargs) - def post_training_step(self): + def post_training_step(self) -> None: + assert self.lightning_module is not None if not self.lightning_module.automatic_optimization: - self.model.require_backward_grad_sync = True + assert self.model is not None + self.model.require_backward_grad_sync = True # type: ignore[assignment] @classmethod def register_strategies(cls, strategy_registry: Dict) -> None: @@ -315,7 +339,7 @@ def teardown(self) -> None: if ( _TORCH_GREATER_EQUAL_1_11 and not self.model.static_graph - and self.model._get_ddp_logging_data().get("can_set_static_graph") + and self.model._get_ddp_logging_data().get("can_set_static_graph") # type: ignore[operator] ): rank_zero_info( "Your model can run with static graph optimizations. 
For future training runs, we suggest you" @@ -332,5 +356,6 @@ def teardown(self) -> None: and pl_module._trainer.state.fn == TrainerFn.FITTING and self._layer_sync ): + assert self.model is not None self.model = self._layer_sync.revert(self.model) super().teardown() From 07b39c257bac44d82a5d704d79ef103c829597a2 Mon Sep 17 00:00:00 2001 From: HMellor Date: Thu, 28 Jul 2022 20:26:41 +0100 Subject: [PATCH 038/230] Cast on host instead of IPU when using `precision=16` (#13880) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí Co-authored-by: Rohit Gupta --- src/pytorch_lightning/CHANGELOG.md | 3 ++ src/pytorch_lightning/strategies/ipu.py | 29 ++++++++++++------- tests/tests_pytorch/accelerators/test_ipu.py | 3 +- .../deprecated_api/test_remove_1-8.py | 8 +++++ 4 files changed, 31 insertions(+), 12 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index baf01371fb8bc..f37f03000c148 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -387,6 +387,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Used `global_step` while restoring logging step for old checkpoints ([#13645](https://github.com/Lightning-AI/lightning/pull/13645)) +- When training with `precision=16` on IPU, the cast has been moved off the IPU onto the host, making the copies from host to IPU cheaper ([#13880](https://github.com/Lightning-AI/lightning/pull/13880)) + + - Fixed error handling in learning rate finder when not enough data points are available to give a good suggestion ([#13845](https://github.com/Lightning-AI/lightning/pull/13845)) diff --git a/src/pytorch_lightning/strategies/ipu.py b/src/pytorch_lightning/strategies/ipu.py index 001ad77fbb5cc..82ba4ad227f7c 100644 --- a/src/pytorch_lightning/strategies/ipu.py +++ b/src/pytorch_lightning/strategies/ipu.py @@ -33,6 +33,7 @@ from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.model_helpers import is_overridden +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation from pytorch_lightning.utilities.types import STEP_OUTPUT if _POPTORCH_AVAILABLE: @@ -45,6 +46,7 @@ class LightningIPUModule(_LightningModuleWrapperBase): def __init__( self, pl_module: Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase], precision: Union[str, int] ) -> None: + rank_zero_deprecation("`LightningIPUModule` has been deprecated in v1.7.0 and will be removed in v1.8.0") super().__init__(pl_module) self.precision = precision @@ -142,8 +144,7 @@ def setup(self, trainer: "pl.Trainer") -> None: self._optimizer_zero_grad_original = self.lightning_module.optimizer_zero_grad self._disable_zero_grad() - model = LightningIPUModule(self.lightning_module, self.precision_plugin.precision) - self.model = model + self.model = _LightningModuleWrapperBase(self.lightning_module) # reset the backup self.poptorch_models = {} @@ -156,22 +157,22 @@ def setup(self, trainer: "pl.Trainer") -> None: training_opts = self.training_opts inference_opts = self.inference_opts optimizer = self.lightning_module.trainer.optimizers[0] - model = poptorch.trainingModel(model=model, options=training_opts, optimizer=optimizer) + model = poptorch.trainingModel(model=self.model, options=training_opts, optimizer=optimizer) self.poptorch_models[RunningStage.TRAINING] = model if 
self.lightning_module.trainer.enable_validation: - model = poptorch.inferenceModel(model=model, options=inference_opts) + model = poptorch.inferenceModel(model=self.model, options=inference_opts) self.poptorch_models[RunningStage.VALIDATING] = model if self.lightning_module.trainer.num_sanity_val_steps > 0: self.poptorch_models[RunningStage.SANITY_CHECKING] = model elif trainer_fn == TrainerFn.VALIDATING: - model = poptorch.inferenceModel(model=model, options=self.inference_opts) + model = poptorch.inferenceModel(model=self.model, options=self.inference_opts) self.poptorch_models[RunningStage.VALIDATING] = model elif trainer_fn == TrainerFn.TESTING: - model = poptorch.inferenceModel(model=model, options=self.inference_opts) + model = poptorch.inferenceModel(model=self.model, options=self.inference_opts) self.poptorch_models[RunningStage.TESTING] = model elif trainer_fn == TrainerFn.PREDICTING: - model = poptorch.inferenceModel(model=model, options=self.inference_opts) + model = poptorch.inferenceModel(model=self.model, options=self.inference_opts) self.poptorch_models[RunningStage.PREDICTING] = model def setup_optimizers(self, trainer: "pl.Trainer") -> None: @@ -219,10 +220,6 @@ def inference_opts(self) -> "poptorch.Options": self._inference_opts = self._create_opts(training=False) return self._inference_opts - @property - def lightning_module(self) -> Optional["pl.LightningModule"]: - return self.model.module if isinstance(self.model, LightningIPUModule) else self.model - def _convert_to_poptorch_loader( self, dataloader: DataLoader, sampler, mode: Optional[RunningStage] = None ) -> "poptorch.DataLoader": @@ -272,6 +269,16 @@ def to_tensor(x): args = apply_to_collection(args, dtype=(int, float), function=to_tensor) return args + def batch_to_device(self, batch: Any, device: Optional[torch.device] = None, dataloader_idx: int = 0) -> Any: + # This override is necessary because the cast must occur before the data + # is moved to the device to prevent wasteful host->device copies. + if self.precision_plugin.precision in (PrecisionType.MIXED, PrecisionType.HALF): + batch = apply_to_collection(batch, Tensor, function=Tensor.half) + # We don't call `super().batch_to_device` because `data.to(device)` is not + # currently necessary for IPUs. The movement of data from host<->IPU is + # currently handled by PopTorch. 
+ return batch + def _disable_zero_grad(self) -> None: lightning_module = self.lightning_module if is_overridden("optimizer_zero_grad", lightning_module): diff --git a/tests/tests_pytorch/accelerators/test_ipu.py b/tests/tests_pytorch/accelerators/test_ipu.py index 248ac0dbb1818..9d510d9d60e9e 100644 --- a/tests/tests_pytorch/accelerators/test_ipu.py +++ b/tests/tests_pytorch/accelerators/test_ipu.py @@ -205,7 +205,7 @@ def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[st def test_pure_half_precision(tmpdir): class TestCallback(Callback): def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: - assert trainer.strategy.model.precision == 16 + assert trainer.strategy.precision_plugin.precision == 16 for param in trainer.strategy.model.parameters(): assert param.dtype == torch.float16 raise SystemExit @@ -219,6 +219,7 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: assert isinstance(trainer.strategy, IPUStrategy) assert isinstance(trainer.strategy.precision_plugin, IPUPrecisionPlugin) assert trainer.strategy.precision_plugin.precision == 16 + assert trainer.strategy.batch_to_device(torch.zeros((1), dtype=torch.float)).dtype == torch.half with pytest.raises(SystemExit): trainer.fit(model) diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index 6da335383e11e..0d9fc1e7a2baf 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -43,6 +43,7 @@ from pytorch_lightning.profiler import AbstractProfiler, BaseProfiler from pytorch_lightning.profilers import AdvancedProfiler, Profiler, SimpleProfiler from pytorch_lightning.strategies import DDP2Strategy, ParallelStrategy +from pytorch_lightning.strategies.ipu import LightningIPUModule from pytorch_lightning.trainer.configuration_validator import _check_datamodule_checkpoint_hooks from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities import device_parser @@ -1006,6 +1007,13 @@ def test_trainer_config_ipus(monkeypatch, trainer_kwargs, expected_ipus): trainer.ipus == expected_ipus +@mock.patch("pytorch_lightning.accelerators.ipu.IPUAccelerator.is_available", return_value=True) +def test_v1_8_0_deprecated_lightning_ipu_module(_, monkeypatch): + monkeypatch.setattr(pytorch_lightning.strategies.ipu, "_IPU_AVAILABLE", True) + with pytest.deprecated_call(match=r"has been deprecated in v1.7.0 and will be removed in v1.8."): + _ = LightningIPUModule(BoringModel(), 32) + + @pytest.mark.parametrize( ["trainer_kwargs", "expected_num_processes"], [ From 7708ce22b219534b3a3be0c2c5b5f65ca98c6058 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 28 Jul 2022 22:08:07 +0200 Subject: [PATCH 039/230] Update GitHub links to PL repo (#13849) * update lightning links in docs * update links in chlog * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Apply suggestions from code review Co-authored-by: Rohit Gupta * Update src/pytorch_lightning/README.md Co-authored-by: Rohit Gupta * Update src/pytorch_lightning/README.md Co-authored-by: Rohit Gupta * update * painful * badges * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update badges Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Rohit Gupta Co-authored-by: Jirka 
Co-authored-by: Akihiro Nitta --- .github/CONTRIBUTING.md | 8 +- .github/workflows/README.md | 10 +- README.md | 51 +- .../_templates/theme_variables.jinja | 8 +- docs/source-pytorch/starter/installation.rst | 4 +- src/lightning_app/CHANGELOG.md | 2 +- src/pytorch_lightning/CHANGELOG.md | 4948 ++++++++--------- src/pytorch_lightning/README.md | 51 +- tests/README.md | 4 +- 9 files changed, 2543 insertions(+), 2543 deletions(-) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 7bec2d8763afd..83f7a7252f625 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -202,7 +202,7 @@ We recommend creating a PR in a separate branch other than `master`, especially First, make sure you have set [upstream](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/configuring-a-remote-for-a-fork) by running: ```bash -git remote add upstream https://github.com/PyTorchLightning/pytorch-lightning.git +git remote add upstream https://github.com/Lightning-AI/lightning.git ``` You'll know its set up right if you run `git remote -v` and see something similar to this: @@ -210,8 +210,8 @@ You'll know its set up right if you run `git remote -v` and see something simila ```bash origin https://github.com/{YOUR_USERNAME}/pytorch-lightning.git (fetch) origin https://github.com/{YOUR_USERNAME}/pytorch-lightning.git (push) -upstream https://github.com/PyTorchLightning/pytorch-lightning.git (fetch) -upstream https://github.com/PyTorchLightning/pytorch-lightning.git (push) +upstream https://github.com/Lightning-AI/lightning.git (fetch) +upstream https://github.com/Lightning-AI/lightning.git (push) ``` Checkout your feature branch and rebase it with upstream's master before pushing up your feature branch: @@ -319,7 +319,7 @@ NOTE: Once you edit one of these files, remember to `source` it or restart your plclone (){ git clone https://github.com/{YOUR_USERNAME}/pytorch-lightning.git cd pytorch-lightning - git remote add upstream https://github.com/PyTorchLightning/pytorch-lightning.git + git remote add upstream https://github.com/Lightning-AI/lightning.git # This is just here to print out info about your remote upstream/origin git remote -v } diff --git a/.github/workflows/README.md b/.github/workflows/README.md index d39d096b26f0f..d67bf92d6c048 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -29,11 +29,11 @@ ## Code Quality -| workflow file | action | -| --------------------------------- | ----------------------------------------------------------------------------------------------------- | -| .codecov.yml | Measure test coverage with [codecov.io](https://app.codecov.io/gh/PyTorchLightning/pytorch-lightning) | -| .github/workflows/code-checks.yml | Check Python typing with [MyPy](https://mypy.readthedocs.io/en/stable/). | -| .github/workflows/ci_schema.yml | Validate the syntax of workflow files. | +| workflow file | action | +| --------------------------------- | ----------------------------------------------------------------------------------------- | +| .codecov.yml | Measure test coverage with [codecov.io](https://app.codecov.io/gh/Lightning-AI/lightning) | +| .github/workflows/code-checks.yml | Check Python typing with [MyPy](https://mypy.readthedocs.io/en/stable/). | +| .github/workflows/ci_schema.yml | Validate the syntax of workflow files. 
| ## Others diff --git a/README.md b/README.md index d9c94bc9195f1..ce2c71db653b8 100644 --- a/README.md +++ b/README.md @@ -22,14 +22,14 @@ ______________________________________________________________________ [![PyPI Status](https://pepy.tech/badge/pytorch-lightning)](https://pepy.tech/project/pytorch-lightning) [![Conda](https://img.shields.io/conda/v/conda-forge/pytorch-lightning?label=conda&color=success)](https://anaconda.org/conda-forge/pytorch-lightning) [![DockerHub](https://img.shields.io/docker/pulls/pytorchlightning/pytorch_lightning.svg)](https://hub.docker.com/r/pytorchlightning/pytorch_lightning) -[![codecov](https://codecov.io/gh/PyTorchLightning/pytorch-lightning/branch/master/graph/badge.svg)](https://codecov.io/gh/PyTorchLightning/pytorch-lightning) +[![codecov](https://codecov.io/gh/Lightning-AI/lightning/branch/master/graph/badge.svg)](https://codecov.io/gh/Lightning-AI/lightning) -[![ReadTheDocs](https://readthedocs.org/projects/pytorch-lightning/badge/?version=stable)](https://pytorch-lightning.readthedocs.io/en/stable/starter/new-project.html) -[![Slack](https://img.shields.io/badge/slack-chat-green.svg?logo=slack)](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-12iz3cds1-uyyyBYJLiaL2bqVmMN7n~A) -[![license](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/PytorchLightning/pytorch-lightning/blob/master/LICENSE) +[![ReadTheDocs](https://readthedocs.org/projects/pytorch-lightning/badge/?version=stable)](https://pytorch-lightning.readthedocs.io/en/stable/) +[![Slack](https://img.shields.io/badge/slack-chat-green.svg?logo=slack)](https://www.pytorchlightning.ai/community) +[![license](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/Lightning-AI/lightning/blob/master/LICENSE) @@ -74,7 +74,7 @@ Lightning forces the following structure to your code which makes it reusable an Once you do this, you can train on multiple-GPUs, TPUs, CPUs and even in 16-bit precision without changing your code! -Get started with our [2 step guide](https://pytorch-lightning.readthedocs.io/en/latest/starter/new-project.html) +[Get started in just 15 minutes](https://pytorch-lightning.readthedocs.io/en/latest/starter/introduction.html) ______________________________________________________________________ @@ -87,14 +87,14 @@ Lightning is rigorously tested across multiple GPUs, TPUs CPUs and against major
-| System / PyTorch ver. | 1.8 (LTS, min. req.) | 1.9 | 1.10 (latest) | -| :------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| Linux py3.7 \[GPUs\*\*\] | [![Build Status]()](https://dev.azure.com/PytorchLightning/pytorch-lightning/_build/latest?definitionId=6&branchName=master) | - | - | -| Linux py3.7 \[TPUs\*\*\*\] | [![CircleCI](https://circleci.com/gh/PyTorchLightning/pytorch-lightning/tree/master.svg?style=svg)](https://circleci.com/gh/PyTorchLightning/pytorch-lightning/tree/master) | - | - | -| Linux py3.8 (with Conda | [![Test](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-conda.yml) | [![Test](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-conda.yml) | [![Test](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-conda.yml) | -| Linux py3.{7,9} | - | - | [![Test](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-full.yml) | -| OSX py3.{7,9} | - | - | [![Test](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-full.yml) | -| Windows py3.{7,9} | - | - | [![Test](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-full.yml) | +| System / PyTorch ver. | 1.8 (LTS, min. req.) 
| 1.9 | 1.10 (latest) | +| :------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Linux py3.7 \[GPUs\*\*\] | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=6&branchName=master) | - | - | +| Linux py3.7 \[TPUs\*\*\*\] | [![CircleCI](https://circleci.com/gh/Lightning-AI/lightning/tree/master.svg?style=svg)](https://circleci.com/gh/Lightning-AI/lightning/tree/master) | - | - | +| Linux py3.8 (with Conda | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | +| Linux py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) | +| OSX py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) | +| Windows py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) | - _\*\* tests run on two NVIDIA P100_ - _\*\*\* tests run on Google GKE TPUv2/3. 
TPU py3.7 means we support Colab and Kaggle env._ @@ -132,20 +132,19 @@ pip install pytorch-lightning['extra'] conda install pytorch-lightning -c conda-forge ``` -#### Install stable 1.5.x +#### Install stable 1.7.x -the actual status of 1.5 \[stable\] is following: +The actual status of 1.7 \[stable\] is the following: -![CI basic testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20basic%20testing/badge.svg?branch=release%2F1.5.x&event=push) -![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=release%2F1.5.x&event=push) -![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=release%2F1.5.x&event=push) -![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg?branch=release%2F1.5.x&event=push) -![Docs check](https://github.com/PyTorchLightning/pytorch-lightning/workflows/Docs%20check/badge.svg?branch=release%2F1.5.x&event=push) +[![Test PyTorch full](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-full.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-full.yml?query=branch%3Arelease%2Fpytorch) +[![Test PyTorch with Conda](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-conda.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-conda.yml?query=branch%3Arelease%2Fpytorch) +[![TPU tests](https://dl.circleci.com/status-badge/img/gh/Lightning-AI/lightning/tree/release%2Fpytorch.svg?style=shield)](https://dl.circleci.com/status-badge/redirect/gh/Lightning-AI/lightning/tree/release%2Fpytorch) +[![Check Docs](https://github.com/Lightning-AI/lightning/actions/workflows/docs-checks.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/docs-checks.yml?query=branch%3Arelease%2Fpytorch) Install future release from the source ```bash -pip install git+https://github.com/PytorchLightning/pytorch-lightning.git@release/1.5.x --upgrade +pip install https://github.com/Lightning-AI/lightning/archive/refs/heads/release/pytorch.zip -U ``` #### Install bleeding-edge - future 1.6 @@ -153,7 +152,7 @@ pip install git+https://github.com/PytorchLightning/pytorch-lightning.git@releas Install nightly from the source (no guarantees) ```bash -pip install https://github.com/PyTorchLightning/pytorch-lightning/archive/master.zip +pip install https://github.com/Lightning-AI/lightning/archive/refs/heads/master.zip -U ``` or from testing PyPI @@ -352,7 +351,7 @@ ______________________________________________________________________ - Make fewer mistakes because lightning handles the tricky engineering - Keeps all the flexibility (LightningModules are still PyTorch modules), but removes a ton of boilerplate - Lightning has dozens of integrations with popular machine learning tools. -- [Tested rigorously with every new PR](https://github.com/PyTorchLightning/pytorch-lightning/tree/master/tests). We test every combination of PyTorch and Python supported versions, every OS, multi GPUs and even TPUs. +- [Tested rigorously with every new PR](https://github.com/Lightning-AI/lightning/tree/master/tests). We test every combination of PyTorch and Python supported versions, every OS, multi GPUs and even TPUs. 
- Minimal running speed overhead (about 300 ms per epoch compared with pure PyTorch). ______________________________________________________________________ @@ -420,5 +419,5 @@ Lightning is also part of the [PyTorch ecosystem](https://pytorch.org/ecosystem/ If you have any questions please: 1. [Read the docs](https://pytorch-lightning.rtfd.io/en/latest). -1. [Search through existing Discussions](https://github.com/PyTorchLightning/pytorch-lightning/discussions), or [add a new question](https://github.com/PyTorchLightning/pytorch-lightning/discussions/new) -1. [Join our slack](https://join.slack.com/t/pytorch-lightning/shared_invite/zt-pw5v393p-qRaDgEk24~EjiZNBpSQFgQ). +1. [Search through existing Discussions](https://github.com/Lightning-AI/lightning/discussions), or [add a new question](https://github.com/Lightning-AI/lightning/discussions/new) +1. [Join our slack](https://www.pytorchlightning.ai/community). diff --git a/docs/source-pytorch/_templates/theme_variables.jinja b/docs/source-pytorch/_templates/theme_variables.jinja index 5073b7a97f9d0..332a9820f4fb4 100644 --- a/docs/source-pytorch/_templates/theme_variables.jinja +++ b/docs/source-pytorch/_templates/theme_variables.jinja @@ -1,7 +1,7 @@ {%- set external_urls = { - 'github': 'https://github.com/PyTorchLightning/pytorch-lightning', - 'github_issues': 'https://github.com/PyTorchLightning/pytorch-lightning/issues', - 'contributing': 'https://github.com/PyTorchLightning/pytorch-lightning/blob/master/CONTRIBUTING.md', + 'github': 'https://github.com/Lightning-AI/lightning', + 'github_issues': 'https://github.com/Lightning-AI/lightning/issues', + 'contributing': 'https://github.com/Lightning-AI/lightning/blob/master/.github/CONTRIBUTING.md', 'governance': 'https://pytorch-lightning.readthedocs.io/en/latest/governance.html', 'docs': 'https://pytorch-lightning.rtfd.io/en/latest', 'twitter': 'https://twitter.com/PyTorchLightnin', @@ -15,6 +15,6 @@ 'resources': 'https://pytorch-lightning.readthedocs.io/en/latest/#community-examples', 'support': 'https://pytorch-lightning.rtfd.io/en/latest/', 'community': 'https://www.pytorchlightning.ai/community', - 'forums': 'https://github.com/PyTorchLightning/pytorch-lightning/discussions', + 'forums': 'https://github.com/Lightning-AI/lightning/discussions', } -%} diff --git a/docs/source-pytorch/starter/installation.rst b/docs/source-pytorch/starter/installation.rst index ba28644006f74..c5f7760945fce 100644 --- a/docs/source-pytorch/starter/installation.rst +++ b/docs/source-pytorch/starter/installation.rst @@ -50,10 +50,10 @@ are not published yet. This is the bleeding edge, so use it at your own discreti .. code-block:: bash - pip install https://github.com/PyTorchLightning/pytorch-lightning/archive/master.zip + pip install https://github.com/Lightning-AI/lightning/archive/refs/heads/master.zip -U Install future patch releases from the source. Note that the patch release contains only the bug fixes for the recent major release. .. code-block:: bash - pip install https://github.com/PyTorchLightning/pytorch-lightning/archive/refs/heads/release/1.5.x.zip + pip install https://github.com/Lightning-AI/lightning/archive/refs/heads/release/pytorch.zip -U diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 7d0dcb589b9e3..95a7000818b78 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -12,7 +12,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Changed -- Update the Lightning App docs ([#13537](https://github.com/PyTorchLightning/pytorch-lightning/pull/13537)) +- Update the Lightning App docs ([#13537](https://github.com/Lightning-AI/lightning/pull/13537)) ### Changed diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index f37f03000c148..aa66df9b54a8b 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -9,106 +9,106 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added -- Added ``ServableModule`` and its associated callback called ``ServableModuleValidator`` to ensure the model can served ([#13614](https://github.com/PyTorchLightning/pytorch-lightning/pull/13614)) +- Added ``ServableModule`` and its associated callback called ``ServableModuleValidator`` to ensure the model can served ([#13614](https://github.com/Lightning-AI/lightning/pull/13614)) -- Converted validation loop config warnings to `PossibleUserWarning` ([#13377](https://github.com/PyTorchLightning/pytorch-lightning/pull/13377)) +- Converted validation loop config warnings to `PossibleUserWarning` ([#13377](https://github.com/Lightning-AI/lightning/pull/13377)) -- Added a flag named `log_rank_zero_only` to `EarlyStopping` to disable logging to non-zero rank processes ([#13233](https://github.com/PyTorchLightning/pytorch-lightning/pull/13233)) +- Added a flag named `log_rank_zero_only` to `EarlyStopping` to disable logging to non-zero rank processes ([#13233](https://github.com/Lightning-AI/lightning/pull/13233)) -- Added support for reloading the last checkpoint saved by passing `ckpt_path="last"` ([#12816](https://github.com/PyTorchLightning/pytorch-lightning/pull/12816)) +- Added support for reloading the last checkpoint saved by passing `ckpt_path="last"` ([#12816](https://github.com/Lightning-AI/lightning/pull/12816)) -- Added `LightningDataModule.load_from_checkpoint` to support loading datamodules directly from checkpoint ([#12550](https://github.com/PyTorchLightning/pytorch-lightning/pull/12550)) +- Added `LightningDataModule.load_from_checkpoint` to support loading datamodules directly from checkpoint ([#12550](https://github.com/Lightning-AI/lightning/pull/12550)) -- Added a friendly error message when attempting to call `Trainer.save_checkpoint()` without a model attached ([#12772](https://github.com/PyTorchLightning/pytorch-lightning/pull/12772)) +- Added a friendly error message when attempting to call `Trainer.save_checkpoint()` without a model attached ([#12772](https://github.com/Lightning-AI/lightning/pull/12772)) -- Added a friendly error message when attempting to use `DeepSpeedStrategy` on unsupported accelerators ([#12699](https://github.com/PyTorchLightning/pytorch-lightning/pull/12699)) +- Added a friendly error message when attempting to use `DeepSpeedStrategy` on unsupported accelerators ([#12699](https://github.com/Lightning-AI/lightning/pull/12699)) -- Enabled `torch.inference_mode` for evaluation and prediction ([#12715](https://github.com/PyTorchLightning/pytorch-lightning/pull/12715)) +- Enabled `torch.inference_mode` for evaluation and prediction ([#12715](https://github.com/Lightning-AI/lightning/pull/12715)) -- Added support for setting `val_check_interval` to a value higher than the amount of training batches when `check_val_every_n_epoch=None` ([#11993](https://github.com/PyTorchLightning/pytorch-lightning/pull/11993)) +- Added support for setting `val_check_interval` to a value higher than the amount of training 
batches when `check_val_every_n_epoch=None` ([#11993](https://github.com/Lightning-AI/lightning/pull/11993)) -- Include the `pytorch_lightning` version as a header in the CLI config files ([#12532](https://github.com/PyTorchLightning/pytorch-lightning/pull/12532)) +- Include the `pytorch_lightning` version as a header in the CLI config files ([#12532](https://github.com/Lightning-AI/lightning/pull/12532)) -- Added support for `Callback` registration through entry points ([#12739](https://github.com/PyTorchLightning/pytorch-lightning/pull/12739)) +- Added support for `Callback` registration through entry points ([#12739](https://github.com/Lightning-AI/lightning/pull/12739)) -- Added support for `Trainer(deterministic="warn")` to warn instead of fail when a non-deterministic operation is encountered ([#12588](https://github.com/PyTorchLightning/pytorch-lightning/pull/12588)) +- Added support for `Trainer(deterministic="warn")` to warn instead of fail when a non-deterministic operation is encountered ([#12588](https://github.com/Lightning-AI/lightning/pull/12588)) -- Added profiling to the loops' dataloader `__next__` calls ([#12124](https://github.com/PyTorchLightning/pytorch-lightning/pull/12124)) +- Added profiling to the loops' dataloader `__next__` calls ([#12124](https://github.com/Lightning-AI/lightning/pull/12124)) - Hivemind Strategy - * Added `CollaborativeStrategy` ([#12842](https://github.com/PyTorchLightning/pytorch-lightning/pull/12842)) - * Renamed `CollaborativeStrategy` to `HivemindStrategy` ([#13388](https://github.com/PyTorchLightning/pytorch-lightning/pull/13388)) - * Removed unnecessary endpoint logic, renamed `collaborative` to `hivemind` ([#13392](https://github.com/PyTorchLightning/pytorch-lightning/pull/13392)) + * Added `CollaborativeStrategy` ([#12842](https://github.com/Lightning-AI/lightning/pull/12842)) + * Renamed `CollaborativeStrategy` to `HivemindStrategy` ([#13388](https://github.com/Lightning-AI/lightning/pull/13388)) + * Removed unnecessary endpoint logic, renamed `collaborative` to `hivemind` ([#13392](https://github.com/Lightning-AI/lightning/pull/13392)) -- Include a version suffix for new "last" checkpoints of later runs in the same directory ([#12902](https://github.com/PyTorchLightning/pytorch-lightning/pull/12902)) +- Include a version suffix for new "last" checkpoints of later runs in the same directory ([#12902](https://github.com/Lightning-AI/lightning/pull/12902)) -- Show a better error message when a Metric that does not return a Tensor is logged ([#13164](https://github.com/PyTorchLightning/pytorch-lightning/pull/13164)) +- Show a better error message when a Metric that does not return a Tensor is logged ([#13164](https://github.com/Lightning-AI/lightning/pull/13164)) -- Added missing `predict_dataset` argument in `LightningDataModule.from_datasets` to create predict dataloaders ([#12942](https://github.com/PyTorchLightning/pytorch-lightning/pull/12942)) +- Added missing `predict_dataset` argument in `LightningDataModule.from_datasets` to create predict dataloaders ([#12942](https://github.com/Lightning-AI/lightning/pull/12942)) -- Added class name prefix to metrics logged by `DeviceStatsMonitor` ([#12228](https://github.com/PyTorchLightning/pytorch-lightning/pull/12228)) +- Added class name prefix to metrics logged by `DeviceStatsMonitor` ([#12228](https://github.com/Lightning-AI/lightning/pull/12228)) -- Automatically wrap custom samplers under a distributed environment by using `DistributedSamplerWrapper` 
([#12959](https://github.com/PyTorchLightning/pytorch-lightning/pull/12959)) +- Automatically wrap custom samplers under a distributed environment by using `DistributedSamplerWrapper` ([#12959](https://github.com/Lightning-AI/lightning/pull/12959)) -- Added profiling of `LightningDataModule` hooks ([#12971](https://github.com/PyTorchLightning/pytorch-lightning/pull/12971)) +- Added profiling of `LightningDataModule` hooks ([#12971](https://github.com/Lightning-AI/lightning/pull/12971)) -- Added Native FSDP Strategy ([#12447](https://github.com/PyTorchLightning/pytorch-lightning/pull/12447)) +- Added Native FSDP Strategy ([#12447](https://github.com/Lightning-AI/lightning/pull/12447)) -- Added breaking of lazy graph across training, validation, test and predict steps when training with habana accelerators to ensure better performance ([#12938](https://github.com/PyTorchLightning/pytorch-lightning/pull/12938)) +- Added breaking of lazy graph across training, validation, test and predict steps when training with habana accelerators to ensure better performance ([#12938](https://github.com/Lightning-AI/lightning/pull/12938)) -- Added `Checkpoint` class to inherit from ([#13024](https://github.com/PyTorchLightning/pytorch-lightning/pull/13024)) +- Added `Checkpoint` class to inherit from ([#13024](https://github.com/Lightning-AI/lightning/pull/13024)) -- Added CPU metric tracking to `DeviceStatsMonitor` ([#11795](https://github.com/PyTorchLightning/pytorch-lightning/pull/11795)) +- Added CPU metric tracking to `DeviceStatsMonitor` ([#11795](https://github.com/Lightning-AI/lightning/pull/11795)) -- Added `teardown()` method to `Accelerator` ([#11935](https://github.com/PyTorchLightning/pytorch-lightning/pull/11935)) +- Added `teardown()` method to `Accelerator` ([#11935](https://github.com/Lightning-AI/lightning/pull/11935)) -- Added support for using custom Trainers that don't include callbacks using the CLI ([#13138](https://github.com/PyTorchLightning/pytorch-lightning/pull/13138)) +- Added support for using custom Trainers that don't include callbacks using the CLI ([#13138](https://github.com/Lightning-AI/lightning/pull/13138)) -- Added a `timeout` argument to `DDPStrategy` and `DDPSpawnStrategy`. ([#13244](https://github.com/PyTorchLightning/pytorch-lightning/pull/13244), [#13383](https://github.com/Lightning-AI/lightning/pull/13383)) +- Added a `timeout` argument to `DDPStrategy` and `DDPSpawnStrategy`. 
([#13244](https://github.com/Lightning-AI/lightning/pull/13244), [#13383](https://github.com/Lightning-AI/lightning/pull/13383)) -- Added `XLAEnvironment` cluster environment plugin ([#11330](https://github.com/PyTorchLightning/pytorch-lightning/pull/11330)) +- Added `XLAEnvironment` cluster environment plugin ([#11330](https://github.com/Lightning-AI/lightning/pull/11330)) -- Added logging messages to notify when `FitLoop` stopping conditions are met ([#9749](https://github.com/PyTorchLightning/pytorch-lightning/pull/9749)) +- Added logging messages to notify when `FitLoop` stopping conditions are met ([#9749](https://github.com/Lightning-AI/lightning/pull/9749)) -- Added support for calling unknown methods with `DummyLogger` ([#13224](https://github.com/PyTorchLightning/pytorch-lightning/pull/13224) +- Added support for calling unknown methods with `DummyLogger` ([#13224](https://github.com/Lightning-AI/lightning/pull/13224) -- Added support for recursively setting the `Trainer` reference for ensembles of `LightningModule`s ([#13638](https://github.com/PyTorchLightning/pytorch-lightning/pull/13638) +- Added support for recursively setting the `Trainer` reference for ensembles of `LightningModule`s ([#13638](https://github.com/Lightning-AI/lightning/pull/13638) -- Added Apple Silicon Support via `MPSAccelerator` ([#13123](https://github.com/PyTorchLightning/pytorch-lightning/pull/13123)) +- Added Apple Silicon Support via `MPSAccelerator` ([#13123](https://github.com/Lightning-AI/lightning/pull/13123)) -- Added support for DDP Fork ([#13405](https://github.com/PyTorchLightning/pytorch-lightning/pull/13405)) +- Added support for DDP Fork ([#13405](https://github.com/Lightning-AI/lightning/pull/13405)) -- Added support for async checkpointing ([#13658](https://github.com/PyTorchLightning/pytorch-lightning/pull/13658)) +- Added support for async checkpointing ([#13658](https://github.com/Lightning-AI/lightning/pull/13658)) ### Changed @@ -116,46 +116,46 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- `accelerator="gpu"` now automatically selects an available GPU backend (CUDA and MPS currently) ([#13642](https://github.com/Lightning-AI/lightning/pull/13642)) -- Enable validation during overfitting ([#12527](https://github.com/PyTorchLightning/pytorch-lightning/pull/12527)) +- Enable validation during overfitting ([#12527](https://github.com/Lightning-AI/lightning/pull/12527)) -- Added dataclass support to `extract_batch_size` ([#12573](https://github.com/PyTorchLightning/pytorch-lightning/pull/12573)) +- Added dataclass support to `extract_batch_size` ([#12573](https://github.com/Lightning-AI/lightning/pull/12573)) -- Changed checkpoints save path in the case of one logger and user-provided weights_save_path from `weights_save_path/name/version/checkpoints` to `weights_save_path/checkpoints` ([#12372](https://github.com/PyTorchLightning/pytorch-lightning/pull/12372)) +- Changed checkpoints save path in the case of one logger and user-provided weights_save_path from `weights_save_path/name/version/checkpoints` to `weights_save_path/checkpoints` ([#12372](https://github.com/Lightning-AI/lightning/pull/12372)) -- Changed checkpoints save path in the case of multiple loggers and user-provided weights_save_path from `weights_save_path/name1_name2/version1_version2/checkpoints` to `weights_save_path/checkpoints` ([#12372](https://github.com/PyTorchLightning/pytorch-lightning/pull/12372)) +- Changed checkpoints save path in the case of multiple loggers and user-provided weights_save_path from `weights_save_path/name1_name2/version1_version2/checkpoints` to `weights_save_path/checkpoints` ([#12372](https://github.com/Lightning-AI/lightning/pull/12372)) -- Marked `swa_lrs` argument in `StochasticWeightAveraging` callback as required ([#12556](https://github.com/PyTorchLightning/pytorch-lightning/pull/12556)) +- Marked `swa_lrs` argument in `StochasticWeightAveraging` callback as required ([#12556](https://github.com/Lightning-AI/lightning/pull/12556)) -- `LightningCLI`'s shorthand notation changed to use jsonargparse native feature ([#12614](https://github.com/PyTorchLightning/pytorch-lightning/pull/12614)) +- `LightningCLI`'s shorthand notation changed to use jsonargparse native feature ([#12614](https://github.com/Lightning-AI/lightning/pull/12614)) -- `LightningCLI` changed to use jsonargparse native support for list append ([#13129](https://github.com/PyTorchLightning/pytorch-lightning/pull/13129)) +- `LightningCLI` changed to use jsonargparse native support for list append ([#13129](https://github.com/Lightning-AI/lightning/pull/13129)) -- Changed `seed_everything_default` argument in the `LightningCLI` to type `Union[bool, int]`. If set to `True` a seed is automatically generated for the parser argument `--seed_everything`. ([#12822](https://github.com/PyTorchLightning/pytorch-lightning/pull/12822), [#13110](https://github.com/PyTorchLightning/pytorch-lightning/pull/13110)) +- Changed `seed_everything_default` argument in the `LightningCLI` to type `Union[bool, int]`. If set to `True` a seed is automatically generated for the parser argument `--seed_everything`. ([#12822](https://github.com/Lightning-AI/lightning/pull/12822), [#13110](https://github.com/Lightning-AI/lightning/pull/13110)) -- Make positional arguments required for classes passed into the `add_argparse_args` function. ([#12504](https://github.com/PyTorchLightning/pytorch-lightning/pull/12504)) +- Make positional arguments required for classes passed into the `add_argparse_args` function. 
([#12504](https://github.com/Lightning-AI/lightning/pull/12504)) -- Raise an error if there are insufficient training batches when using a float value of `limit_train_batches` ([#12885](https://github.com/PyTorchLightning/pytorch-lightning/pull/12885)) +- Raise an error if there are insufficient training batches when using a float value of `limit_train_batches` ([#12885](https://github.com/Lightning-AI/lightning/pull/12885)) -- `DataLoader` instantiated inside a `*_dataloader` hook will not set the passed arguments as attributes anymore ([#12981](https://github.com/PyTorchLightning/pytorch-lightning/pull/12981)) +- `DataLoader` instantiated inside a `*_dataloader` hook will not set the passed arguments as attributes anymore ([#12981](https://github.com/Lightning-AI/lightning/pull/12981)) -- When a multi-element tensor is logged, an error is now raised instead of silently taking the mean of all elements ([#13164](https://github.com/PyTorchLightning/pytorch-lightning/pull/13164)) +- When a multi-element tensor is logged, an error is now raised instead of silently taking the mean of all elements ([#13164](https://github.com/Lightning-AI/lightning/pull/13164)) -- The `WandbLogger` will now use the run name in the logs folder if it is provided, and otherwise the project name ([#12604](https://github.com/PyTorchLightning/pytorch-lightning/pull/12604)) +- The `WandbLogger` will now use the run name in the logs folder if it is provided, and otherwise the project name ([#12604](https://github.com/Lightning-AI/lightning/pull/12604)) -- Enabled using any Sampler in distributed environment in Lite ([#13646](https://github.com/PyTorchLightning/pytorch-lightning/pull/13646)) +- Enabled using any Sampler in distributed environment in Lite ([#13646](https://github.com/Lightning-AI/lightning/pull/13646)) - Raised a warning instead of forcing `sync_dist=True` on epoch end ([13364](https://github.com/Lightning-AI/lightning/pull/13364)) @@ -164,10 +164,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Updated `val_check_interval`(int) to consider total train batches processed instead of `_batches_that_stepped` for validation check during training ([#12832](https://github.com/Lightning-AI/lightning/pull/12832) -- Updated Habana Accelerator's `auto_device_count`, `is_available` & `get_device_name` methods based on the latest torch habana package ([#13423](https://github.com/PyTorchLightning/pytorch-lightning/pull/13423)) +- Updated Habana Accelerator's `auto_device_count`, `is_available` & `get_device_name` methods based on the latest torch habana package ([#13423](https://github.com/Lightning-AI/lightning/pull/13423)) -- Disallowed using `BatchSampler` when running on multiple IPUs ([#13854](https://github.com/PyTorchLightning/pytorch-lightning/pull/13854)) +- Disallowed using `BatchSampler` when running on multiple IPUs ([#13854](https://github.com/Lightning-AI/lightning/pull/13854)) ### Deprecated @@ -175,159 +175,159 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Deprecated `pytorch_lightning.accelerators.gpu.GPUAccelerator` in favor of `pytorch_lightning.accelerators.cuda.CUDAAccelerator` ([#13636](https://github.com/Lightning-AI/lightning/pull/13636)) -- Deprecated `pytorch_lightning.loggers.base.LightningLoggerBase` in favor of `pytorch_lightning.loggers.logger.Logger`, and deprecated `pytorch_lightning.loggers.base` in favor of `pytorch_lightning.loggers.logger` ([#120148](https://github.com/PyTorchLightning/pytorch-lightning/pull/12014)) +- Deprecated `pytorch_lightning.loggers.base.LightningLoggerBase` in favor of `pytorch_lightning.loggers.logger.Logger`, and deprecated `pytorch_lightning.loggers.base` in favor of `pytorch_lightning.loggers.logger` ([#120148](https://github.com/Lightning-AI/lightning/pull/12014)) -- Deprecated `pytorch_lightning.callbacks.base.Callback` in favor of `pytorch_lightning.callbacks.callback.Callback` ([#13031](https://github.com/PyTorchLightning/pytorch-lightning/pull/13031)) +- Deprecated `pytorch_lightning.callbacks.base.Callback` in favor of `pytorch_lightning.callbacks.callback.Callback` ([#13031](https://github.com/Lightning-AI/lightning/pull/13031)) -- Deprecated `num_processes`, `gpus`, `tpu_cores,` and `ipus` from the `Trainer` constructor in favor of using the `accelerator` and `devices` arguments ([#11040](https://github.com/PyTorchLightning/pytorch-lightning/pull/11040)) +- Deprecated `num_processes`, `gpus`, `tpu_cores,` and `ipus` from the `Trainer` constructor in favor of using the `accelerator` and `devices` arguments ([#11040](https://github.com/Lightning-AI/lightning/pull/11040)) -- Deprecated setting `LightningCLI(seed_everything_default=None)` in favor of `False` ([#12804](https://github.com/PyTorchLightning/pytorch-lightning/issues/12804)). +- Deprecated setting `LightningCLI(seed_everything_default=None)` in favor of `False` ([#12804](https://github.com/Lightning-AI/lightning/issues/12804)). 
-- Deprecated `pytorch_lightning.core.lightning.LightningModule` in favor of `pytorch_lightning.core.module.LightningModule` ([#12740](https://github.com/PyTorchLightning/pytorch-lightning/pull/12740)) +- Deprecated `pytorch_lightning.core.lightning.LightningModule` in favor of `pytorch_lightning.core.module.LightningModule` ([#12740](https://github.com/Lightning-AI/lightning/pull/12740)) -- Deprecated `pytorch_lightning.loops.base.Loop` in favor of `pytorch_lightning.loops.loop.Loop` ([#13043](https://github.com/PyTorchLightning/pytorch-lightning/pull/13043)) +- Deprecated `pytorch_lightning.loops.base.Loop` in favor of `pytorch_lightning.loops.loop.Loop` ([#13043](https://github.com/Lightning-AI/lightning/pull/13043)) -- Deprecated `Trainer.reset_train_val_dataloaders()` in favor of `Trainer.reset_{train,val}_dataloader` ([#12184](https://github.com/PyTorchLightning/pytorch-lightning/pull/12184)) +- Deprecated `Trainer.reset_train_val_dataloaders()` in favor of `Trainer.reset_{train,val}_dataloader` ([#12184](https://github.com/Lightning-AI/lightning/pull/12184)) -- Deprecated LightningCLI's registries in favor of importing the respective package ([#13221](https://github.com/PyTorchLightning/pytorch-lightning/pull/13221)) +- Deprecated LightningCLI's registries in favor of importing the respective package ([#13221](https://github.com/Lightning-AI/lightning/pull/13221)) -- Deprecated public utilities in `pytorch_lightning.utilities.cli.LightningCLI` in favor of equivalent copies in `pytorch_lightning.cli.LightningCLI` ([#13767](https://github.com/PyTorchLightning/pytorch-lightning/pull/13767)) +- Deprecated public utilities in `pytorch_lightning.utilities.cli.LightningCLI` in favor of equivalent copies in `pytorch_lightning.cli.LightningCLI` ([#13767](https://github.com/Lightning-AI/lightning/pull/13767)) -- Deprecated `pytorch_lightning.profiler` in favor of `pytorch_lightning.profilers` ([#12308](https://github.com/PyTorchLightning/pytorch-lightning/pull/12308)) +- Deprecated `pytorch_lightning.profiler` in favor of `pytorch_lightning.profilers` ([#12308](https://github.com/Lightning-AI/lightning/pull/12308)) ### Removed -- Removed deprecated `IndexBatchSamplerWrapper.batch_indices` ([#13565](https://github.com/PyTorchLightning/pytorch-lightning/pull/13565)) +- Removed deprecated `IndexBatchSamplerWrapper.batch_indices` ([#13565](https://github.com/Lightning-AI/lightning/pull/13565)) -- Removed the deprecated `LightningModule.add_to_queue` and `LightningModule.get_from_queue` method ([#13600](https://github.com/PyTorchLightning/pytorch-lightning/pull/13600)) +- Removed the deprecated `LightningModule.add_to_queue` and `LightningModule.get_from_queue` method ([#13600](https://github.com/Lightning-AI/lightning/pull/13600)) - Removed deprecated `pytorch_lightning.core.decorators.parameter_validation` from `decorators` ([#13514](https://github.com/Lightning-AI/lightning/pull/13514)) -- Removed the deprecated `Logger.close` method ([#13149](https://github.com/PyTorchLightning/pytorch-lightning/pull/13149)) +- Removed the deprecated `Logger.close` method ([#13149](https://github.com/Lightning-AI/lightning/pull/13149)) -- Removed the deprecated `weights_summary` argument from the `Trainer` constructor ([#13070](https://github.com/PyTorchLightning/pytorch-lightning/pull/13070)) +- Removed the deprecated `weights_summary` argument from the `Trainer` constructor ([#13070](https://github.com/Lightning-AI/lightning/pull/13070)) -- Removed the deprecated `flush_logs_every_n_steps` argument from the 
`Trainer` constructor ([#13074](https://github.com/PyTorchLightning/pytorch-lightning/pull/13074)) +- Removed the deprecated `flush_logs_every_n_steps` argument from the `Trainer` constructor ([#13074](https://github.com/Lightning-AI/lightning/pull/13074)) -- Removed the deprecated `process_position` argument from the `Trainer` constructor ([13071](https://github.com/PyTorchLightning/pytorch-lightning/pull/13071)) +- Removed the deprecated `process_position` argument from the `Trainer` constructor ([13071](https://github.com/Lightning-AI/lightning/pull/13071)) -- Removed the deprecated `checkpoint_callback` argument from the `Trainer` constructor ([#13027](https://github.com/PyTorchLightning/pytorch-lightning/pull/13027)) +- Removed the deprecated `checkpoint_callback` argument from the `Trainer` constructor ([#13027](https://github.com/Lightning-AI/lightning/pull/13027)) -- Removed the deprecated `on_{train,val,test,predict}_dataloader` hooks from the `LightningModule` and `LightningDataModule` ([#13033](https://github.com/PyTorchLightning/pytorch-lightning/pull/13033)) +- Removed the deprecated `on_{train,val,test,predict}_dataloader` hooks from the `LightningModule` and `LightningDataModule` ([#13033](https://github.com/Lightning-AI/lightning/pull/13033)) -- Removed the deprecated `TestTubeLogger` ([#12859](https://github.com/PyTorchLightning/pytorch-lightning/pull/12859)) +- Removed the deprecated `TestTubeLogger` ([#12859](https://github.com/Lightning-AI/lightning/pull/12859)) -- Removed the deprecated `pytorch_lightning.core.memory.LayerSummary` and `pytorch_lightning.core.memory.ModelSummary` ([#12593](https://github.com/PyTorchLightning/pytorch-lightning/pull/12593)) +- Removed the deprecated `pytorch_lightning.core.memory.LayerSummary` and `pytorch_lightning.core.memory.ModelSummary` ([#12593](https://github.com/Lightning-AI/lightning/pull/12593)) -- Removed the deprecated `summarize` method from the `LightningModule` ([#12559](https://github.com/PyTorchLightning/pytorch-lightning/pull/12559)) +- Removed the deprecated `summarize` method from the `LightningModule` ([#12559](https://github.com/Lightning-AI/lightning/pull/12559)) -- Removed the deprecated `model_size` property from the `LightningModule` class ([#12641](https://github.com/PyTorchLightning/pytorch-lightning/pull/12641)) +- Removed the deprecated `model_size` property from the `LightningModule` class ([#12641](https://github.com/Lightning-AI/lightning/pull/12641)) -- Removed the deprecated `stochastic_weight_avg` argument from the `Trainer` constructor ([#12535](https://github.com/PyTorchLightning/pytorch-lightning/pull/12535)) +- Removed the deprecated `stochastic_weight_avg` argument from the `Trainer` constructor ([#12535](https://github.com/Lightning-AI/lightning/pull/12535)) -- Removed the deprecated `progress_bar_refresh_rate` argument from the `Trainer` constructor ([#12514](https://github.com/PyTorchLightning/pytorch-lightning/pull/12514)) +- Removed the deprecated `progress_bar_refresh_rate` argument from the `Trainer` constructor ([#12514](https://github.com/Lightning-AI/lightning/pull/12514)) -- Removed the deprecated `prepare_data_per_node` argument from the `Trainer` constructor ([#12536](https://github.com/PyTorchLightning/pytorch-lightning/pull/12536)) +- Removed the deprecated `prepare_data_per_node` argument from the `Trainer` constructor ([#12536](https://github.com/Lightning-AI/lightning/pull/12536)) -- Removed the deprecated `pytorch_lightning.core.memory.{get_gpu_memory_map,get_memory_profile}` 
([#12659](https://github.com/PyTorchLightning/pytorch-lightning/pull/12659)) +- Removed the deprecated `pytorch_lightning.core.memory.{get_gpu_memory_map,get_memory_profile}` ([#12659](https://github.com/Lightning-AI/lightning/pull/12659)) -- Removed the deprecated `terminate_on_nan` argument from the `Trainer` constructor ([#12553](https://github.com/PyTorchLightning/pytorch-lightning/pull/12553)) +- Removed the deprecated `terminate_on_nan` argument from the `Trainer` constructor ([#12553](https://github.com/Lightning-AI/lightning/pull/12553)) -- Removed the deprecated `XLAStatsMonitor` callback ([#12688](https://github.com/PyTorchLightning/pytorch-lightning/pull/12688)) +- Removed the deprecated `XLAStatsMonitor` callback ([#12688](https://github.com/Lightning-AI/lightning/pull/12688)) -- Remove deprecated `pytorch_lightning.callbacks.progress.progress` ([#12658](https://github.com/PyTorchLightning/pytorch-lightning/pull/12658)) +- Remove deprecated `pytorch_lightning.callbacks.progress.progress` ([#12658](https://github.com/Lightning-AI/lightning/pull/12658)) -- Removed the deprecated `dim` and `size` arguments from the `LightningDataModule` constructor([#12780](https://github.com/PyTorchLightning/pytorch-lightning/pull/12780)) +- Removed the deprecated `dim` and `size` arguments from the `LightningDataModule` constructor([#12780](https://github.com/Lightning-AI/lightning/pull/12780)) -- Removed the deprecated `train_transforms` argument from the `LightningDataModule` constructor([#12662](https://github.com/PyTorchLightning/pytorch-lightning/pull/12662)) +- Removed the deprecated `train_transforms` argument from the `LightningDataModule` constructor([#12662](https://github.com/Lightning-AI/lightning/pull/12662)) -- Removed the deprecated `log_gpu_memory` argument from the `Trainer` constructor ([#12657](https://github.com/PyTorchLightning/pytorch-lightning/pull/12657)) +- Removed the deprecated `log_gpu_memory` argument from the `Trainer` constructor ([#12657](https://github.com/Lightning-AI/lightning/pull/12657)) -- Removed the deprecated automatic logging of GPU stats by the logger connector ([#12657](https://github.com/PyTorchLightning/pytorch-lightning/pull/12657)) +- Removed the deprecated automatic logging of GPU stats by the logger connector ([#12657](https://github.com/Lightning-AI/lightning/pull/12657)) -- Removed deprecated `GPUStatsMonitor` callback ([#12554](https://github.com/PyTorchLightning/pytorch-lightning/pull/12554)) +- Removed deprecated `GPUStatsMonitor` callback ([#12554](https://github.com/Lightning-AI/lightning/pull/12554)) -- Removed support for passing strategy names or strategy instances to the accelerator Trainer argument ([#12696](https://github.com/PyTorchLightning/pytorch-lightning/pull/12696)) +- Removed support for passing strategy names or strategy instances to the accelerator Trainer argument ([#12696](https://github.com/Lightning-AI/lightning/pull/12696)) -- Removed support for passing strategy names or strategy instances to the plugins Trainer argument ([#12700](https://github.com/PyTorchLightning/pytorch-lightning/pull/12700)) +- Removed support for passing strategy names or strategy instances to the plugins Trainer argument ([#12700](https://github.com/Lightning-AI/lightning/pull/12700)) -- Removed the deprecated `val_transforms` argument from the `LightningDataModule` constructor ([#12763](https://github.com/PyTorchLightning/pytorch-lightning/pull/12763)) +- Removed the deprecated `val_transforms` argument from the `LightningDataModule` 
constructor ([#12763](https://github.com/Lightning-AI/lightning/pull/12763)) -- Removed the deprecated `test_transforms` argument from the `LightningDataModule` constructor ([#12773](https://github.com/PyTorchLightning/pytorch-lightning/pull/12773)) +- Removed the deprecated `test_transforms` argument from the `LightningDataModule` constructor ([#12773](https://github.com/Lightning-AI/lightning/pull/12773)) - Removed deprecated `Trainer(max_steps=None)` ([#13591](https://github.com/Lightning-AI/lightning/pull/13591)) -- Removed deprecated `dataloader_idx` argument from `on_train_batch_start/end` hooks `Callback` and `LightningModule` ([#12769](https://github.com/PyTorchLightning/pytorch-lightning/pull/12769), [#12977](https://github.com/PyTorchLightning/pytorch-lightning/pull/12977)) +- Removed deprecated `dataloader_idx` argument from `on_train_batch_start/end` hooks `Callback` and `LightningModule` ([#12769](https://github.com/Lightning-AI/lightning/pull/12769), [#12977](https://github.com/Lightning-AI/lightning/pull/12977)) -- Removed deprecated `get_progress_bar_dict` property from `LightningModule` ([#12839](https://github.com/PyTorchLightning/pytorch-lightning/pull/12839)) +- Removed deprecated `get_progress_bar_dict` property from `LightningModule` ([#12839](https://github.com/Lightning-AI/lightning/pull/12839)) -- Removed sanity check for multi-optimizer support with habana backends ([#13217](https://github.com/PyTorchLightning/pytorch-lightning/pull/13217)) +- Removed sanity check for multi-optimizer support with habana backends ([#13217](https://github.com/Lightning-AI/lightning/pull/13217)) -- Removed the need to explicitly load habana module ([#13338](https://github.com/PyTorchLightning/pytorch-lightning/pull/13338)) +- Removed the need to explicitly load habana module ([#13338](https://github.com/Lightning-AI/lightning/pull/13338)) -- Removed the deprecated `Strategy.post_dispatch()` hook ([#13461](https://github.com/PyTorchLightning/pytorch-lightning/pull/13461)) +- Removed the deprecated `Strategy.post_dispatch()` hook ([#13461](https://github.com/Lightning-AI/lightning/pull/13461)) - Removed deprecated `pytorch_lightning.callbacks.lr_monitor.LearningRateMonitor.lr_sch_names` ([#13353](https://github.com/Lightning-AI/lightning/pull/13353)) -- Removed deprecated `Trainer.slurm_job_id` in favor of `SLURMEnvironment.job_id` ([#13459](https://github.com/PyTorchLightning/pytorch-lightning/pull/13459)) +- Removed deprecated `Trainer.slurm_job_id` in favor of `SLURMEnvironment.job_id` ([#13459](https://github.com/Lightning-AI/lightning/pull/13459)) -- Removed support for the `DDP2Strategy` ([#12705](https://github.com/PyTorchLightning/pytorch-lightning/pull/12705)) +- Removed support for the `DDP2Strategy` ([#12705](https://github.com/Lightning-AI/lightning/pull/12705)) -- Removed deprecated `LightningDistributed` ([#13549](https://github.com/PyTorchLightning/pytorch-lightning/pull/13549)) +- Removed deprecated `LightningDistributed` ([#13549](https://github.com/Lightning-AI/lightning/pull/13549)) -- Removed deprecated ClusterEnvironment properties `master_address` and `master_port` in favor of `main_address` and `main_port` ([#13458](https://github.com/PyTorchLightning/pytorch-lightning/pull/13458)) +- Removed deprecated ClusterEnvironment properties `master_address` and `master_port` in favor of `main_address` and `main_port` ([#13458](https://github.com/Lightning-AI/lightning/pull/13458)) -- Removed deprecated ClusterEnvironment methods `KubeflowEnvironment.is_using_kubelfow()`, 
`LSFEnvironment.is_using_lsf()` and `TorchElasticEnvironment.is_using_torchelastic()` in favor of the `detect()` method ([#13458](https://github.com/PyTorchLightning/pytorch-lightning/pull/13458)) +- Removed deprecated ClusterEnvironment methods `KubeflowEnvironment.is_using_kubelfow()`, `LSFEnvironment.is_using_lsf()` and `TorchElasticEnvironment.is_using_torchelastic()` in favor of the `detect()` method ([#13458](https://github.com/Lightning-AI/lightning/pull/13458)) - Removed deprecated `Callback.on_keyboard_interrupt` ([#13438](https://github.com/Lightning-AI/lightning/pull/13438)) @@ -336,40 +336,40 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed deprecated `LightningModule.on_post_move_to_device` ([#13548](https://github.com/Lightning-AI/lightning/pull/13548)) -- Removed `TPUSpawnStrategy.{tpu_local_core_rank,tpu_global_core_rank}` attributes in favor of `TPUSpawnStrategy.{local_rank,global_rank}` ([#11163](https://github.com/PyTorchLightning/pytorch-lightning/pull/11163)) +- Removed `TPUSpawnStrategy.{tpu_local_core_rank,tpu_global_core_rank}` attributes in favor of `TPUSpawnStrategy.{local_rank,global_rank}` ([#11163](https://github.com/Lightning-AI/lightning/pull/11163)) -- Removed `SingleTPUStrategy.{tpu_local_core_rank,tpu_global_core_rank}` attributes in favor of `SingleTPUStrategy.{local_rank,global_rank}`([#11163](https://github.com/PyTorchLightning/pytorch-lightning/pull/11163)) +- Removed `SingleTPUStrategy.{tpu_local_core_rank,tpu_global_core_rank}` attributes in favor of `SingleTPUStrategy.{local_rank,global_rank}`([#11163](https://github.com/Lightning-AI/lightning/pull/11163)) ### Fixed -- Improved support for custom `DataLoader`s when instantiated in `*_dataloader` hook ([#12981](https://github.com/PyTorchLightning/pytorch-lightning/pull/12981)) +- Improved support for custom `DataLoader`s when instantiated in `*_dataloader` hook ([#12981](https://github.com/Lightning-AI/lightning/pull/12981)) -- Allowed custom `BatchSampler`s when instantiated in `*_dataloader` hook [#13640](https://github.com/PyTorchLightning/pytorch-lightning/pull/13640)) +- Allowed custom `BatchSampler`s when instantiated in `*_dataloader` hook [#13640](https://github.com/Lightning-AI/lightning/pull/13640)) -- Fixed an issue with unsupported torch.inference_mode() on hpu backends by making it use no_grad ([#13014](https://github.com/PyTorchLightning/pytorch-lightning/pull/13014)) +- Fixed an issue with unsupported torch.inference_mode() on hpu backends by making it use no_grad ([#13014](https://github.com/Lightning-AI/lightning/pull/13014)) -- The model wrapper returned by `LightningLite.setup()` now properly supports pass-through when looking up attributes ([#12597](https://github.com/PyTorchLightning/pytorch-lightning/pull/12597)) +- The model wrapper returned by `LightningLite.setup()` now properly supports pass-through when looking up attributes ([#12597](https://github.com/Lightning-AI/lightning/pull/12597)) -- Fixed issue where the CLI fails with certain torch objects ([#13153](https://github.com/PyTorchLightning/pytorch-lightning/pull/13153)) +- Fixed issue where the CLI fails with certain torch objects ([#13153](https://github.com/Lightning-AI/lightning/pull/13153)) -- Fixed ``LightningCLI`` signature parameter resolving for some lightning classes ([#13283](https://github.com/PyTorchLightning/pytorch-lightning/pull/13283)) +- Fixed ``LightningCLI`` signature parameter resolving for some lightning classes 
([#13283](https://github.com/Lightning-AI/lightning/pull/13283)) -- Fixed Model Summary when using DeepSpeed Stage 3 ([#13427](https://github.com/PyTorchLightning/pytorch-lightning/pull/13427)) +- Fixed Model Summary when using DeepSpeed Stage 3 ([#13427](https://github.com/Lightning-AI/lightning/pull/13427)) -- Fixed `pytorch_lightning.utilities.distributed.gather_all_tensors` to handle tensors of different dimensions ([#12630](https://github.com/PyTorchLightning/pytorch-lightning/pull/12630)) +- Fixed `pytorch_lightning.utilities.distributed.gather_all_tensors` to handle tensors of different dimensions ([#12630](https://github.com/Lightning-AI/lightning/pull/12630)) -- Fixed the input validation for the accelerator Trainer argument when passed as a string ([#13417](https://github.com/PyTorchLightning/pytorch-lightning/pull/13417)) +- Fixed the input validation for the accelerator Trainer argument when passed as a string ([#13417](https://github.com/Lightning-AI/lightning/pull/13417)) - Fixed `Trainer.predict(return_predictions=False)` to track prediction's batch_indices ([#13629](https://github.com/Lightning-AI/lightning/pull/13629)) @@ -401,2594 +401,2594 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed -- Fixed `estimated_stepping_batches` requiring distributed comms in `configure_optimizers` for the `DeepSpeedStrategy` ([#13350](https://github.com/PyTorchLightning/pytorch-lightning/pull/13350)) -- Fixed bug with Python version check that prevented use with development versions of Python ([#13420](https://github.com/PyTorchLightning/pytorch-lightning/pull/13420)) -- The loops now call `.set_epoch()` also on batch samplers if the dataloader has one wrapped in a distributed sampler ([#13396](https://github.com/PyTorchLightning/pytorch-lightning/pull/13396)) -- Fixed the restoration of log step during restart ([#13467](https://github.com/PyTorchLightning/pytorch-lightning/pull/13467)) +- Fixed `estimated_stepping_batches` requiring distributed comms in `configure_optimizers` for the `DeepSpeedStrategy` ([#13350](https://github.com/Lightning-AI/lightning/pull/13350)) +- Fixed bug with Python version check that prevented use with development versions of Python ([#13420](https://github.com/Lightning-AI/lightning/pull/13420)) +- The loops now call `.set_epoch()` also on batch samplers if the dataloader has one wrapped in a distributed sampler ([#13396](https://github.com/Lightning-AI/lightning/pull/13396)) +- Fixed the restoration of log step during restart ([#13467](https://github.com/Lightning-AI/lightning/pull/13467)) ## [1.6.4] - 2022-06-01 ### Added -- Added all DDP params to be exposed through hpu parallel strategy ([#13067](https://github.com/PyTorchLightning/pytorch-lightning/pull/13067)) +- Added all DDP params to be exposed through hpu parallel strategy ([#13067](https://github.com/Lightning-AI/lightning/pull/13067)) ### Changed -- Keep `torch.backends.cudnn.benchmark=False` by default (unlike in v1.6.{0-3}) after speed and memory problems depending on the data used. Please consider tuning `Trainer(benchmark)` manually. ([#13154](https://github.com/PyTorchLightning/pytorch-lightning/pull/13154)) -- Prevent modification of `torch.backends.cudnn.benchmark` when `Trainer(benchmark=...)` is not set ([#13154](https://github.com/PyTorchLightning/pytorch-lightning/pull/13154)) +- Keep `torch.backends.cudnn.benchmark=False` by default (unlike in v1.6.{0-3}) after speed and memory problems depending on the data used. 
Please consider tuning `Trainer(benchmark)` manually. ([#13154](https://github.com/Lightning-AI/lightning/pull/13154)) +- Prevent modification of `torch.backends.cudnn.benchmark` when `Trainer(benchmark=...)` is not set ([#13154](https://github.com/Lightning-AI/lightning/pull/13154)) ### Fixed -- Fixed an issue causing zero-division error for empty dataloaders ([#12885](https://github.com/PyTorchLightning/pytorch-lightning/pull/12885)) -- Fixed mismatching default values for the types of some arguments in the DeepSpeed and Fully-Sharded strategies which made the CLI unable to use them ([#12989](https://github.com/PyTorchLightning/pytorch-lightning/pull/12989)) -- Avoid redundant callback restore warning while tuning ([#13026](https://github.com/PyTorchLightning/pytorch-lightning/pull/13026)) -- Fixed `Trainer(precision=64)` during evaluation which now uses the wrapped precision module ([#12983](https://github.com/PyTorchLightning/pytorch-lightning/pull/12983)) -- Fixed an issue to use wrapped `LightningModule` for evaluation during `trainer.fit` for `BaguaStrategy` ([#12983](https://github.com/PyTorchLightning/pytorch-lightning/pull/12983)) -- Fixed an issue wrt unnecessary usage of habana mixed precision package for fp32 types ([#13028](https://github.com/PyTorchLightning/pytorch-lightning/pull/13028)) -- Fixed the number of references of `LightningModule` so it can be deleted ([#12897](https://github.com/PyTorchLightning/pytorch-lightning/pull/12897)) -- Fixed `materialize_module` setting a module's child recursively ([#12870](https://github.com/PyTorchLightning/pytorch-lightning/pull/12870)) -- Fixed issue where the CLI could not pass a `Profiler` to the `Trainer` ([#13084](https://github.com/PyTorchLightning/pytorch-lightning/pull/13084)) -- Fixed torchelastic detection with non-distributed installations ([#13142](https://github.com/PyTorchLightning/pytorch-lightning/pull/13142)) -- Fixed logging's step values when multiple dataloaders are used during evaluation ([#12184](https://github.com/PyTorchLightning/pytorch-lightning/pull/12184)) -- Fixed epoch logging on train epoch end ([#13025](https://github.com/PyTorchLightning/pytorch-lightning/pull/13025)) -- Fixed `DDPStrategy` and `DDPSpawnStrategy` to initialize optimizers only after moving the module to the device ([#11952](https://github.com/PyTorchLightning/pytorch-lightning/pull/11952)) +- Fixed an issue causing zero-division error for empty dataloaders ([#12885](https://github.com/Lightning-AI/lightning/pull/12885)) +- Fixed mismatching default values for the types of some arguments in the DeepSpeed and Fully-Sharded strategies which made the CLI unable to use them ([#12989](https://github.com/Lightning-AI/lightning/pull/12989)) +- Avoid redundant callback restore warning while tuning ([#13026](https://github.com/Lightning-AI/lightning/pull/13026)) +- Fixed `Trainer(precision=64)` during evaluation which now uses the wrapped precision module ([#12983](https://github.com/Lightning-AI/lightning/pull/12983)) +- Fixed an issue to use wrapped `LightningModule` for evaluation during `trainer.fit` for `BaguaStrategy` ([#12983](https://github.com/Lightning-AI/lightning/pull/12983)) +- Fixed an issue wrt unnecessary usage of habana mixed precision package for fp32 types ([#13028](https://github.com/Lightning-AI/lightning/pull/13028)) +- Fixed the number of references of `LightningModule` so it can be deleted ([#12897](https://github.com/Lightning-AI/lightning/pull/12897)) +- Fixed `materialize_module` setting a module's child recursively 
([#12870](https://github.com/Lightning-AI/lightning/pull/12870)) +- Fixed issue where the CLI could not pass a `Profiler` to the `Trainer` ([#13084](https://github.com/Lightning-AI/lightning/pull/13084)) +- Fixed torchelastic detection with non-distributed installations ([#13142](https://github.com/Lightning-AI/lightning/pull/13142)) +- Fixed logging's step values when multiple dataloaders are used during evaluation ([#12184](https://github.com/Lightning-AI/lightning/pull/12184)) +- Fixed epoch logging on train epoch end ([#13025](https://github.com/Lightning-AI/lightning/pull/13025)) +- Fixed `DDPStrategy` and `DDPSpawnStrategy` to initialize optimizers only after moving the module to the device ([#11952](https://github.com/Lightning-AI/lightning/pull/11952)) ## [1.6.3] - 2022-05-03 ### Fixed -- Use only a single instance of `rich.console.Console` throughout codebase ([#12886](https://github.com/PyTorchLightning/pytorch-lightning/pull/12886)) -- Fixed an issue to ensure all the checkpoint states are saved in a common filepath with `DeepspeedStrategy` ([#12887](https://github.com/PyTorchLightning/pytorch-lightning/pull/12887)) -- Fixed `trainer.logger` deprecation message ([#12671](https://github.com/PyTorchLightning/pytorch-lightning/pull/12671)) -- Fixed an issue where sharded grad scaler is passed in when using BF16 with the `ShardedStrategy` ([#12915](https://github.com/PyTorchLightning/pytorch-lightning/pull/12915)) -- Fixed an issue wrt recursive invocation of DDP configuration in hpu parallel plugin ([#12912](https://github.com/PyTorchLightning/pytorch-lightning/pull/12912)) -- Fixed printing of ragged dictionaries in `Trainer.validate` and `Trainer.test` ([#12857](https://github.com/PyTorchLightning/pytorch-lightning/pull/12857)) -- Fixed threading support for legacy loading of checkpoints ([#12814](https://github.com/PyTorchLightning/pytorch-lightning/pull/12814)) -- Fixed pickling of `KFoldLoop` ([#12441](https://github.com/PyTorchLightning/pytorch-lightning/pull/12441)) -- Stopped `optimizer_zero_grad` from being called after IPU execution ([#12913](https://github.com/PyTorchLightning/pytorch-lightning/pull/12913)) -- Fixed `fuse_modules` to be qat-aware for `torch>=1.11` ([#12891](https://github.com/PyTorchLightning/pytorch-lightning/pull/12891)) -- Enforced eval shuffle warning only for default samplers in DataLoader ([#12653](https://github.com/PyTorchLightning/pytorch-lightning/pull/12653)) -- Enable mixed precision in `DDPFullyShardedStrategy` when `precision=16` ([#12965](https://github.com/PyTorchLightning/pytorch-lightning/pull/12965)) -- Fixed `TQDMProgressBar` reset and update to show correct time estimation ([#12889](https://github.com/PyTorchLightning/pytorch-lightning/pull/12889)) -- Fixed fit loop restart logic to enable resume using the checkpoint ([#12821](https://github.com/PyTorchLightning/pytorch-lightning/pull/12821)) +- Use only a single instance of `rich.console.Console` throughout codebase ([#12886](https://github.com/Lightning-AI/lightning/pull/12886)) +- Fixed an issue to ensure all the checkpoint states are saved in a common filepath with `DeepspeedStrategy` ([#12887](https://github.com/Lightning-AI/lightning/pull/12887)) +- Fixed `trainer.logger` deprecation message ([#12671](https://github.com/Lightning-AI/lightning/pull/12671)) +- Fixed an issue where sharded grad scaler is passed in when using BF16 with the `ShardedStrategy` ([#12915](https://github.com/Lightning-AI/lightning/pull/12915)) +- Fixed an issue wrt recursive invocation of DDP 
configuration in hpu parallel plugin ([#12912](https://github.com/Lightning-AI/lightning/pull/12912)) +- Fixed printing of ragged dictionaries in `Trainer.validate` and `Trainer.test` ([#12857](https://github.com/Lightning-AI/lightning/pull/12857)) +- Fixed threading support for legacy loading of checkpoints ([#12814](https://github.com/Lightning-AI/lightning/pull/12814)) +- Fixed pickling of `KFoldLoop` ([#12441](https://github.com/Lightning-AI/lightning/pull/12441)) +- Stopped `optimizer_zero_grad` from being called after IPU execution ([#12913](https://github.com/Lightning-AI/lightning/pull/12913)) +- Fixed `fuse_modules` to be qat-aware for `torch>=1.11` ([#12891](https://github.com/Lightning-AI/lightning/pull/12891)) +- Enforced eval shuffle warning only for default samplers in DataLoader ([#12653](https://github.com/Lightning-AI/lightning/pull/12653)) +- Enable mixed precision in `DDPFullyShardedStrategy` when `precision=16` ([#12965](https://github.com/Lightning-AI/lightning/pull/12965)) +- Fixed `TQDMProgressBar` reset and update to show correct time estimation ([#12889](https://github.com/Lightning-AI/lightning/pull/12889)) +- Fixed fit loop restart logic to enable resume using the checkpoint ([#12821](https://github.com/Lightning-AI/lightning/pull/12821)) ## [1.6.2] - 2022-04-27 ### Fixed -- Fixed `ImportError` when `torch.distributed` is not available. ([#12794](https://github.com/PyTorchLightning/pytorch-lightning/pull/12794)) -- When using custom DataLoaders in LightningDataModule, multiple inheritance is resolved properly ([#12716](https://github.com/PyTorchLightning/pytorch-lightning/pull/12716)) -- Fixed encoding issues on terminals that do not support unicode characters ([#12828](https://github.com/PyTorchLightning/pytorch-lightning/pull/12828)) -- Fixed support for `ModelCheckpoint` monitors with dots ([#12783](https://github.com/PyTorchLightning/pytorch-lightning/pull/12783)) +- Fixed `ImportError` when `torch.distributed` is not available. 
([#12794](https://github.com/Lightning-AI/lightning/pull/12794)) +- When using custom DataLoaders in LightningDataModule, multiple inheritance is resolved properly ([#12716](https://github.com/Lightning-AI/lightning/pull/12716)) +- Fixed encoding issues on terminals that do not support unicode characters ([#12828](https://github.com/Lightning-AI/lightning/pull/12828)) +- Fixed support for `ModelCheckpoint` monitors with dots ([#12783](https://github.com/Lightning-AI/lightning/pull/12783)) ## [1.6.1] - 2022-04-13 ### Changed -- Support `strategy` argument being case insensitive ([#12528](https://github.com/PyTorchLightning/pytorch-lightning/pull/12528)) +- Support `strategy` argument being case insensitive ([#12528](https://github.com/Lightning-AI/lightning/pull/12528)) ### Fixed -- Run main progress bar updates independent of val progress bar updates in `TQDMProgressBar` ([#12563](https://github.com/PyTorchLightning/pytorch-lightning/pull/12563)) -- Avoid calling `average_parameters` multiple times per optimizer step ([#12452](https://github.com/PyTorchLightning/pytorch-lightning/pull/12452)) -- Properly pass some Logger's parent's arguments to `super().__init__()` ([#12609](https://github.com/PyTorchLightning/pytorch-lightning/pull/12609)) -- Fixed an issue where incorrect type warnings appear when the overridden `LightningLite.run` method accepts user-defined arguments ([#12629](https://github.com/PyTorchLightning/pytorch-lightning/pull/12629)) -- Fixed `rank_zero_only` decorator in LSF environments ([#12587](https://github.com/PyTorchLightning/pytorch-lightning/pull/12587)) -- Don't raise a warning when `nn.Module` is not saved under hparams ([#12669](https://github.com/PyTorchLightning/pytorch-lightning/pull/12669)) -- Raise `MisconfigurationException` when the accelerator is available but the user passes invalid `([]/0/"0")` values to the `devices` flag ([#12708](https://github.com/PyTorchLightning/pytorch-lightning/pull/12708)) -- Support `auto_select_gpus` with the accelerator and devices API ([#12608](https://github.com/PyTorchLightning/pytorch-lightning/pull/12608)) +- Run main progress bar updates independent of val progress bar updates in `TQDMProgressBar` ([#12563](https://github.com/Lightning-AI/lightning/pull/12563)) +- Avoid calling `average_parameters` multiple times per optimizer step ([#12452](https://github.com/Lightning-AI/lightning/pull/12452)) +- Properly pass some Logger's parent's arguments to `super().__init__()` ([#12609](https://github.com/Lightning-AI/lightning/pull/12609)) +- Fixed an issue where incorrect type warnings appear when the overridden `LightningLite.run` method accepts user-defined arguments ([#12629](https://github.com/Lightning-AI/lightning/pull/12629)) +- Fixed `rank_zero_only` decorator in LSF environments ([#12587](https://github.com/Lightning-AI/lightning/pull/12587)) +- Don't raise a warning when `nn.Module` is not saved under hparams ([#12669](https://github.com/Lightning-AI/lightning/pull/12669)) +- Raise `MisconfigurationException` when the accelerator is available but the user passes invalid `([]/0/"0")` values to the `devices` flag ([#12708](https://github.com/Lightning-AI/lightning/pull/12708)) +- Support `auto_select_gpus` with the accelerator and devices API ([#12608](https://github.com/Lightning-AI/lightning/pull/12608)) ## [1.6.0] - 2022-03-29 ### Added -- Allow logging to an existing run ID in MLflow with `MLFlowLogger` ([#12290](https://github.com/PyTorchLightning/pytorch-lightning/pull/12290)) -- Enable gradient accumulation 
using Horovod's `backward_passes_per_step` ([#11911](https://github.com/PyTorchLightning/pytorch-lightning/pull/11911)) -- Add new `DETAIL` log level to provide useful logs for improving monitoring and debugging of batch jobs ([#11008](https://github.com/PyTorchLightning/pytorch-lightning/pull/11008)) -- Added a flag `SLURMEnvironment(auto_requeue=True|False)` to control whether Lightning handles the requeuing ([#10601](https://github.com/PyTorchLightning/pytorch-lightning/pull/10601)) +- Allow logging to an existing run ID in MLflow with `MLFlowLogger` ([#12290](https://github.com/Lightning-AI/lightning/pull/12290)) +- Enable gradient accumulation using Horovod's `backward_passes_per_step` ([#11911](https://github.com/Lightning-AI/lightning/pull/11911)) +- Add new `DETAIL` log level to provide useful logs for improving monitoring and debugging of batch jobs ([#11008](https://github.com/Lightning-AI/lightning/pull/11008)) +- Added a flag `SLURMEnvironment(auto_requeue=True|False)` to control whether Lightning handles the requeuing ([#10601](https://github.com/Lightning-AI/lightning/pull/10601)) - Fault Tolerant Manual - * Add `_Stateful` protocol to detect if classes are stateful ([#10646](https://github.com/PyTorchLightning/pytorch-lightning/pull/10646)) - * Add `_FaultTolerantMode` enum used to track different supported fault tolerant modes ([#10645](https://github.com/PyTorchLightning/pytorch-lightning/pull/10645)) - * Add a `_rotate_worker_indices` utility to reload the state according the latest worker ([#10647](https://github.com/PyTorchLightning/pytorch-lightning/pull/10647)) - * Add stateful workers ([#10674](https://github.com/PyTorchLightning/pytorch-lightning/pull/10674)) - * Add an utility to collect the states across processes ([#10639](https://github.com/PyTorchLightning/pytorch-lightning/pull/10639)) - * Add logic to reload the states across data loading components ([#10699](https://github.com/PyTorchLightning/pytorch-lightning/pull/10699)) - * Cleanup some fault tolerant utilities ([#10703](https://github.com/PyTorchLightning/pytorch-lightning/pull/10703)) - * Enable Fault Tolerant Manual Training ([#10707](https://github.com/PyTorchLightning/pytorch-lightning/pull/10707)) - * Broadcast the `_terminate_gracefully` to all processes and add support for DDP ([#10638](https://github.com/PyTorchLightning/pytorch-lightning/pull/10638)) -- Added support for re-instantiation of custom (subclasses of) `DataLoaders` returned in the `*_dataloader()` methods, i.e., automatic replacement of samplers now works with custom types of `DataLoader` ([#10680](https://github.com/PyTorchLightning/pytorch-lightning/pull/10680)) -- Added a function to validate if fault tolerant training is supported. 
([#10465](https://github.com/PyTorchLightning/pytorch-lightning/pull/10465)) -- Added a private callback to manage the creation and deletion of fault-tolerance checkpoints ([#11862](https://github.com/PyTorchLightning/pytorch-lightning/pull/11862)) -- Show a better error message when a custom `DataLoader` implementation is not well implemented and we need to reconstruct it ([#10719](https://github.com/PyTorchLightning/pytorch-lightning/pull/10719)) -- Show a better error message when frozen dataclass is used as a batch ([#10927](https://github.com/PyTorchLightning/pytorch-lightning/pull/10927)) -- Save the `Loop`'s state by default in the checkpoint ([#10784](https://github.com/PyTorchLightning/pytorch-lightning/pull/10784)) -- Added `Loop.replace` to easily switch one loop for another ([#10324](https://github.com/PyTorchLightning/pytorch-lightning/pull/10324)) -- Added support for `--lr_scheduler=ReduceLROnPlateau` to the `LightningCLI` ([#10860](https://github.com/PyTorchLightning/pytorch-lightning/pull/10860)) -- Added `LightningCLI.configure_optimizers` to override the `configure_optimizers` return value ([#10860](https://github.com/PyTorchLightning/pytorch-lightning/pull/10860)) -- Added `LightningCLI(auto_registry)` flag to register all subclasses of the registerable components automatically ([#12108](https://github.com/PyTorchLightning/pytorch-lightning/pull/12108)) -- Added a warning that shows when `max_epochs` in the `Trainer` is not set ([#10700](https://github.com/PyTorchLightning/pytorch-lightning/pull/10700)) -- Added support for returning a single Callback from `LightningModule.configure_callbacks` without wrapping it into a list ([#11060](https://github.com/PyTorchLightning/pytorch-lightning/pull/11060)) -- Added `console_kwargs` for `RichProgressBar` to initialize inner Console ([#10875](https://github.com/PyTorchLightning/pytorch-lightning/pull/10875)) -- Added support for shorthand notation to instantiate loggers with the `LightningCLI` ([#11533](https://github.com/PyTorchLightning/pytorch-lightning/pull/11533)) -- Added a `LOGGER_REGISTRY` instance to register custom loggers to the `LightningCLI` ([#11533](https://github.com/PyTorchLightning/pytorch-lightning/pull/11533)) -- Added info message when the `Trainer` arguments `limit_*_batches`, `overfit_batches`, or `val_check_interval` are set to `1` or `1.0` ([#11950](https://github.com/PyTorchLightning/pytorch-lightning/pull/11950)) -- Added a `PrecisionPlugin.teardown` method ([#10990](https://github.com/PyTorchLightning/pytorch-lightning/pull/10990)) -- Added `LightningModule.lr_scheduler_step` ([#10249](https://github.com/PyTorchLightning/pytorch-lightning/pull/10249)) -- Added support for no pre-fetching to `DataFetcher` ([#11606](https://github.com/PyTorchLightning/pytorch-lightning/pull/11606)) -- Added support for optimizer step progress tracking with manual optimization ([#11848](https://github.com/PyTorchLightning/pytorch-lightning/pull/11848)) -- Return the output of the `optimizer.step`. 
This can be useful for `LightningLite` users, manual optimization users, or users overriding `LightningModule.optimizer_step` ([#11711](https://github.com/PyTorchLightning/pytorch-lightning/pull/11711)) -- Teardown the active loop and strategy on exception ([#11620](https://github.com/PyTorchLightning/pytorch-lightning/pull/11620)) -- Added a `MisconfigurationException` if user provided `opt_idx` in scheduler config doesn't match with actual optimizer index of its respective optimizer ([#11247](https://github.com/PyTorchLightning/pytorch-lightning/pull/11247)) -- Added a `loggers` property to `Trainer` which returns a list of loggers provided by the user ([#11683](https://github.com/PyTorchLightning/pytorch-lightning/pull/11683)) -- Added a `loggers` property to `LightningModule` which retrieves the `loggers` property from `Trainer` ([#11683](https://github.com/PyTorchLightning/pytorch-lightning/pull/11683)) -- Added support for DDP when using a `CombinedLoader` for the training data ([#11648](https://github.com/PyTorchLightning/pytorch-lightning/pull/11648)) -- Added a warning when using `DistributedSampler` during validation/testing ([#11479](https://github.com/PyTorchLightning/pytorch-lightning/pull/11479)) -- Added support for `Bagua` training strategy ([#11146](https://github.com/PyTorchLightning/pytorch-lightning/pull/11146)) -- Added support for manually returning a `poptorch.DataLoader` in a `*_dataloader` hook ([#12116](https://github.com/PyTorchLightning/pytorch-lightning/pull/12116)) -- Added `rank_zero` module to centralize utilities ([#11747](https://github.com/PyTorchLightning/pytorch-lightning/pull/11747)) -- Added a `_Stateful` support for `LightningDataModule` ([#11637](https://github.com/PyTorchLightning/pytorch-lightning/pull/11637)) -- Added `_Stateful` support for `PrecisionPlugin` ([#11638](https://github.com/PyTorchLightning/pytorch-lightning/pull/11638)) -- Added `Accelerator.is_available` to check device availability ([#11797](https://github.com/PyTorchLightning/pytorch-lightning/pull/11797)) -- Enabled static type-checking on the signature of `Trainer` ([#11888](https://github.com/PyTorchLightning/pytorch-lightning/pull/11888)) -- Added utility functions for moving optimizers to devices ([#11758](https://github.com/PyTorchLightning/pytorch-lightning/pull/11758)) -- Added a warning when saving an instance of `nn.Module` with `save_hyperparameters()` ([#12068](https://github.com/PyTorchLightning/pytorch-lightning/pull/12068)) -- Added `estimated_stepping_batches` property to `Trainer` ([#11599](https://github.com/PyTorchLightning/pytorch-lightning/pull/11599)) -- Added support for pluggable Accelerators ([#12030](https://github.com/PyTorchLightning/pytorch-lightning/pull/12030)) -- Added profiling for `on_load_checkpoint`/`on_save_checkpoint` callback and LightningModule hooks ([#12149](https://github.com/PyTorchLightning/pytorch-lightning/pull/12149)) -- Added `LayerSync` and `NativeSyncBatchNorm` plugins ([#11754](https://github.com/PyTorchLightning/pytorch-lightning/pull/11754)) -- Added optional `storage_options` argument to `Trainer.save_checkpoint()` to pass to custom `CheckpointIO` implementations ([#11891](https://github.com/PyTorchLightning/pytorch-lightning/pull/11891)) -- Added support to explicitly specify the process group backend for parallel strategies ([#11745](https://github.com/PyTorchLightning/pytorch-lightning/pull/11745)) -- Added `device_ids` and `num_devices` property to `Trainer` 
([#12151](https://github.com/PyTorchLightning/pytorch-lightning/pull/12151))
-- Added `Callback.state_dict()` and `Callback.load_state_dict()` methods ([#12232](https://github.com/PyTorchLightning/pytorch-lightning/pull/12232))
-- Added `AcceleratorRegistry` ([#12180](https://github.com/PyTorchLightning/pytorch-lightning/pull/12180))
-- Added support for Habana Accelerator (HPU) ([#11808](https://github.com/PyTorchLightning/pytorch-lightning/pull/11808))
-- Added support for dataclasses in `apply_to_collections` ([#11889](https://github.com/PyTorchLightning/pytorch-lightning/pull/11889))
+ * Add `_Stateful` protocol to detect if classes are stateful ([#10646](https://github.com/Lightning-AI/lightning/pull/10646))
+ * Add `_FaultTolerantMode` enum used to track different supported fault tolerant modes ([#10645](https://github.com/Lightning-AI/lightning/pull/10645))
+ * Add a `_rotate_worker_indices` utility to reload the state according to the latest worker ([#10647](https://github.com/Lightning-AI/lightning/pull/10647))
+ * Add stateful workers ([#10674](https://github.com/Lightning-AI/lightning/pull/10674))
+ * Add a utility to collect the states across processes ([#10639](https://github.com/Lightning-AI/lightning/pull/10639))
+ * Add logic to reload the states across data loading components ([#10699](https://github.com/Lightning-AI/lightning/pull/10699))
+ * Cleanup some fault tolerant utilities ([#10703](https://github.com/Lightning-AI/lightning/pull/10703))
+ * Enable Fault Tolerant Manual Training ([#10707](https://github.com/Lightning-AI/lightning/pull/10707))
+ * Broadcast the `_terminate_gracefully` to all processes and add support for DDP ([#10638](https://github.com/Lightning-AI/lightning/pull/10638))
+- Added support for re-instantiation of custom (subclasses of) `DataLoaders` returned in the `*_dataloader()` methods, i.e., automatic replacement of samplers now works with custom types of `DataLoader` ([#10680](https://github.com/Lightning-AI/lightning/pull/10680))
+- Added a function to validate if fault tolerant training is supported.
([#10465](https://github.com/Lightning-AI/lightning/pull/10465)) +- Added a private callback to manage the creation and deletion of fault-tolerance checkpoints ([#11862](https://github.com/Lightning-AI/lightning/pull/11862)) +- Show a better error message when a custom `DataLoader` implementation is not well implemented and we need to reconstruct it ([#10719](https://github.com/Lightning-AI/lightning/pull/10719)) +- Show a better error message when frozen dataclass is used as a batch ([#10927](https://github.com/Lightning-AI/lightning/pull/10927)) +- Save the `Loop`'s state by default in the checkpoint ([#10784](https://github.com/Lightning-AI/lightning/pull/10784)) +- Added `Loop.replace` to easily switch one loop for another ([#10324](https://github.com/Lightning-AI/lightning/pull/10324)) +- Added support for `--lr_scheduler=ReduceLROnPlateau` to the `LightningCLI` ([#10860](https://github.com/Lightning-AI/lightning/pull/10860)) +- Added `LightningCLI.configure_optimizers` to override the `configure_optimizers` return value ([#10860](https://github.com/Lightning-AI/lightning/pull/10860)) +- Added `LightningCLI(auto_registry)` flag to register all subclasses of the registerable components automatically ([#12108](https://github.com/Lightning-AI/lightning/pull/12108)) +- Added a warning that shows when `max_epochs` in the `Trainer` is not set ([#10700](https://github.com/Lightning-AI/lightning/pull/10700)) +- Added support for returning a single Callback from `LightningModule.configure_callbacks` without wrapping it into a list ([#11060](https://github.com/Lightning-AI/lightning/pull/11060)) +- Added `console_kwargs` for `RichProgressBar` to initialize inner Console ([#10875](https://github.com/Lightning-AI/lightning/pull/10875)) +- Added support for shorthand notation to instantiate loggers with the `LightningCLI` ([#11533](https://github.com/Lightning-AI/lightning/pull/11533)) +- Added a `LOGGER_REGISTRY` instance to register custom loggers to the `LightningCLI` ([#11533](https://github.com/Lightning-AI/lightning/pull/11533)) +- Added info message when the `Trainer` arguments `limit_*_batches`, `overfit_batches`, or `val_check_interval` are set to `1` or `1.0` ([#11950](https://github.com/Lightning-AI/lightning/pull/11950)) +- Added a `PrecisionPlugin.teardown` method ([#10990](https://github.com/Lightning-AI/lightning/pull/10990)) +- Added `LightningModule.lr_scheduler_step` ([#10249](https://github.com/Lightning-AI/lightning/pull/10249)) +- Added support for no pre-fetching to `DataFetcher` ([#11606](https://github.com/Lightning-AI/lightning/pull/11606)) +- Added support for optimizer step progress tracking with manual optimization ([#11848](https://github.com/Lightning-AI/lightning/pull/11848)) +- Return the output of the `optimizer.step`. 
This can be useful for `LightningLite` users, manual optimization users, or users overriding `LightningModule.optimizer_step` ([#11711](https://github.com/Lightning-AI/lightning/pull/11711)) +- Teardown the active loop and strategy on exception ([#11620](https://github.com/Lightning-AI/lightning/pull/11620)) +- Added a `MisconfigurationException` if user provided `opt_idx` in scheduler config doesn't match with actual optimizer index of its respective optimizer ([#11247](https://github.com/Lightning-AI/lightning/pull/11247)) +- Added a `loggers` property to `Trainer` which returns a list of loggers provided by the user ([#11683](https://github.com/Lightning-AI/lightning/pull/11683)) +- Added a `loggers` property to `LightningModule` which retrieves the `loggers` property from `Trainer` ([#11683](https://github.com/Lightning-AI/lightning/pull/11683)) +- Added support for DDP when using a `CombinedLoader` for the training data ([#11648](https://github.com/Lightning-AI/lightning/pull/11648)) +- Added a warning when using `DistributedSampler` during validation/testing ([#11479](https://github.com/Lightning-AI/lightning/pull/11479)) +- Added support for `Bagua` training strategy ([#11146](https://github.com/Lightning-AI/lightning/pull/11146)) +- Added support for manually returning a `poptorch.DataLoader` in a `*_dataloader` hook ([#12116](https://github.com/Lightning-AI/lightning/pull/12116)) +- Added `rank_zero` module to centralize utilities ([#11747](https://github.com/Lightning-AI/lightning/pull/11747)) +- Added a `_Stateful` support for `LightningDataModule` ([#11637](https://github.com/Lightning-AI/lightning/pull/11637)) +- Added `_Stateful` support for `PrecisionPlugin` ([#11638](https://github.com/Lightning-AI/lightning/pull/11638)) +- Added `Accelerator.is_available` to check device availability ([#11797](https://github.com/Lightning-AI/lightning/pull/11797)) +- Enabled static type-checking on the signature of `Trainer` ([#11888](https://github.com/Lightning-AI/lightning/pull/11888)) +- Added utility functions for moving optimizers to devices ([#11758](https://github.com/Lightning-AI/lightning/pull/11758)) +- Added a warning when saving an instance of `nn.Module` with `save_hyperparameters()` ([#12068](https://github.com/Lightning-AI/lightning/pull/12068)) +- Added `estimated_stepping_batches` property to `Trainer` ([#11599](https://github.com/Lightning-AI/lightning/pull/11599)) +- Added support for pluggable Accelerators ([#12030](https://github.com/Lightning-AI/lightning/pull/12030)) +- Added profiling for `on_load_checkpoint`/`on_save_checkpoint` callback and LightningModule hooks ([#12149](https://github.com/Lightning-AI/lightning/pull/12149)) +- Added `LayerSync` and `NativeSyncBatchNorm` plugins ([#11754](https://github.com/Lightning-AI/lightning/pull/11754)) +- Added optional `storage_options` argument to `Trainer.save_checkpoint()` to pass to custom `CheckpointIO` implementations ([#11891](https://github.com/Lightning-AI/lightning/pull/11891)) +- Added support to explicitly specify the process group backend for parallel strategies ([#11745](https://github.com/Lightning-AI/lightning/pull/11745)) +- Added `device_ids` and `num_devices` property to `Trainer` ([#12151](https://github.com/Lightning-AI/lightning/pull/12151)) +- Added `Callback.state_dict()` and `Callback.load_state_dict()` methods ([#12232](https://github.com/Lightning-AI/lightning/pull/12232)) +- Added `AcceleratorRegistry` ([#12180](https://github.com/Lightning-AI/lightning/pull/12180)) +- Added support for Habana 
Accelerator (HPU) ([#11808](https://github.com/Lightning-AI/lightning/pull/11808)) +- Added support for dataclasses in `apply_to_collections` ([#11889](https://github.com/Lightning-AI/lightning/pull/11889)) ### Changed -- Drop PyTorch 1.7 support ([#12191](https://github.com/PyTorchLightning/pytorch-lightning/pull/12191)), ([#12432](https://github.com/PyTorchLightning/pytorch-lightning/pull/12432)) -- Make `benchmark` flag optional and set its value based on the deterministic flag ([#11944](https://github.com/PyTorchLightning/pytorch-lightning/pull/11944)) -- Implemented a new native and rich format in `_print_results` method of the `EvaluationLoop` ([#11332](https://github.com/PyTorchLightning/pytorch-lightning/pull/11332)) -- Do not print an empty table at the end of the `EvaluationLoop` ([#12427](https://github.com/PyTorchLightning/pytorch-lightning/pull/12427)) -- Set the `prog_bar` flag to False in `LightningModule.log_grad_norm` ([#11472](https://github.com/PyTorchLightning/pytorch-lightning/pull/11472)) -- Raised exception in `init_dist_connection()` when torch distributed is not available ([#10418](https://github.com/PyTorchLightning/pytorch-lightning/pull/10418)) -- The `monitor` argument in the `EarlyStopping` callback is no longer optional ([#10328](https://github.com/PyTorchLightning/pytorch-lightning/pull/10328)) -- Do not fail if batch size could not be inferred for logging when using DeepSpeed ([#10438](https://github.com/PyTorchLightning/pytorch-lightning/pull/10438)) -- Raised `MisconfigurationException` when `enable_progress_bar=False` and a progress bar instance has been passed in the callback list ([#10520](https://github.com/PyTorchLightning/pytorch-lightning/pull/10520)) -- Moved `trainer.connectors.env_vars_connector._defaults_from_env_vars` to `utilities.argsparse._defaults_from_env_vars` ([#10501](https://github.com/PyTorchLightning/pytorch-lightning/pull/10501)) -- Changes in `LightningCLI` required for the new major release of jsonargparse v4.0.0 ([#10426](https://github.com/PyTorchLightning/pytorch-lightning/pull/10426)) -- Renamed `refresh_rate_per_second` parameter to `refresh_rate` for `RichProgressBar` signature ([#10497](https://github.com/PyTorchLightning/pytorch-lightning/pull/10497)) -- Moved ownership of the `PrecisionPlugin` into `TrainingTypePlugin` and updated all references ([#10570](https://github.com/PyTorchLightning/pytorch-lightning/pull/10570)) -- Fault Tolerant relies on `signal.SIGTERM` to gracefully exit instead of `signal.SIGUSR1` ([#10605](https://github.com/PyTorchLightning/pytorch-lightning/pull/10605)) -- `Loop.restarting=...` now sets the value recursively for all subloops ([#11442](https://github.com/PyTorchLightning/pytorch-lightning/pull/11442)) -- Raised an error if the `batch_size` cannot be inferred from the current batch if it contained a string or was a custom batch object ([#10541](https://github.com/PyTorchLightning/pytorch-lightning/pull/10541)) -- The validation loop is now disabled when `overfit_batches > 0` is set in the Trainer ([#9709](https://github.com/PyTorchLightning/pytorch-lightning/pull/9709)) -- Moved optimizer related logics from `Accelerator` to `TrainingTypePlugin` ([#10596](https://github.com/PyTorchLightning/pytorch-lightning/pull/10596)) -- Moved ownership of the lightning optimizers from the `Trainer` to the `Strategy` ([#11444](https://github.com/PyTorchLightning/pytorch-lightning/pull/11444)) -- Moved ownership of the data fetchers from the DataConnector to the Loops 
([#11621](https://github.com/PyTorchLightning/pytorch-lightning/pull/11621)) -- Moved `batch_to_device` method from `Accelerator` to `TrainingTypePlugin` ([#10649](https://github.com/PyTorchLightning/pytorch-lightning/pull/10649)) -- The `DDPSpawnPlugin` no longer overrides the `post_dispatch` plugin hook ([#10034](https://github.com/PyTorchLightning/pytorch-lightning/pull/10034)) -- Integrate the progress bar implementation with progress tracking ([#11213](https://github.com/PyTorchLightning/pytorch-lightning/pull/11213)) -- The `LightningModule.{add_to_queue,get_from_queue}` hooks no longer get a `torch.multiprocessing.SimpleQueue` and instead receive a list based queue ([#10034](https://github.com/PyTorchLightning/pytorch-lightning/pull/10034)) -- Changed `training_step`, `validation_step`, `test_step` and `predict_step` method signatures in `Accelerator` and updated input from caller side ([#10908](https://github.com/PyTorchLightning/pytorch-lightning/pull/10908)) -- Changed the name of the temporary checkpoint that the `DDPSpawnPlugin` and related plugins save ([#10934](https://github.com/PyTorchLightning/pytorch-lightning/pull/10934)) -- `LoggerCollection` returns only unique logger names and versions ([#10976](https://github.com/PyTorchLightning/pytorch-lightning/pull/10976)) -- Redesigned process creation for spawn-based plugins (`DDPSpawnPlugin`, `TPUSpawnPlugin`, etc.) ([#10896](https://github.com/PyTorchLightning/pytorch-lightning/pull/10896)) +- Drop PyTorch 1.7 support ([#12191](https://github.com/Lightning-AI/lightning/pull/12191)), ([#12432](https://github.com/Lightning-AI/lightning/pull/12432)) +- Make `benchmark` flag optional and set its value based on the deterministic flag ([#11944](https://github.com/Lightning-AI/lightning/pull/11944)) +- Implemented a new native and rich format in `_print_results` method of the `EvaluationLoop` ([#11332](https://github.com/Lightning-AI/lightning/pull/11332)) +- Do not print an empty table at the end of the `EvaluationLoop` ([#12427](https://github.com/Lightning-AI/lightning/pull/12427)) +- Set the `prog_bar` flag to False in `LightningModule.log_grad_norm` ([#11472](https://github.com/Lightning-AI/lightning/pull/11472)) +- Raised exception in `init_dist_connection()` when torch distributed is not available ([#10418](https://github.com/Lightning-AI/lightning/pull/10418)) +- The `monitor` argument in the `EarlyStopping` callback is no longer optional ([#10328](https://github.com/Lightning-AI/lightning/pull/10328)) +- Do not fail if batch size could not be inferred for logging when using DeepSpeed ([#10438](https://github.com/Lightning-AI/lightning/pull/10438)) +- Raised `MisconfigurationException` when `enable_progress_bar=False` and a progress bar instance has been passed in the callback list ([#10520](https://github.com/Lightning-AI/lightning/pull/10520)) +- Moved `trainer.connectors.env_vars_connector._defaults_from_env_vars` to `utilities.argsparse._defaults_from_env_vars` ([#10501](https://github.com/Lightning-AI/lightning/pull/10501)) +- Changes in `LightningCLI` required for the new major release of jsonargparse v4.0.0 ([#10426](https://github.com/Lightning-AI/lightning/pull/10426)) +- Renamed `refresh_rate_per_second` parameter to `refresh_rate` for `RichProgressBar` signature ([#10497](https://github.com/Lightning-AI/lightning/pull/10497)) +- Moved ownership of the `PrecisionPlugin` into `TrainingTypePlugin` and updated all references ([#10570](https://github.com/Lightning-AI/lightning/pull/10570)) +- Fault Tolerant relies on 
`signal.SIGTERM` to gracefully exit instead of `signal.SIGUSR1` ([#10605](https://github.com/Lightning-AI/lightning/pull/10605)) +- `Loop.restarting=...` now sets the value recursively for all subloops ([#11442](https://github.com/Lightning-AI/lightning/pull/11442)) +- Raised an error if the `batch_size` cannot be inferred from the current batch if it contained a string or was a custom batch object ([#10541](https://github.com/Lightning-AI/lightning/pull/10541)) +- The validation loop is now disabled when `overfit_batches > 0` is set in the Trainer ([#9709](https://github.com/Lightning-AI/lightning/pull/9709)) +- Moved optimizer related logics from `Accelerator` to `TrainingTypePlugin` ([#10596](https://github.com/Lightning-AI/lightning/pull/10596)) +- Moved ownership of the lightning optimizers from the `Trainer` to the `Strategy` ([#11444](https://github.com/Lightning-AI/lightning/pull/11444)) +- Moved ownership of the data fetchers from the DataConnector to the Loops ([#11621](https://github.com/Lightning-AI/lightning/pull/11621)) +- Moved `batch_to_device` method from `Accelerator` to `TrainingTypePlugin` ([#10649](https://github.com/Lightning-AI/lightning/pull/10649)) +- The `DDPSpawnPlugin` no longer overrides the `post_dispatch` plugin hook ([#10034](https://github.com/Lightning-AI/lightning/pull/10034)) +- Integrate the progress bar implementation with progress tracking ([#11213](https://github.com/Lightning-AI/lightning/pull/11213)) +- The `LightningModule.{add_to_queue,get_from_queue}` hooks no longer get a `torch.multiprocessing.SimpleQueue` and instead receive a list based queue ([#10034](https://github.com/Lightning-AI/lightning/pull/10034)) +- Changed `training_step`, `validation_step`, `test_step` and `predict_step` method signatures in `Accelerator` and updated input from caller side ([#10908](https://github.com/Lightning-AI/lightning/pull/10908)) +- Changed the name of the temporary checkpoint that the `DDPSpawnPlugin` and related plugins save ([#10934](https://github.com/Lightning-AI/lightning/pull/10934)) +- `LoggerCollection` returns only unique logger names and versions ([#10976](https://github.com/Lightning-AI/lightning/pull/10976)) +- Redesigned process creation for spawn-based plugins (`DDPSpawnPlugin`, `TPUSpawnPlugin`, etc.) ([#10896](https://github.com/Lightning-AI/lightning/pull/10896)) * All spawn-based plugins now spawn processes immediately upon calling `Trainer.{fit,validate,test,predict}` * The hooks/callbacks `prepare_data`, `setup`, `configure_sharded_model` and `teardown` now run under initialized process group for spawn-based plugins just like their non-spawn counterparts * Some configuration errors that were previously raised as `MisconfigurationException`s will now be raised as `ProcessRaisedException` (torch>=1.8) or as `Exception` (torch<1.8) - * Removed the `TrainingTypePlugin.pre_dispatch()` method and merged it with `TrainingTypePlugin.setup()` ([#11137](https://github.com/PyTorchLightning/pytorch-lightning/pull/11137)) -- Changed profiler to index and display the names of the hooks with a new pattern []. 
([#11026](https://github.com/PyTorchLightning/pytorch-lightning/pull/11026)) -- Changed `batch_to_device` entry in profiling from stage-specific to generic, to match profiling of other hooks ([#11031](https://github.com/PyTorchLightning/pytorch-lightning/pull/11031)) -- Changed the info message for finalizing ddp-spawn worker processes to a debug-level message ([#10864](https://github.com/PyTorchLightning/pytorch-lightning/pull/10864)) -- Removed duplicated file extension when uploading model checkpoints with `NeptuneLogger` ([#11015](https://github.com/PyTorchLightning/pytorch-lightning/pull/11015)) -- Removed `__getstate__` and `__setstate__` of `RichProgressBar` ([#11100](https://github.com/PyTorchLightning/pytorch-lightning/pull/11100)) -- The `DDPPlugin` and `DDPSpawnPlugin` and their subclasses now remove the `SyncBatchNorm` wrappers in `teardown()` to enable proper support at inference after fitting ([#11078](https://github.com/PyTorchLightning/pytorch-lightning/pull/11078)) -- Moved ownership of the `Accelerator` instance to the `TrainingTypePlugin`; all training-type plugins now take an optional parameter `accelerator` ([#11022](https://github.com/PyTorchLightning/pytorch-lightning/pull/11022)) -- Renamed the `TrainingTypePlugin` to `Strategy` ([#11120](https://github.com/PyTorchLightning/pytorch-lightning/pull/11120)) - * Renamed the `ParallelPlugin` to `ParallelStrategy` ([#11123](https://github.com/PyTorchLightning/pytorch-lightning/pull/11123)) - * Renamed the `DataParallelPlugin` to `DataParallelStrategy` ([#11183](https://github.com/PyTorchLightning/pytorch-lightning/pull/11183)) - * Renamed the `DDPPlugin` to `DDPStrategy` ([#11142](https://github.com/PyTorchLightning/pytorch-lightning/pull/11142)) - * Renamed the `DDP2Plugin` to `DDP2Strategy` ([#11185](https://github.com/PyTorchLightning/pytorch-lightning/pull/11185)) - * Renamed the `DDPShardedPlugin` to `DDPShardedStrategy` ([#11186](https://github.com/PyTorchLightning/pytorch-lightning/pull/11186)) - * Renamed the `DDPFullyShardedPlugin` to `DDPFullyShardedStrategy` ([#11143](https://github.com/PyTorchLightning/pytorch-lightning/pull/11143)) - * Renamed the `DDPSpawnPlugin` to `DDPSpawnStrategy` ([#11145](https://github.com/PyTorchLightning/pytorch-lightning/pull/11145)) - * Renamed the `DDPSpawnShardedPlugin` to `DDPSpawnShardedStrategy` ([#11210](https://github.com/PyTorchLightning/pytorch-lightning/pull/11210)) - * Renamed the `DeepSpeedPlugin` to `DeepSpeedStrategy` ([#11194](https://github.com/PyTorchLightning/pytorch-lightning/pull/11194)) - * Renamed the `HorovodPlugin` to `HorovodStrategy` ([#11195](https://github.com/PyTorchLightning/pytorch-lightning/pull/11195)) - * Renamed the `TPUSpawnPlugin` to `TPUSpawnStrategy` ([#11190](https://github.com/PyTorchLightning/pytorch-lightning/pull/11190)) - * Renamed the `IPUPlugin` to `IPUStrategy` ([#11193](https://github.com/PyTorchLightning/pytorch-lightning/pull/11193)) - * Renamed the `SingleDevicePlugin` to `SingleDeviceStrategy` ([#11182](https://github.com/PyTorchLightning/pytorch-lightning/pull/11182)) - * Renamed the `SingleTPUPlugin` to `SingleTPUStrategy` ([#11182](https://github.com/PyTorchLightning/pytorch-lightning/pull/11182)) - * Renamed the `TrainingTypePluginsRegistry` to `StrategyRegistry` ([#11233](https://github.com/PyTorchLightning/pytorch-lightning/pull/11233)) -- Marked the `ResultCollection`, `ResultMetric`, and `ResultMetricCollection` classes as protected ([#11130](https://github.com/PyTorchLightning/pytorch-lightning/pull/11130)) -- Marked 
`trainer.checkpoint_connector` as protected ([#11550](https://github.com/PyTorchLightning/pytorch-lightning/pull/11550)) -- The epoch start/end hooks are now called by the `FitLoop` instead of the `TrainingEpochLoop` ([#11201](https://github.com/PyTorchLightning/pytorch-lightning/pull/11201)) -- DeepSpeed does not require lightning module zero 3 partitioning ([#10655](https://github.com/PyTorchLightning/pytorch-lightning/pull/10655)) -- Moved `Strategy` classes to the `strategies` directory ([#11226](https://github.com/PyTorchLightning/pytorch-lightning/pull/11226)) -- Renamed `training_type_plugin` file to `strategy` ([#11239](https://github.com/PyTorchLightning/pytorch-lightning/pull/11239)) -- Changed `DeviceStatsMonitor` to group metrics based on the logger's `group_separator` ([#11254](https://github.com/PyTorchLightning/pytorch-lightning/pull/11254)) -- Raised `UserWarning` if evaluation is triggered with `best` ckpt and trainer is configured with multiple checkpoint callbacks ([#11274](https://github.com/PyTorchLightning/pytorch-lightning/pull/11274)) -- `Trainer.logged_metrics` now always contains scalar tensors, even when a Python scalar was logged ([#11270](https://github.com/PyTorchLightning/pytorch-lightning/pull/11270)) -- The tuner now uses the checkpoint connector to copy and restore its state ([#11518](https://github.com/PyTorchLightning/pytorch-lightning/pull/11518)) -- Changed `MisconfigurationException` to `ModuleNotFoundError` when `rich` isn't available ([#11360](https://github.com/PyTorchLightning/pytorch-lightning/pull/11360)) -- The `trainer.current_epoch` value is now increased by 1 during and after `on_train_end` ([#8578](https://github.com/PyTorchLightning/pytorch-lightning/pull/8578)) -- The `trainer.global_step` value now accounts for multiple optimizers and TBPTT splits ([#11805](https://github.com/PyTorchLightning/pytorch-lightning/pull/11805)) -- The `trainer.global_step` value is now increased right after the `optimizer.step()` call which will impact users who access it during an intra-training validation hook ([#11805](https://github.com/PyTorchLightning/pytorch-lightning/pull/11805)) -- The filename of checkpoints created with `ModelCheckpoint(filename='{step}')` is different compared to previous versions. 
A checkpoint saved after 1 step will be named `step=1.ckpt` instead of `step=0.ckpt` ([#11805](https://github.com/PyTorchLightning/pytorch-lightning/pull/11805)) -- Inherit from `ABC` for `Accelerator`: Users need to implement `auto_device_count` ([#11521](https://github.com/PyTorchLightning/pytorch-lightning/pull/11521)) -- Changed `parallel_devices` property in `ParallelStrategy` to be lazy initialized ([#11572](https://github.com/PyTorchLightning/pytorch-lightning/pull/11572)) -- Updated `TQDMProgressBar` to run a separate progress bar for each eval dataloader ([#11657](https://github.com/PyTorchLightning/pytorch-lightning/pull/11657)) -- Sorted `SimpleProfiler(extended=False)` summary based on mean duration for each hook ([#11671](https://github.com/PyTorchLightning/pytorch-lightning/pull/11671)) -- Avoid enforcing `shuffle=False` for eval dataloaders ([#11575](https://github.com/PyTorchLightning/pytorch-lightning/pull/11575)) -- When using DP (data-parallel), Lightning will no longer automatically reduce all tensors returned in training_step; it will only reduce the loss unless `training_step_end` is overridden ([#11594](https://github.com/PyTorchLightning/pytorch-lightning/pull/11594)) -- When using DP (data-parallel), the `training_epoch_end` hook will no longer receive reduced outputs from `training_step` and instead get the full tensor of results from all GPUs ([#11594](https://github.com/PyTorchLightning/pytorch-lightning/pull/11594)) -- Changed default logger name to `lightning_logs` for consistency ([#11762](https://github.com/PyTorchLightning/pytorch-lightning/pull/11762)) -- Rewrote `accelerator_connector` ([#11448](https://github.com/PyTorchLightning/pytorch-lightning/pull/11448)) -- When manual optimization is used with DDP, we no longer force `find_unused_parameters=True` ([#12425](https://github.com/PyTorchLightning/pytorch-lightning/pull/12425)) -- Disable loading dataloades if corresponding `limit_batches=0` ([#11576](https://github.com/PyTorchLightning/pytorch-lightning/pull/11576)) -- Removed `is_global_zero` check in `training_epoch_loop` before `logger.save`. If you have a custom logger that implements `save` the Trainer will now call `save` on all ranks by default. 
To change this behavior add `@rank_zero_only` to your `save` implementation ([#12134](https://github.com/PyTorchLightning/pytorch-lightning/pull/12134)) -- Disabled tuner with distributed strategies ([#12179](https://github.com/PyTorchLightning/pytorch-lightning/pull/12179)) -- Marked `trainer.logger_connector` as protected ([#12195](https://github.com/PyTorchLightning/pytorch-lightning/pull/12195)) -- Move `Strategy.process_dataloader` function call from `fit/evaluation/predict_loop.py` to `data_connector.py` ([#12251](https://github.com/PyTorchLightning/pytorch-lightning/pull/12251)) -- `ModelCheckpoint(save_last=True, every_n_epochs=N)` now saves a "last" checkpoint every epoch (disregarding `every_n_epochs`) instead of only once at the end of training ([#12418](https://github.com/PyTorchLightning/pytorch-lightning/pull/12418)) -- The strategies that support `sync_batchnorm` now only apply it when fitting ([#11919](https://github.com/PyTorchLightning/pytorch-lightning/pull/11919)) -- Avoided fallback on CPU if no devices are provided for other accelerators ([#12410](https://github.com/PyTorchLightning/pytorch-lightning/pull/12410)) -- Modified `supporters.py` so that in the accumulator element (for loss) is created directly on the device ([#12430](https://github.com/PyTorchLightning/pytorch-lightning/pull/12430)) -- Removed `EarlyStopping.on_save_checkpoint` and `EarlyStopping.on_load_checkpoint` in favor of `EarlyStopping.state_dict` and `EarlyStopping.load_state_dict` ([#11887](https://github.com/PyTorchLightning/pytorch-lightning/pull/11887)) -- Removed `BaseFinetuning.on_save_checkpoint` and `BaseFinetuning.on_load_checkpoint` in favor of `BaseFinetuning.state_dict` and `BaseFinetuning.load_state_dict` ([#11887](https://github.com/PyTorchLightning/pytorch-lightning/pull/11887)) -- Removed `BackboneFinetuning.on_save_checkpoint` and `BackboneFinetuning.on_load_checkpoint` in favor of `BackboneFinetuning.state_dict` and `BackboneFinetuning.load_state_dict` ([#11887](https://github.com/PyTorchLightning/pytorch-lightning/pull/11887)) -- Removed `ModelCheckpoint.on_save_checkpoint` and `ModelCheckpoint.on_load_checkpoint` in favor of `ModelCheckpoint.state_dict` and `ModelCheckpoint.load_state_dict` ([#11887](https://github.com/PyTorchLightning/pytorch-lightning/pull/11887)) -- Removed `Timer.on_save_checkpoint` and `Timer.on_load_checkpoint` in favor of `Timer.state_dict` and `Timer.load_state_dict` ([#11887](https://github.com/PyTorchLightning/pytorch-lightning/pull/11887)) -- Replaced PostLocalSGDOptimizer with a dedicated model averaging component ([#12378](https://github.com/PyTorchLightning/pytorch-lightning/pull/12378)) + * Removed the `TrainingTypePlugin.pre_dispatch()` method and merged it with `TrainingTypePlugin.setup()` ([#11137](https://github.com/Lightning-AI/lightning/pull/11137)) +- Changed profiler to index and display the names of the hooks with a new pattern []. 
([#11026](https://github.com/Lightning-AI/lightning/pull/11026)) +- Changed `batch_to_device` entry in profiling from stage-specific to generic, to match profiling of other hooks ([#11031](https://github.com/Lightning-AI/lightning/pull/11031)) +- Changed the info message for finalizing ddp-spawn worker processes to a debug-level message ([#10864](https://github.com/Lightning-AI/lightning/pull/10864)) +- Removed duplicated file extension when uploading model checkpoints with `NeptuneLogger` ([#11015](https://github.com/Lightning-AI/lightning/pull/11015)) +- Removed `__getstate__` and `__setstate__` of `RichProgressBar` ([#11100](https://github.com/Lightning-AI/lightning/pull/11100)) +- The `DDPPlugin` and `DDPSpawnPlugin` and their subclasses now remove the `SyncBatchNorm` wrappers in `teardown()` to enable proper support at inference after fitting ([#11078](https://github.com/Lightning-AI/lightning/pull/11078)) +- Moved ownership of the `Accelerator` instance to the `TrainingTypePlugin`; all training-type plugins now take an optional parameter `accelerator` ([#11022](https://github.com/Lightning-AI/lightning/pull/11022)) +- Renamed the `TrainingTypePlugin` to `Strategy` ([#11120](https://github.com/Lightning-AI/lightning/pull/11120)) + * Renamed the `ParallelPlugin` to `ParallelStrategy` ([#11123](https://github.com/Lightning-AI/lightning/pull/11123)) + * Renamed the `DataParallelPlugin` to `DataParallelStrategy` ([#11183](https://github.com/Lightning-AI/lightning/pull/11183)) + * Renamed the `DDPPlugin` to `DDPStrategy` ([#11142](https://github.com/Lightning-AI/lightning/pull/11142)) + * Renamed the `DDP2Plugin` to `DDP2Strategy` ([#11185](https://github.com/Lightning-AI/lightning/pull/11185)) + * Renamed the `DDPShardedPlugin` to `DDPShardedStrategy` ([#11186](https://github.com/Lightning-AI/lightning/pull/11186)) + * Renamed the `DDPFullyShardedPlugin` to `DDPFullyShardedStrategy` ([#11143](https://github.com/Lightning-AI/lightning/pull/11143)) + * Renamed the `DDPSpawnPlugin` to `DDPSpawnStrategy` ([#11145](https://github.com/Lightning-AI/lightning/pull/11145)) + * Renamed the `DDPSpawnShardedPlugin` to `DDPSpawnShardedStrategy` ([#11210](https://github.com/Lightning-AI/lightning/pull/11210)) + * Renamed the `DeepSpeedPlugin` to `DeepSpeedStrategy` ([#11194](https://github.com/Lightning-AI/lightning/pull/11194)) + * Renamed the `HorovodPlugin` to `HorovodStrategy` ([#11195](https://github.com/Lightning-AI/lightning/pull/11195)) + * Renamed the `TPUSpawnPlugin` to `TPUSpawnStrategy` ([#11190](https://github.com/Lightning-AI/lightning/pull/11190)) + * Renamed the `IPUPlugin` to `IPUStrategy` ([#11193](https://github.com/Lightning-AI/lightning/pull/11193)) + * Renamed the `SingleDevicePlugin` to `SingleDeviceStrategy` ([#11182](https://github.com/Lightning-AI/lightning/pull/11182)) + * Renamed the `SingleTPUPlugin` to `SingleTPUStrategy` ([#11182](https://github.com/Lightning-AI/lightning/pull/11182)) + * Renamed the `TrainingTypePluginsRegistry` to `StrategyRegistry` ([#11233](https://github.com/Lightning-AI/lightning/pull/11233)) +- Marked the `ResultCollection`, `ResultMetric`, and `ResultMetricCollection` classes as protected ([#11130](https://github.com/Lightning-AI/lightning/pull/11130)) +- Marked `trainer.checkpoint_connector` as protected ([#11550](https://github.com/Lightning-AI/lightning/pull/11550)) +- The epoch start/end hooks are now called by the `FitLoop` instead of the `TrainingEpochLoop` ([#11201](https://github.com/Lightning-AI/lightning/pull/11201)) +- DeepSpeed does 
not require lightning module zero 3 partitioning ([#10655](https://github.com/Lightning-AI/lightning/pull/10655)) +- Moved `Strategy` classes to the `strategies` directory ([#11226](https://github.com/Lightning-AI/lightning/pull/11226)) +- Renamed `training_type_plugin` file to `strategy` ([#11239](https://github.com/Lightning-AI/lightning/pull/11239)) +- Changed `DeviceStatsMonitor` to group metrics based on the logger's `group_separator` ([#11254](https://github.com/Lightning-AI/lightning/pull/11254)) +- Raised `UserWarning` if evaluation is triggered with `best` ckpt and trainer is configured with multiple checkpoint callbacks ([#11274](https://github.com/Lightning-AI/lightning/pull/11274)) +- `Trainer.logged_metrics` now always contains scalar tensors, even when a Python scalar was logged ([#11270](https://github.com/Lightning-AI/lightning/pull/11270)) +- The tuner now uses the checkpoint connector to copy and restore its state ([#11518](https://github.com/Lightning-AI/lightning/pull/11518)) +- Changed `MisconfigurationException` to `ModuleNotFoundError` when `rich` isn't available ([#11360](https://github.com/Lightning-AI/lightning/pull/11360)) +- The `trainer.current_epoch` value is now increased by 1 during and after `on_train_end` ([#8578](https://github.com/Lightning-AI/lightning/pull/8578)) +- The `trainer.global_step` value now accounts for multiple optimizers and TBPTT splits ([#11805](https://github.com/Lightning-AI/lightning/pull/11805)) +- The `trainer.global_step` value is now increased right after the `optimizer.step()` call which will impact users who access it during an intra-training validation hook ([#11805](https://github.com/Lightning-AI/lightning/pull/11805)) +- The filename of checkpoints created with `ModelCheckpoint(filename='{step}')` is different compared to previous versions. 
A checkpoint saved after 1 step will be named `step=1.ckpt` instead of `step=0.ckpt` ([#11805](https://github.com/Lightning-AI/lightning/pull/11805))
+- Inherit from `ABC` for `Accelerator`: users need to implement `auto_device_count` ([#11521](https://github.com/Lightning-AI/lightning/pull/11521))
+- Changed `parallel_devices` property in `ParallelStrategy` to be lazy initialized ([#11572](https://github.com/Lightning-AI/lightning/pull/11572))
+- Updated `TQDMProgressBar` to run a separate progress bar for each eval dataloader ([#11657](https://github.com/Lightning-AI/lightning/pull/11657))
+- Sorted `SimpleProfiler(extended=False)` summary based on mean duration for each hook ([#11671](https://github.com/Lightning-AI/lightning/pull/11671))
+- Avoid enforcing `shuffle=False` for eval dataloaders ([#11575](https://github.com/Lightning-AI/lightning/pull/11575))
+- When using DP (data-parallel), Lightning will no longer automatically reduce all tensors returned in `training_step`; it will only reduce the loss unless `training_step_end` is overridden ([#11594](https://github.com/Lightning-AI/lightning/pull/11594))
+- When using DP (data-parallel), the `training_epoch_end` hook will no longer receive reduced outputs from `training_step` and instead get the full tensor of results from all GPUs ([#11594](https://github.com/Lightning-AI/lightning/pull/11594))
+- Changed default logger name to `lightning_logs` for consistency ([#11762](https://github.com/Lightning-AI/lightning/pull/11762))
+- Rewrote `accelerator_connector` ([#11448](https://github.com/Lightning-AI/lightning/pull/11448))
+- When manual optimization is used with DDP, we no longer force `find_unused_parameters=True` ([#12425](https://github.com/Lightning-AI/lightning/pull/12425))
+- Disabled loading dataloaders if the corresponding `limit_batches=0` ([#11576](https://github.com/Lightning-AI/lightning/pull/11576))
+- Removed the `is_global_zero` check in `training_epoch_loop` before `logger.save`. If you have a custom logger that implements `save`, the Trainer will now call `save` on all ranks by default. To change this behavior, add `@rank_zero_only` to your `save` implementation ([#12134](https://github.com/Lightning-AI/lightning/pull/12134))
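The `logger.save` change above typically requires a follow-up in user code. Below is a minimal, hypothetical sketch of a custom logger that opts back into rank-zero-only saving; the class name `MyJsonLogger` and its no-op method bodies are illustrative assumptions rather than anything taken from this changelog, and only the `@rank_zero_only` decorator on `save` corresponds to the migration described in [#12134](https://github.com/Lightning-AI/lightning/pull/12134).

```python
# Illustrative sketch (not part of the upstream changelog): a custom logger
# that keeps the previous behaviour of saving only on the global-zero rank.
from pytorch_lightning.loggers import LightningLoggerBase
from pytorch_lightning.utilities.rank_zero import rank_zero_only


class MyJsonLogger(LightningLoggerBase):  # hypothetical logger name
    @property
    def name(self):
        return "my_json"

    @property
    def version(self):
        return 0

    @rank_zero_only
    def log_hyperparams(self, params):
        pass  # illustrative no-op

    @rank_zero_only
    def log_metrics(self, metrics, step=None):
        pass  # illustrative no-op

    @rank_zero_only
    def save(self):
        # The Trainer now calls save() on every rank; the decorator restores
        # the rank-zero-only behaviour for this logger.
        super().save()
```

Without the decorator, `save` now runs on every rank, so a logger that writes to a shared path from `save` may need its own guarding.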
+- Disabled tuner with distributed strategies ([#12179](https://github.com/Lightning-AI/lightning/pull/12179))
+- Marked `trainer.logger_connector` as protected ([#12195](https://github.com/Lightning-AI/lightning/pull/12195))
+- Moved the `Strategy.process_dataloader` function call from `fit/evaluation/predict_loop.py` to `data_connector.py` ([#12251](https://github.com/Lightning-AI/lightning/pull/12251))
+- `ModelCheckpoint(save_last=True, every_n_epochs=N)` now saves a "last" checkpoint every epoch (disregarding `every_n_epochs`) instead of only once at the end of training ([#12418](https://github.com/Lightning-AI/lightning/pull/12418))
+- The strategies that support `sync_batchnorm` now only apply it when fitting ([#11919](https://github.com/Lightning-AI/lightning/pull/11919))
+- Avoided fallback on CPU if no devices are provided for other accelerators ([#12410](https://github.com/Lightning-AI/lightning/pull/12410))
+- Modified `supporters.py` so that the accumulator element (for loss) is created directly on the device ([#12430](https://github.com/Lightning-AI/lightning/pull/12430))
+- Removed `EarlyStopping.on_save_checkpoint` and `EarlyStopping.on_load_checkpoint` in favor of `EarlyStopping.state_dict` and `EarlyStopping.load_state_dict` ([#11887](https://github.com/Lightning-AI/lightning/pull/11887))
+- Removed `BaseFinetuning.on_save_checkpoint` and `BaseFinetuning.on_load_checkpoint` in favor of `BaseFinetuning.state_dict` and `BaseFinetuning.load_state_dict` ([#11887](https://github.com/Lightning-AI/lightning/pull/11887))
+- Removed `BackboneFinetuning.on_save_checkpoint` and `BackboneFinetuning.on_load_checkpoint` in favor of `BackboneFinetuning.state_dict` and `BackboneFinetuning.load_state_dict` ([#11887](https://github.com/Lightning-AI/lightning/pull/11887))
+- Removed `ModelCheckpoint.on_save_checkpoint` and `ModelCheckpoint.on_load_checkpoint` in favor of `ModelCheckpoint.state_dict` and `ModelCheckpoint.load_state_dict` ([#11887](https://github.com/Lightning-AI/lightning/pull/11887))
+- Removed `Timer.on_save_checkpoint` and `Timer.on_load_checkpoint` in favor of `Timer.state_dict` and `Timer.load_state_dict` ([#11887](https://github.com/Lightning-AI/lightning/pull/11887))
+- Replaced `PostLocalSGDOptimizer` with a dedicated model averaging component ([#12378](https://github.com/Lightning-AI/lightning/pull/12378))

### Deprecated

-- Deprecated `training_type_plugin` property in favor of `strategy` in `Trainer` and updated the references ([#11141](https://github.com/PyTorchLightning/pytorch-lightning/pull/11141))
-- Deprecated `Trainer.{validated,tested,predicted}_ckpt_path` and replaced with read-only property `Trainer.ckpt_path` set when checkpoints loaded via `Trainer.{fit,validate,test,predict}` ([#11696](https://github.com/PyTorchLightning/pytorch-lightning/pull/11696))
-- Deprecated `ClusterEnvironment.master_{address,port}` in favor of `ClusterEnvironment.main_{address,port}` ([#10103](https://github.com/PyTorchLightning/pytorch-lightning/pull/10103))
-- Deprecated `DistributedType` in favor of `_StrategyType` ([#10505](https://github.com/PyTorchLightning/pytorch-lightning/pull/10505))
-- Deprecated the `precision_plugin` constructor argument from `Accelerator` ([#10570](https://github.com/PyTorchLightning/pytorch-lightning/pull/10570))
-- Deprecated `DeviceType` in favor of `_AcceleratorType`
([#10503](https://github.com/PyTorchLightning/pytorch-lightning/pull/10503)) -- Deprecated the property `Trainer.slurm_job_id` in favor of the new `SLURMEnvironment.job_id()` method ([#10622](https://github.com/PyTorchLightning/pytorch-lightning/pull/10622)) -- Deprecated the access to the attribute `IndexBatchSamplerWrapper.batch_indices` in favor of `IndexBatchSamplerWrapper.seen_batch_indices` ([#10870](https://github.com/PyTorchLightning/pytorch-lightning/pull/10870)) -- Deprecated `on_init_start` and `on_init_end` callback hooks ([#10940](https://github.com/PyTorchLightning/pytorch-lightning/pull/10940)) -- Deprecated `Trainer.call_hook` in favor of `Trainer._call_callback_hooks`, `Trainer._call_lightning_module_hook`, `Trainer._call_ttp_hook`, and `Trainer._call_accelerator_hook` ([#10979](https://github.com/PyTorchLightning/pytorch-lightning/pull/10979)) -- Deprecated `TrainingTypePlugin.post_dispatch` in favor of `TrainingTypePlugin.teardown` ([#10939](https://github.com/PyTorchLightning/pytorch-lightning/pull/10939)) -- Deprecated `ModelIO.on_hpc_{save/load}` in favor of `CheckpointHooks.on_{save/load}_checkpoint` ([#10911](https://github.com/PyTorchLightning/pytorch-lightning/pull/10911)) -- Deprecated `Trainer.run_stage` in favor of `Trainer.{fit,validate,test,predict}` ([#11000](https://github.com/PyTorchLightning/pytorch-lightning/pull/11000)) -- Deprecated `Trainer.lr_schedulers` in favor of `Trainer.lr_scheduler_configs` which returns a list of dataclasses instead of dictionaries ([#11443](https://github.com/PyTorchLightning/pytorch-lightning/pull/11443)) -- Deprecated `Trainer.verbose_evaluate` in favor of `EvaluationLoop(verbose=...)` ([#10931](https://github.com/PyTorchLightning/pytorch-lightning/pull/10931)) -- Deprecated `Trainer.should_rank_save_checkpoint` Trainer property ([#11068](https://github.com/PyTorchLightning/pytorch-lightning/pull/11068)) -- Deprecated `Trainer.lightning_optimizers` ([#11444](https://github.com/PyTorchLightning/pytorch-lightning/pull/11444)) -- Deprecated `TrainerOptimizersMixin` and moved functionality to `core/optimizer.py`([#11155](https://github.com/PyTorchLightning/pytorch-lightning/pull/11155)) -- Deprecated the `on_train_batch_end(outputs)` format when multiple optimizers are used and TBPTT is enabled ([#12182](https://github.com/PyTorchLightning/pytorch-lightning/pull/12182)) -- Deprecated the `training_epoch_end(outputs)` format when multiple optimizers are used and TBPTT is enabled ([#12182](https://github.com/PyTorchLightning/pytorch-lightning/pull/12182)) -- Deprecated `TrainerCallbackHookMixin` ([#11148](https://github.com/PyTorchLightning/pytorch-lightning/pull/11148)) -- Deprecated `TrainerDataLoadingMixin` and moved functionality to `Trainer` and `DataConnector` ([#11282](https://github.com/PyTorchLightning/pytorch-lightning/pull/11282)) -- Deprecated function `pytorch_lightning.callbacks.device_stats_monitor.prefix_metric_keys` ([#11254](https://github.com/PyTorchLightning/pytorch-lightning/pull/11254)) -- Deprecated `Callback.on_epoch_start` hook in favour of `Callback.on_{train/val/test}_epoch_start` ([#11578](https://github.com/PyTorchLightning/pytorch-lightning/pull/11578)) -- Deprecated `Callback.on_epoch_end` hook in favour of `Callback.on_{train/val/test}_epoch_end` ([#11578](https://github.com/PyTorchLightning/pytorch-lightning/pull/11578)) -- Deprecated `LightningModule.on_epoch_start` hook in favor of `LightningModule.on_{train/val/test}_epoch_start` 
([#11578](https://github.com/PyTorchLightning/pytorch-lightning/pull/11578)) -- Deprecated `LightningModule.on_epoch_end` hook in favor of `LightningModule.on_{train/val/test}_epoch_end` ([#11578](https://github.com/PyTorchLightning/pytorch-lightning/pull/11578)) -- Deprecated `on_before_accelerator_backend_setup` callback hook in favour of `setup` ([#11568](https://github.com/PyTorchLightning/pytorch-lightning/pull/11568)) -- Deprecated `on_batch_start` and `on_batch_end` callback hooks in favor of `on_train_batch_start` and `on_train_batch_end` ([#11577](https://github.com/PyTorchLightning/pytorch-lightning/pull/11577)) -- Deprecated `on_configure_sharded_model` callback hook in favor of `setup` ([#11627](https://github.com/PyTorchLightning/pytorch-lightning/pull/11627)) -- Deprecated `pytorch_lightning.utilities.distributed.rank_zero_only` in favor of `pytorch_lightning.utilities.rank_zero.rank_zero_only` ([#11747](https://github.com/PyTorchLightning/pytorch-lightning/pull/11747)) -- Deprecated `pytorch_lightning.utilities.distributed.rank_zero_debug` in favor of `pytorch_lightning.utilities.rank_zero.rank_zero_debug` ([#11747](https://github.com/PyTorchLightning/pytorch-lightning/pull/11747)) -- Deprecated `pytorch_lightning.utilities.distributed.rank_zero_info` in favor of `pytorch_lightning.utilities.rank_zero.rank_zero_info` ([#11747](https://github.com/PyTorchLightning/pytorch-lightning/pull/11747)) -- Deprecated `pytorch_lightning.utilities.warnings.rank_zero_warn` in favor of `pytorch_lightning.utilities.rank_zero.rank_zero_warn` ([#11747](https://github.com/PyTorchLightning/pytorch-lightning/pull/11747)) -- Deprecated `pytorch_lightning.utilities.warnings.rank_zero_deprecation` in favor of `pytorch_lightning.utilities.rank_zero.rank_zero_deprecation` ([#11747](https://github.com/PyTorchLightning/pytorch-lightning/pull/11747)) +- Deprecated `training_type_plugin` property in favor of `strategy` in `Trainer` and updated the references ([#11141](https://github.com/Lightning-AI/lightning/pull/11141)) +- Deprecated `Trainer.{validated,tested,predicted}_ckpt_path` and replaced with read-only property `Trainer.ckpt_path` set when checkpoints loaded via `Trainer.{fit,validate,test,predict}` ([#11696](https://github.com/Lightning-AI/lightning/pull/11696)) +- Deprecated `ClusterEnvironment.master_{address,port}` in favor of `ClusterEnvironment.main_{address,port}` ([#10103](https://github.com/Lightning-AI/lightning/pull/10103)) +- Deprecated `DistributedType` in favor of `_StrategyType` ([#10505](https://github.com/Lightning-AI/lightning/pull/10505)) +- Deprecated the `precision_plugin` constructor argument from `Accelerator` ([#10570](https://github.com/Lightning-AI/lightning/pull/10570)) +- Deprecated `DeviceType` in favor of `_AcceleratorType` ([#10503](https://github.com/Lightning-AI/lightning/pull/10503)) +- Deprecated the property `Trainer.slurm_job_id` in favor of the new `SLURMEnvironment.job_id()` method ([#10622](https://github.com/Lightning-AI/lightning/pull/10622)) +- Deprecated the access to the attribute `IndexBatchSamplerWrapper.batch_indices` in favor of `IndexBatchSamplerWrapper.seen_batch_indices` ([#10870](https://github.com/Lightning-AI/lightning/pull/10870)) +- Deprecated `on_init_start` and `on_init_end` callback hooks ([#10940](https://github.com/Lightning-AI/lightning/pull/10940)) +- Deprecated `Trainer.call_hook` in favor of `Trainer._call_callback_hooks`, `Trainer._call_lightning_module_hook`, `Trainer._call_ttp_hook`, and `Trainer._call_accelerator_hook` 
([#10979](https://github.com/Lightning-AI/lightning/pull/10979)) +- Deprecated `TrainingTypePlugin.post_dispatch` in favor of `TrainingTypePlugin.teardown` ([#10939](https://github.com/Lightning-AI/lightning/pull/10939)) +- Deprecated `ModelIO.on_hpc_{save/load}` in favor of `CheckpointHooks.on_{save/load}_checkpoint` ([#10911](https://github.com/Lightning-AI/lightning/pull/10911)) +- Deprecated `Trainer.run_stage` in favor of `Trainer.{fit,validate,test,predict}` ([#11000](https://github.com/Lightning-AI/lightning/pull/11000)) +- Deprecated `Trainer.lr_schedulers` in favor of `Trainer.lr_scheduler_configs` which returns a list of dataclasses instead of dictionaries ([#11443](https://github.com/Lightning-AI/lightning/pull/11443)) +- Deprecated `Trainer.verbose_evaluate` in favor of `EvaluationLoop(verbose=...)` ([#10931](https://github.com/Lightning-AI/lightning/pull/10931)) +- Deprecated `Trainer.should_rank_save_checkpoint` Trainer property ([#11068](https://github.com/Lightning-AI/lightning/pull/11068)) +- Deprecated `Trainer.lightning_optimizers` ([#11444](https://github.com/Lightning-AI/lightning/pull/11444)) +- Deprecated `TrainerOptimizersMixin` and moved functionality to `core/optimizer.py`([#11155](https://github.com/Lightning-AI/lightning/pull/11155)) +- Deprecated the `on_train_batch_end(outputs)` format when multiple optimizers are used and TBPTT is enabled ([#12182](https://github.com/Lightning-AI/lightning/pull/12182)) +- Deprecated the `training_epoch_end(outputs)` format when multiple optimizers are used and TBPTT is enabled ([#12182](https://github.com/Lightning-AI/lightning/pull/12182)) +- Deprecated `TrainerCallbackHookMixin` ([#11148](https://github.com/Lightning-AI/lightning/pull/11148)) +- Deprecated `TrainerDataLoadingMixin` and moved functionality to `Trainer` and `DataConnector` ([#11282](https://github.com/Lightning-AI/lightning/pull/11282)) +- Deprecated function `pytorch_lightning.callbacks.device_stats_monitor.prefix_metric_keys` ([#11254](https://github.com/Lightning-AI/lightning/pull/11254)) +- Deprecated `Callback.on_epoch_start` hook in favour of `Callback.on_{train/val/test}_epoch_start` ([#11578](https://github.com/Lightning-AI/lightning/pull/11578)) +- Deprecated `Callback.on_epoch_end` hook in favour of `Callback.on_{train/val/test}_epoch_end` ([#11578](https://github.com/Lightning-AI/lightning/pull/11578)) +- Deprecated `LightningModule.on_epoch_start` hook in favor of `LightningModule.on_{train/val/test}_epoch_start` ([#11578](https://github.com/Lightning-AI/lightning/pull/11578)) +- Deprecated `LightningModule.on_epoch_end` hook in favor of `LightningModule.on_{train/val/test}_epoch_end` ([#11578](https://github.com/Lightning-AI/lightning/pull/11578)) +- Deprecated `on_before_accelerator_backend_setup` callback hook in favour of `setup` ([#11568](https://github.com/Lightning-AI/lightning/pull/11568)) +- Deprecated `on_batch_start` and `on_batch_end` callback hooks in favor of `on_train_batch_start` and `on_train_batch_end` ([#11577](https://github.com/Lightning-AI/lightning/pull/11577)) +- Deprecated `on_configure_sharded_model` callback hook in favor of `setup` ([#11627](https://github.com/Lightning-AI/lightning/pull/11627)) +- Deprecated `pytorch_lightning.utilities.distributed.rank_zero_only` in favor of `pytorch_lightning.utilities.rank_zero.rank_zero_only` ([#11747](https://github.com/Lightning-AI/lightning/pull/11747)) +- Deprecated `pytorch_lightning.utilities.distributed.rank_zero_debug` in favor of 
`pytorch_lightning.utilities.rank_zero.rank_zero_debug` ([#11747](https://github.com/Lightning-AI/lightning/pull/11747)) +- Deprecated `pytorch_lightning.utilities.distributed.rank_zero_info` in favor of `pytorch_lightning.utilities.rank_zero.rank_zero_info` ([#11747](https://github.com/Lightning-AI/lightning/pull/11747)) +- Deprecated `pytorch_lightning.utilities.warnings.rank_zero_warn` in favor of `pytorch_lightning.utilities.rank_zero.rank_zero_warn` ([#11747](https://github.com/Lightning-AI/lightning/pull/11747)) +- Deprecated `pytorch_lightning.utilities.warnings.rank_zero_deprecation` in favor of `pytorch_lightning.utilities.rank_zero.rank_zero_deprecation` ([#11747](https://github.com/Lightning-AI/lightning/pull/11747)) - Deprecated `pytorch_lightning.utilities.warnings.LightningDeprecationWarning` in favor of `pytorch_lightning.utilities.rank_zero.LightningDeprecationWarning` -- Deprecated `on_pretrain_routine_start` and `on_pretrain_routine_end` callback hooks in favor of `on_fit_start` ([#11794](https://github.com/PyTorchLightning/pytorch-lightning/pull/11794)) -- Deprecated `LightningModule.on_pretrain_routine_start` and `LightningModule.on_pretrain_routine_end` hooks in favor of `on_fit_start` ([#12122](https://github.com/PyTorchLightning/pytorch-lightning/pull/12122)) -- Deprecated `agg_key_funcs` and `agg_default_func` parameters from `LightningLoggerBase` ([#11871](https://github.com/PyTorchLightning/pytorch-lightning/pull/11871)) -- Deprecated `LightningLoggerBase.update_agg_funcs` ([#11871](https://github.com/PyTorchLightning/pytorch-lightning/pull/11871)) -- Deprecated `LightningLoggerBase.agg_and_log_metrics` in favor of `LightningLoggerBase.log_metrics` ([#11832](https://github.com/PyTorchLightning/pytorch-lightning/pull/11832)) -- Deprecated passing `weights_save_path` to the `Trainer` constructor in favor of adding the `ModelCheckpoint` callback with `dirpath` directly to the list of callbacks ([#12084](https://github.com/PyTorchLightning/pytorch-lightning/pull/12084)) -- Deprecated `pytorch_lightning.profiler.AbstractProfiler` in favor of `pytorch_lightning.profiler.Profiler` ([#12106](https://github.com/PyTorchLightning/pytorch-lightning/pull/12106)) -- Deprecated `pytorch_lightning.profiler.BaseProfiler` in favor of `pytorch_lightning.profiler.Profiler` ([#12150](https://github.com/PyTorchLightning/pytorch-lightning/pull/12150)) -- Deprecated `BaseProfiler.profile_iterable` ([#12102](https://github.com/PyTorchLightning/pytorch-lightning/pull/12102)) -- Deprecated `LoggerCollection` in favor of `trainer.loggers` ([#12147](https://github.com/PyTorchLightning/pytorch-lightning/pull/12147)) -- Deprecated `PrecisionPlugin.on_{save,load}_checkpoint` in favor of `PrecisionPlugin.{state_dict,load_state_dict}` ([#11978](https://github.com/PyTorchLightning/pytorch-lightning/pull/11978)) -- Deprecated `LightningDataModule.on_save/load_checkpoint` in favor of `state_dict/load_state_dict` ([#11893](https://github.com/PyTorchLightning/pytorch-lightning/pull/11893)) -- Deprecated `Trainer.use_amp` in favor of `Trainer.amp_backend` ([#12312](https://github.com/PyTorchLightning/pytorch-lightning/pull/12312)) -- Deprecated `LightingModule.use_amp` in favor of `Trainer.amp_backend` ([#12315](https://github.com/PyTorchLightning/pytorch-lightning/pull/12315)) -- Deprecated specifying the process group backend through the environment variable `PL_TORCH_DISTRIBUTED_BACKEND` ([#11745](https://github.com/PyTorchLightning/pytorch-lightning/pull/11745)) -- Deprecated 
`ParallelPlugin.torch_distributed_backend` in favor of `DDPStrategy.process_group_backend` property ([#11745](https://github.com/PyTorchLightning/pytorch-lightning/pull/11745)) -- Deprecated `ModelCheckpoint.save_checkpoint` in favor of `Trainer.save_checkpoint` ([#12456](https://github.com/PyTorchLightning/pytorch-lightning/pull/12456)) -- Deprecated `Trainer.devices` in favor of `Trainer.num_devices` and `Trainer.device_ids` ([#12151](https://github.com/PyTorchLightning/pytorch-lightning/pull/12151)) -- Deprecated `Trainer.root_gpu` in favor of `Trainer.strategy.root_device.index` when GPU is used ([#12262](https://github.com/PyTorchLightning/pytorch-lightning/pull/12262)) -- Deprecated `Trainer.num_gpus` in favor of `Trainer.num_devices` when GPU is used ([#12384](https://github.com/PyTorchLightning/pytorch-lightning/pull/12384)) -- Deprecated `Trainer.ipus` in favor of `Trainer.num_devices` when IPU is used ([#12386](https://github.com/PyTorchLightning/pytorch-lightning/pull/12386)) -- Deprecated `Trainer.num_processes` in favor of `Trainer.num_devices` ([#12388](https://github.com/PyTorchLightning/pytorch-lightning/pull/12388)) -- Deprecated `Trainer.data_parallel_device_ids` in favor of `Trainer.device_ids` ([#12072](https://github.com/PyTorchLightning/pytorch-lightning/pull/12072)) -- Deprecated returning state from `Callback.on_save_checkpoint` in favor of returning state in `Callback.state_dict` for checkpointing ([#11887](https://github.com/PyTorchLightning/pytorch-lightning/pull/11887)) -- Deprecated passing only the callback state to `Callback.on_load_checkpoint(callback_state)` in favor of passing the callback state to `Callback.load_state_dict` and in 1.8, passing the entire checkpoint dictionary to `Callback.on_load_checkpoint(checkpoint)` ([#11887](https://github.com/PyTorchLightning/pytorch-lightning/pull/11887)) -- Deprecated `Trainer.gpus` in favor of `Trainer.device_ids` or `Trainer.num_devices` ([#12436](https://github.com/PyTorchLightning/pytorch-lightning/pull/12436)) -- Deprecated `Trainer.tpu_cores` in favor of `Trainer.num_devices` ([#12437](https://github.com/PyTorchLightning/pytorch-lightning/pull/12437)) +- Deprecated `on_pretrain_routine_start` and `on_pretrain_routine_end` callback hooks in favor of `on_fit_start` ([#11794](https://github.com/Lightning-AI/lightning/pull/11794)) +- Deprecated `LightningModule.on_pretrain_routine_start` and `LightningModule.on_pretrain_routine_end` hooks in favor of `on_fit_start` ([#12122](https://github.com/Lightning-AI/lightning/pull/12122)) +- Deprecated `agg_key_funcs` and `agg_default_func` parameters from `LightningLoggerBase` ([#11871](https://github.com/Lightning-AI/lightning/pull/11871)) +- Deprecated `LightningLoggerBase.update_agg_funcs` ([#11871](https://github.com/Lightning-AI/lightning/pull/11871)) +- Deprecated `LightningLoggerBase.agg_and_log_metrics` in favor of `LightningLoggerBase.log_metrics` ([#11832](https://github.com/Lightning-AI/lightning/pull/11832)) +- Deprecated passing `weights_save_path` to the `Trainer` constructor in favor of adding the `ModelCheckpoint` callback with `dirpath` directly to the list of callbacks ([#12084](https://github.com/Lightning-AI/lightning/pull/12084)) +- Deprecated `pytorch_lightning.profiler.AbstractProfiler` in favor of `pytorch_lightning.profiler.Profiler` ([#12106](https://github.com/Lightning-AI/lightning/pull/12106)) +- Deprecated `pytorch_lightning.profiler.BaseProfiler` in favor of `pytorch_lightning.profiler.Profiler` 
([#12150](https://github.com/Lightning-AI/lightning/pull/12150)) +- Deprecated `BaseProfiler.profile_iterable` ([#12102](https://github.com/Lightning-AI/lightning/pull/12102)) +- Deprecated `LoggerCollection` in favor of `trainer.loggers` ([#12147](https://github.com/Lightning-AI/lightning/pull/12147)) +- Deprecated `PrecisionPlugin.on_{save,load}_checkpoint` in favor of `PrecisionPlugin.{state_dict,load_state_dict}` ([#11978](https://github.com/Lightning-AI/lightning/pull/11978)) +- Deprecated `LightningDataModule.on_save/load_checkpoint` in favor of `state_dict/load_state_dict` ([#11893](https://github.com/Lightning-AI/lightning/pull/11893)) +- Deprecated `Trainer.use_amp` in favor of `Trainer.amp_backend` ([#12312](https://github.com/Lightning-AI/lightning/pull/12312)) +- Deprecated `LightingModule.use_amp` in favor of `Trainer.amp_backend` ([#12315](https://github.com/Lightning-AI/lightning/pull/12315)) +- Deprecated specifying the process group backend through the environment variable `PL_TORCH_DISTRIBUTED_BACKEND` ([#11745](https://github.com/Lightning-AI/lightning/pull/11745)) +- Deprecated `ParallelPlugin.torch_distributed_backend` in favor of `DDPStrategy.process_group_backend` property ([#11745](https://github.com/Lightning-AI/lightning/pull/11745)) +- Deprecated `ModelCheckpoint.save_checkpoint` in favor of `Trainer.save_checkpoint` ([#12456](https://github.com/Lightning-AI/lightning/pull/12456)) +- Deprecated `Trainer.devices` in favor of `Trainer.num_devices` and `Trainer.device_ids` ([#12151](https://github.com/Lightning-AI/lightning/pull/12151)) +- Deprecated `Trainer.root_gpu` in favor of `Trainer.strategy.root_device.index` when GPU is used ([#12262](https://github.com/Lightning-AI/lightning/pull/12262)) +- Deprecated `Trainer.num_gpus` in favor of `Trainer.num_devices` when GPU is used ([#12384](https://github.com/Lightning-AI/lightning/pull/12384)) +- Deprecated `Trainer.ipus` in favor of `Trainer.num_devices` when IPU is used ([#12386](https://github.com/Lightning-AI/lightning/pull/12386)) +- Deprecated `Trainer.num_processes` in favor of `Trainer.num_devices` ([#12388](https://github.com/Lightning-AI/lightning/pull/12388)) +- Deprecated `Trainer.data_parallel_device_ids` in favor of `Trainer.device_ids` ([#12072](https://github.com/Lightning-AI/lightning/pull/12072)) +- Deprecated returning state from `Callback.on_save_checkpoint` in favor of returning state in `Callback.state_dict` for checkpointing ([#11887](https://github.com/Lightning-AI/lightning/pull/11887)) +- Deprecated passing only the callback state to `Callback.on_load_checkpoint(callback_state)` in favor of passing the callback state to `Callback.load_state_dict` and in 1.8, passing the entire checkpoint dictionary to `Callback.on_load_checkpoint(checkpoint)` ([#11887](https://github.com/Lightning-AI/lightning/pull/11887)) +- Deprecated `Trainer.gpus` in favor of `Trainer.device_ids` or `Trainer.num_devices` ([#12436](https://github.com/Lightning-AI/lightning/pull/12436)) +- Deprecated `Trainer.tpu_cores` in favor of `Trainer.num_devices` ([#12437](https://github.com/Lightning-AI/lightning/pull/12437)) ### Removed -- Removed deprecated parameter `method` in `pytorch_lightning.utilities.model_helpers.is_overridden` ([#10507](https://github.com/PyTorchLightning/pytorch-lightning/pull/10507)) -- Remove deprecated method `ClusterEnvironment.creates_children` ([#10339](https://github.com/PyTorchLightning/pytorch-lightning/pull/10339)) -- Removed deprecated `TrainerModelHooksMixin.is_function_implemented` and 
`TrainerModelHooksMixin.has_arg` ([#10322](https://github.com/PyTorchLightning/pytorch-lightning/pull/10322)) -- Removed deprecated `pytorch_lightning.utilities.device_dtype_mixin.DeviceDtypeModuleMixin` in favor of `pytorch_lightning.core.mixins.device_dtype_mixin.DeviceDtypeModuleMixin` ([#10442](https://github.com/PyTorchLightning/pytorch-lightning/pull/10442)) -- Removed deprecated `LightningModule.loaded_optimizer_states_dict` property ([#10346](https://github.com/PyTorchLightning/pytorch-lightning/pull/10346)) -- Removed deprecated `Trainer.fit(train_dataloader=)`, `Trainer.validate(val_dataloaders=)`, and `Trainer.test(test_dataloader=)` ([#10325](https://github.com/PyTorchLightning/pytorch-lightning/pull/10325)) -- Removed deprecated `has_prepared_data`, `has_setup_fit`, `has_setup_validate`, `has_setup_test`, `has_setup_predict`, `has_teardown_fit`, `has_teardown_validate`, `has_teardown_test` and `has_teardown_predict` datamodule lifecycle properties ([#10350](https://github.com/PyTorchLightning/pytorch-lightning/pull/10350)) -- Removed deprecated `every_n_val_epochs` parameter of ModelCheckpoint ([#10366](https://github.com/PyTorchLightning/pytorch-lightning/pull/10366)) -- Removed deprecated `import pytorch_lightning.profiler.profilers` in favor of `import pytorch_lightning.profiler` ([#10443](https://github.com/PyTorchLightning/pytorch-lightning/pull/10443)) -- Removed deprecated property `configure_slurm_dpp` from accelerator connector ([#10370](https://github.com/PyTorchLightning/pytorch-lightning/pull/10370)) -- Removed deprecated arguments `num_nodes` and `sync_batchnorm` from `DDPPlugin`, `DDPSpawnPlugin`, `DeepSpeedPlugin` ([#10357](https://github.com/PyTorchLightning/pytorch-lightning/pull/10357)) -- Removed deprecated property `is_slurm_managing_tasks` from AcceleratorConnector ([#10353](https://github.com/PyTorchLightning/pytorch-lightning/pull/10353)) -- Removed deprecated `LightningModule.log(tbptt_reduce_fx, tbptt_reduce_token, sync_dist_op)` ([#10423](https://github.com/PyTorchLightning/pytorch-lightning/pull/10423)) -- Removed deprecated `Plugin.task_idx` ([#10441](https://github.com/PyTorchLightning/pytorch-lightning/pull/10441)) -- Removed deprecated method `master_params` from PrecisionPlugin ([#10372](https://github.com/PyTorchLightning/pytorch-lightning/pull/10372)) -- Removed the automatic detachment of "extras" returned from `training_step`. 
For example, `return {'loss': ..., 'foo': foo.detach()}` will now be necessary if `foo` has gradients which you do not want to store ([#10424](https://github.com/PyTorchLightning/pytorch-lightning/pull/10424)) +- Removed deprecated parameter `method` in `pytorch_lightning.utilities.model_helpers.is_overridden` ([#10507](https://github.com/Lightning-AI/lightning/pull/10507)) +- Remove deprecated method `ClusterEnvironment.creates_children` ([#10339](https://github.com/Lightning-AI/lightning/pull/10339)) +- Removed deprecated `TrainerModelHooksMixin.is_function_implemented` and `TrainerModelHooksMixin.has_arg` ([#10322](https://github.com/Lightning-AI/lightning/pull/10322)) +- Removed deprecated `pytorch_lightning.utilities.device_dtype_mixin.DeviceDtypeModuleMixin` in favor of `pytorch_lightning.core.mixins.device_dtype_mixin.DeviceDtypeModuleMixin` ([#10442](https://github.com/Lightning-AI/lightning/pull/10442)) +- Removed deprecated `LightningModule.loaded_optimizer_states_dict` property ([#10346](https://github.com/Lightning-AI/lightning/pull/10346)) +- Removed deprecated `Trainer.fit(train_dataloader=)`, `Trainer.validate(val_dataloaders=)`, and `Trainer.test(test_dataloader=)` ([#10325](https://github.com/Lightning-AI/lightning/pull/10325)) +- Removed deprecated `has_prepared_data`, `has_setup_fit`, `has_setup_validate`, `has_setup_test`, `has_setup_predict`, `has_teardown_fit`, `has_teardown_validate`, `has_teardown_test` and `has_teardown_predict` datamodule lifecycle properties ([#10350](https://github.com/Lightning-AI/lightning/pull/10350)) +- Removed deprecated `every_n_val_epochs` parameter of ModelCheckpoint ([#10366](https://github.com/Lightning-AI/lightning/pull/10366)) +- Removed deprecated `import pytorch_lightning.profiler.profilers` in favor of `import pytorch_lightning.profiler` ([#10443](https://github.com/Lightning-AI/lightning/pull/10443)) +- Removed deprecated property `configure_slurm_dpp` from accelerator connector ([#10370](https://github.com/Lightning-AI/lightning/pull/10370)) +- Removed deprecated arguments `num_nodes` and `sync_batchnorm` from `DDPPlugin`, `DDPSpawnPlugin`, `DeepSpeedPlugin` ([#10357](https://github.com/Lightning-AI/lightning/pull/10357)) +- Removed deprecated property `is_slurm_managing_tasks` from AcceleratorConnector ([#10353](https://github.com/Lightning-AI/lightning/pull/10353)) +- Removed deprecated `LightningModule.log(tbptt_reduce_fx, tbptt_reduce_token, sync_dist_op)` ([#10423](https://github.com/Lightning-AI/lightning/pull/10423)) +- Removed deprecated `Plugin.task_idx` ([#10441](https://github.com/Lightning-AI/lightning/pull/10441)) +- Removed deprecated method `master_params` from PrecisionPlugin ([#10372](https://github.com/Lightning-AI/lightning/pull/10372)) +- Removed the automatic detachment of "extras" returned from `training_step`. For example, `return {'loss': ..., 'foo': foo.detach()}` will now be necessary if `foo` has gradients which you do not want to store ([#10424](https://github.com/Lightning-AI/lightning/pull/10424)) - Removed deprecated passthrough methods and properties from `Accelerator` base class: - * ([#10403](https://github.com/PyTorchLightning/pytorch-lightning/pull/10403)) - * ([#10448](https://github.com/PyTorchLightning/pytorch-lightning/pull/10448)) -- Removed deprecated signature for `transfer_batch_to_device` hook. 
The new argument `dataloader_idx` is now required ([#10480](https://github.com/PyTorchLightning/pytorch-lightning/pull/10480)) -- Removed deprecated `utilities.distributed.rank_zero_{warn/deprecation}` ([#10451](https://github.com/PyTorchLightning/pytorch-lightning/pull/10451)) -- Removed deprecated `mode` argument from `ModelSummary` class ([#10449](https://github.com/PyTorchLightning/pytorch-lightning/pull/10449)) -- Removed deprecated `Trainer.train_loop` property in favor of `Trainer.fit_loop` ([#10482](https://github.com/PyTorchLightning/pytorch-lightning/pull/10482)) -- Removed deprecated `Trainer.train_loop` property in favor of `Trainer.fit_loop` ([#10482](https://github.com/PyTorchLightning/pytorch-lightning/pull/10482)) -- Removed deprecated `disable_validation` property from Trainer ([#10450](https://github.com/PyTorchLightning/pytorch-lightning/pull/10450)) -- Removed deprecated `CheckpointConnector.hpc_load` property in favor of `CheckpointConnector.restore` ([#10525](https://github.com/PyTorchLightning/pytorch-lightning/pull/10525)) -- Removed deprecated `reload_dataloaders_every_epoch` from `Trainer` in favour of `reload_dataloaders_every_n_epochs` ([#10481](https://github.com/PyTorchLightning/pytorch-lightning/pull/10481)) -- Removed the `precision_plugin` attribute from `Accelerator` in favor of its equivalent attribute `precision_plugin` in the `TrainingTypePlugin` ([#10570](https://github.com/PyTorchLightning/pytorch-lightning/pull/10570)) -- Removed `DeepSpeedPlugin.{precision,amp_type,amp_level}` properties ([#10657](https://github.com/PyTorchLightning/pytorch-lightning/pull/10657)) -- Removed patching of `on_before_batch_transfer`, `transfer_batch_to_device` and `on_after_batch_transfer` hooks in `LightningModule` ([#10603](https://github.com/PyTorchLightning/pytorch-lightning/pull/10603)) -- Removed argument `return_result` from the `DDPSpawnPlugin.spawn()` method ([#10867](https://github.com/PyTorchLightning/pytorch-lightning/pull/10867)) -- Removed the property `TrainingTypePlugin.results` and corresponding properties in subclasses ([#10034](https://github.com/PyTorchLightning/pytorch-lightning/pull/10034)) -- Removed the `mp_queue` attribute from `DDPSpawnPlugin` and `TPUSpawnPlugin` ([#10034](https://github.com/PyTorchLightning/pytorch-lightning/pull/10034)) -- Removed unnecessary `_move_optimizer_state` method overrides from `TPUSpawnPlugin` and `SingleTPUPlugin` ([#10849](https://github.com/PyTorchLightning/pytorch-lightning/pull/10849)) -- Removed `should_rank_save_checkpoint` property from `TrainingTypePlugin` ([#11070](https://github.com/PyTorchLightning/pytorch-lightning/pull/11070)) -- Removed `model_sharded_context` method from `Accelerator` ([#10886](https://github.com/PyTorchLightning/pytorch-lightning/pull/10886)) -- Removed method `pre_dispatch` from the `PrecisionPlugin` ([#10887](https://github.com/PyTorchLightning/pytorch-lightning/pull/10887)) -- Removed method `setup_optimizers_in_pre_dispatch` from the `strategies` and achieve the same logic in `setup` and `pre_dispatch` methods ([#10906](https://github.com/PyTorchLightning/pytorch-lightning/pull/10906)) -- Removed methods `pre_dispatch`, `dispatch` and `post_dispatch` from the `Accelerator` ([#10885](https://github.com/PyTorchLightning/pytorch-lightning/pull/10885)) -- Removed method `training_step`, `test_step`, `validation_step` and `predict_step` from the `Accelerator` ([#10890](https://github.com/PyTorchLightning/pytorch-lightning/pull/10890)) -- Removed 
`TrainingTypePlugin.start_{training,evaluating,predicting}` hooks and the same in all subclasses ([#10989](https://github.com/PyTorchLightning/pytorch-lightning/pull/10989), [#10896](https://github.com/PyTorchLightning/pytorch-lightning/pull/10896)) -- Removed `Accelerator.on_train_start` ([#10999](https://github.com/PyTorchLightning/pytorch-lightning/pull/10999)) -- Removed support for Python 3.6 ([#11117](https://github.com/PyTorchLightning/pytorch-lightning/pull/11117)) -- Removed `Strategy.init_optimizers` in favor of `Strategy.setup_optimizers` ([#11236](https://github.com/PyTorchLightning/pytorch-lightning/pull/11236)) -- Removed `profile("training_step_and_backward")` in `Closure` class since we already profile calls `training_step` and `backward` ([#11222](https://github.com/PyTorchLightning/pytorch-lightning/pull/11222)) -- Removed `Strategy.optimizer_zero_grad` ([#11246](https://github.com/PyTorchLightning/pytorch-lightning/pull/11246)) -- Removed `Strategy.on_gpu` ([#11537](https://github.com/PyTorchLightning/pytorch-lightning/pull/11537)) -- Removed `Strategy.on_tpu` property ([#11536](https://github.com/PyTorchLightning/pytorch-lightning/pull/11536)) -- Removed the abstract property `LightningLoggerBase.experiment` ([#11603](https://github.com/PyTorchLightning/pytorch-lightning/pull/11603)) -- Removed `FitLoop.current_epoch` getter and setter ([#11562](https://github.com/PyTorchLightning/pytorch-lightning/pull/11562)) -- Removed access to `_short_id` in `NeptuneLogger` ([#11517](https://github.com/PyTorchLightning/pytorch-lightning/pull/11517)) -- Removed `log_text` and `log_image` from the `LightningLoggerBase` API ([#11857](https://github.com/PyTorchLightning/pytorch-lightning/pull/11857)) -- Removed calls to `profile("model_forward")` in favor of profiling `training_step` ([#12032](https://github.com/PyTorchLightning/pytorch-lightning/pull/12032)) -- Removed `get_mp_spawn_kwargs` from `DDPSpawnStrategy` and `TPUSpawnStrategy` in favor of configuration in the `_SpawnLauncher` ([#11966](https://github.com/PyTorchLightning/pytorch-lightning/pull/11966)) -- Removed `_aggregate_metrics`, `_reduce_agg_metrics`, and `_finalize_agg_metrics` from `LightningLoggerBase` ([#12053](https://github.com/PyTorchLightning/pytorch-lightning/pull/12053)) -- Removed the `AcceleratorConnector.device_type` property ([#12081](https://github.com/PyTorchLightning/pytorch-lightning/pull/12081)) -- Removed `AcceleratorConnector.num_nodes` ([#12107](https://github.com/PyTorchLightning/pytorch-lightning/pull/12107)) -- Removed `AcceleratorConnector.has_ipu` property ([#12111](https://github.com/PyTorchLightning/pytorch-lightning/pull/12111)) -- Removed `AcceleratorConnector.use_ipu` property ([#12110](https://github.com/PyTorchLightning/pytorch-lightning/pull/12110)) -- Removed `AcceleratorConnector.has_tpu` property ([#12109](https://github.com/PyTorchLightning/pytorch-lightning/pull/12109)) -- Removed `AcceleratorConnector.use_dp` property ([#12112](https://github.com/PyTorchLightning/pytorch-lightning/pull/12112)) -- Removed `configure_sync_batchnorm` from `ParallelStrategy` and all other strategies that inherit from it ([#11754](https://github.com/PyTorchLightning/pytorch-lightning/pull/11754)) -- Removed public attribute `sync_batchnorm` from strategies ([#11754](https://github.com/PyTorchLightning/pytorch-lightning/pull/11754)) -- Removed `AcceleratorConnector.root_gpu` property ([#12262](https://github.com/PyTorchLightning/pytorch-lightning/pull/12262)) -- Removed `AcceleratorConnector.tpu_id` 
property ([#12387](https://github.com/PyTorchLightning/pytorch-lightning/pull/12387)) -- Removed `AcceleratorConnector.num_gpus` property ([#12384](https://github.com/PyTorchLightning/pytorch-lightning/pull/12384)) -- Removed `AcceleratorConnector.num_ipus` property ([#12386](https://github.com/PyTorchLightning/pytorch-lightning/pull/12386)) -- Removed `AcceleratorConnector.num_processes` property ([#12388](https://github.com/PyTorchLightning/pytorch-lightning/pull/12388)) -- Removed `AcceleratorConnector.parallel_device_ids` property ([#12072](https://github.com/PyTorchLightning/pytorch-lightning/pull/12072)) -- Removed `AcceleratorConnector.devices` property ([#12435](https://github.com/PyTorchLightning/pytorch-lightning/pull/12435)) -- Removed `AcceleratorConnector.parallel_devices` property ([#12075](https://github.com/PyTorchLightning/pytorch-lightning/pull/12075)) -- Removed `AcceleratorConnector.tpu_cores` property ([#12437](https://github.com/PyTorchLightning/pytorch-lightning/pull/12437)) - -### Fixed - -- Fixed an issue where `ModelCheckpoint` could delete last checkpoint from the old directory when `dirpath` has changed during resumed training ([#12225](https://github.com/PyTorchLightning/pytorch-lightning/pull/12225)) -- Fixed an issue where `ModelCheckpoint` could delete older checkpoints when `dirpath` has changed during resumed training ([#12045](https://github.com/PyTorchLightning/pytorch-lightning/pull/12045)) -- Fixed an issue where `HorovodStrategy.teardown()` did not complete gracefully if an exception was thrown during callback setup [#11752](https://github.com/PyTorchLightning/pytorch-lightning/pull/11752) -- Fixed security vulnerabilities CVE-2020-1747 and CVE-2020-14343 caused by the `PyYAML` dependency ([#11099](https://github.com/PyTorchLightning/pytorch-lightning/pull/11099)) -- Fixed security vulnerability "CWE-94: Improper Control of Generation of Code (Code Injection)" ([#12212](https://github.com/PyTorchLightning/pytorch-lightning/pull/12212)) -- Fixed logging on `{test,validation}_epoch_end` with multiple dataloaders ([#11132](https://github.com/PyTorchLightning/pytorch-lightning/pull/11132)) -- Reset the validation progress tracking state after sanity checking ([#11218](https://github.com/PyTorchLightning/pytorch-lightning/pull/11218)) -- Fixed double evaluation bug with fault-tolerance enabled where the second call was completely skipped ([#11119](https://github.com/PyTorchLightning/pytorch-lightning/pull/11119)) -- Fixed an issue with the `TPUSpawnPlugin` handling the `XLA_USE_BF16` environment variable incorrectly ([#10990](https://github.com/PyTorchLightning/pytorch-lightning/pull/10990)) -- Fixed wrong typehint for `Trainer.lightning_optimizers` ([#11155](https://github.com/PyTorchLightning/pytorch-lightning/pull/11155)) -- Fixed the lr-scheduler state not being dumped to checkpoint when using the deepspeed strategy ([#11307](https://github.com/PyTorchLightning/pytorch-lightning/pull/11307)) -- Fixed bug that forced overriding `configure_optimizers` with the CLI ([#11672](https://github.com/PyTorchLightning/pytorch-lightning/pull/11672)) -- Fixed type promotion when tensors of higher category than float are logged ([#11401](https://github.com/PyTorchLightning/pytorch-lightning/pull/11401)) -- Fixed `SimpleProfiler` summary ([#11414](https://github.com/PyTorchLightning/pytorch-lightning/pull/11414)) -- No longer set a `DistributedSampler` to the `poptorch.DataLoader` when IPUs are used 
([#12114](https://github.com/PyTorchLightning/pytorch-lightning/pull/12114)) -- Fixed bug where progress bar was not being disabled when not in rank zero during predict ([#11377](https://github.com/PyTorchLightning/pytorch-lightning/pull/11377)) -- Fixed the mid-epoch warning call while resuming training ([#11556](https://github.com/PyTorchLightning/pytorch-lightning/pull/11556)) -- Fixed `LightningModule.{un,}toggle_model` when only 1 optimizer is used ([#12088](https://github.com/PyTorchLightning/pytorch-lightning/pull/12088)) -- Fixed an issue in `RichProgressbar` to display the metrics logged only on main progress bar ([#11690](https://github.com/PyTorchLightning/pytorch-lightning/pull/11690)) -- Fixed `RichProgressBar` progress when refresh rate does not evenly divide the total counter ([#11668](https://github.com/PyTorchLightning/pytorch-lightning/pull/11668)) -- Fixed `RichProgressBar` progress validation bar total when using multiple validation runs within a single training epoch ([#11668](https://github.com/PyTorchLightning/pytorch-lightning/pull/11668)) -- Configure native Deepspeed schedulers with interval='step' ([#11788](https://github.com/PyTorchLightning/pytorch-lightning/pull/11788)), ([#12031](https://github.com/PyTorchLightning/pytorch-lightning/pull/12031)) -- Update `RichProgressBarTheme` styles after detecting light theme on colab ([#10993](https://github.com/PyTorchLightning/pytorch-lightning/pull/10993)) -- Fixed passing `_ddp_params_and_buffers_to_ignore` ([#11949](https://github.com/PyTorchLightning/pytorch-lightning/pull/11949)) -- Fixed an `AttributeError` when calling `save_hyperparameters` and no parameters need saving ([#11827](https://github.com/PyTorchLightning/pytorch-lightning/pull/11827)) -- Fixed environment variable priority for global rank determination ([#11406](https://github.com/PyTorchLightning/pytorch-lightning/pull/11406)) -- Fixed an issue that caused the Trainer to produce identical results on subsequent runs without explicit re-seeding ([#11870](https://github.com/PyTorchLightning/pytorch-lightning/pull/11870)) -- Fixed an issue that caused the Tuner to affect the random state ([#11870](https://github.com/PyTorchLightning/pytorch-lightning/pull/11870)) -- Fixed to avoid common hook warning if no hook is overridden ([#12131](https://github.com/PyTorchLightning/pytorch-lightning/pull/12131)) -- Fixed deepspeed keeping old sub-folders in same ckpt path ([#12194](https://github.com/PyTorchLightning/pytorch-lightning/pull/12194)) -- Fixed returning logged metrics instead of callback metrics during evaluation ([#12224](https://github.com/PyTorchLightning/pytorch-lightning/pull/12224)) -- Fixed the case where `logger=None` is passed to the Trainer ([#12249](https://github.com/PyTorchLightning/pytorch-lightning/pull/12249)) -- Fixed bug where the global step tracked by `ModelCheckpoint` was still set even if no checkpoint was saved ([#12418](https://github.com/PyTorchLightning/pytorch-lightning/pull/12418)) -- Fixed bug where `ModelCheckpoint` was overriding the `epoch` and `step` logged values ([#12418](https://github.com/PyTorchLightning/pytorch-lightning/pull/12418)) -- Fixed bug where monitoring the default `epoch` and `step` values with `ModelCheckpoint` would fail ([#12418](https://github.com/PyTorchLightning/pytorch-lightning/pull/12418)) -- Fixed initializing optimizers unnecessarily in `DDPFullyShardedStrategy` ([#12267](https://github.com/PyTorchLightning/pytorch-lightning/pull/12267)) -- Fixed check for horovod module 
([#12377](https://github.com/PyTorchLightning/pytorch-lightning/pull/12377)) -- Fixed logging to loggers with multiple eval dataloaders ([#12454](https://github.com/PyTorchLightning/pytorch-lightning/pull/12454)) -- Fixed an issue with resuming from a checkpoint trained with QAT ([#11346](https://github.com/PyTorchLightning/pytorch-lightning/pull/11346)) + * ([#10403](https://github.com/Lightning-AI/lightning/pull/10403)) + * ([#10448](https://github.com/Lightning-AI/lightning/pull/10448)) +- Removed deprecated signature for `transfer_batch_to_device` hook. The new argument `dataloader_idx` is now required ([#10480](https://github.com/Lightning-AI/lightning/pull/10480)) +- Removed deprecated `utilities.distributed.rank_zero_{warn/deprecation}` ([#10451](https://github.com/Lightning-AI/lightning/pull/10451)) +- Removed deprecated `mode` argument from `ModelSummary` class ([#10449](https://github.com/Lightning-AI/lightning/pull/10449)) +- Removed deprecated `Trainer.train_loop` property in favor of `Trainer.fit_loop` ([#10482](https://github.com/Lightning-AI/lightning/pull/10482)) +- Removed deprecated `Trainer.train_loop` property in favor of `Trainer.fit_loop` ([#10482](https://github.com/Lightning-AI/lightning/pull/10482)) +- Removed deprecated `disable_validation` property from Trainer ([#10450](https://github.com/Lightning-AI/lightning/pull/10450)) +- Removed deprecated `CheckpointConnector.hpc_load` property in favor of `CheckpointConnector.restore` ([#10525](https://github.com/Lightning-AI/lightning/pull/10525)) +- Removed deprecated `reload_dataloaders_every_epoch` from `Trainer` in favour of `reload_dataloaders_every_n_epochs` ([#10481](https://github.com/Lightning-AI/lightning/pull/10481)) +- Removed the `precision_plugin` attribute from `Accelerator` in favor of its equivalent attribute `precision_plugin` in the `TrainingTypePlugin` ([#10570](https://github.com/Lightning-AI/lightning/pull/10570)) +- Removed `DeepSpeedPlugin.{precision,amp_type,amp_level}` properties ([#10657](https://github.com/Lightning-AI/lightning/pull/10657)) +- Removed patching of `on_before_batch_transfer`, `transfer_batch_to_device` and `on_after_batch_transfer` hooks in `LightningModule` ([#10603](https://github.com/Lightning-AI/lightning/pull/10603)) +- Removed argument `return_result` from the `DDPSpawnPlugin.spawn()` method ([#10867](https://github.com/Lightning-AI/lightning/pull/10867)) +- Removed the property `TrainingTypePlugin.results` and corresponding properties in subclasses ([#10034](https://github.com/Lightning-AI/lightning/pull/10034)) +- Removed the `mp_queue` attribute from `DDPSpawnPlugin` and `TPUSpawnPlugin` ([#10034](https://github.com/Lightning-AI/lightning/pull/10034)) +- Removed unnecessary `_move_optimizer_state` method overrides from `TPUSpawnPlugin` and `SingleTPUPlugin` ([#10849](https://github.com/Lightning-AI/lightning/pull/10849)) +- Removed `should_rank_save_checkpoint` property from `TrainingTypePlugin` ([#11070](https://github.com/Lightning-AI/lightning/pull/11070)) +- Removed `model_sharded_context` method from `Accelerator` ([#10886](https://github.com/Lightning-AI/lightning/pull/10886)) +- Removed method `pre_dispatch` from the `PrecisionPlugin` ([#10887](https://github.com/Lightning-AI/lightning/pull/10887)) +- Removed method `setup_optimizers_in_pre_dispatch` from the `strategies` and achieve the same logic in `setup` and `pre_dispatch` methods ([#10906](https://github.com/Lightning-AI/lightning/pull/10906)) +- Removed methods `pre_dispatch`, `dispatch` and 
`post_dispatch` from the `Accelerator` ([#10885](https://github.com/Lightning-AI/lightning/pull/10885)) +- Removed method `training_step`, `test_step`, `validation_step` and `predict_step` from the `Accelerator` ([#10890](https://github.com/Lightning-AI/lightning/pull/10890)) +- Removed `TrainingTypePlugin.start_{training,evaluating,predicting}` hooks and the same in all subclasses ([#10989](https://github.com/Lightning-AI/lightning/pull/10989), [#10896](https://github.com/Lightning-AI/lightning/pull/10896)) +- Removed `Accelerator.on_train_start` ([#10999](https://github.com/Lightning-AI/lightning/pull/10999)) +- Removed support for Python 3.6 ([#11117](https://github.com/Lightning-AI/lightning/pull/11117)) +- Removed `Strategy.init_optimizers` in favor of `Strategy.setup_optimizers` ([#11236](https://github.com/Lightning-AI/lightning/pull/11236)) +- Removed `profile("training_step_and_backward")` in `Closure` class since we already profile calls `training_step` and `backward` ([#11222](https://github.com/Lightning-AI/lightning/pull/11222)) +- Removed `Strategy.optimizer_zero_grad` ([#11246](https://github.com/Lightning-AI/lightning/pull/11246)) +- Removed `Strategy.on_gpu` ([#11537](https://github.com/Lightning-AI/lightning/pull/11537)) +- Removed `Strategy.on_tpu` property ([#11536](https://github.com/Lightning-AI/lightning/pull/11536)) +- Removed the abstract property `LightningLoggerBase.experiment` ([#11603](https://github.com/Lightning-AI/lightning/pull/11603)) +- Removed `FitLoop.current_epoch` getter and setter ([#11562](https://github.com/Lightning-AI/lightning/pull/11562)) +- Removed access to `_short_id` in `NeptuneLogger` ([#11517](https://github.com/Lightning-AI/lightning/pull/11517)) +- Removed `log_text` and `log_image` from the `LightningLoggerBase` API ([#11857](https://github.com/Lightning-AI/lightning/pull/11857)) +- Removed calls to `profile("model_forward")` in favor of profiling `training_step` ([#12032](https://github.com/Lightning-AI/lightning/pull/12032)) +- Removed `get_mp_spawn_kwargs` from `DDPSpawnStrategy` and `TPUSpawnStrategy` in favor of configuration in the `_SpawnLauncher` ([#11966](https://github.com/Lightning-AI/lightning/pull/11966)) +- Removed `_aggregate_metrics`, `_reduce_agg_metrics`, and `_finalize_agg_metrics` from `LightningLoggerBase` ([#12053](https://github.com/Lightning-AI/lightning/pull/12053)) +- Removed the `AcceleratorConnector.device_type` property ([#12081](https://github.com/Lightning-AI/lightning/pull/12081)) +- Removed `AcceleratorConnector.num_nodes` ([#12107](https://github.com/Lightning-AI/lightning/pull/12107)) +- Removed `AcceleratorConnector.has_ipu` property ([#12111](https://github.com/Lightning-AI/lightning/pull/12111)) +- Removed `AcceleratorConnector.use_ipu` property ([#12110](https://github.com/Lightning-AI/lightning/pull/12110)) +- Removed `AcceleratorConnector.has_tpu` property ([#12109](https://github.com/Lightning-AI/lightning/pull/12109)) +- Removed `AcceleratorConnector.use_dp` property ([#12112](https://github.com/Lightning-AI/lightning/pull/12112)) +- Removed `configure_sync_batchnorm` from `ParallelStrategy` and all other strategies that inherit from it ([#11754](https://github.com/Lightning-AI/lightning/pull/11754)) +- Removed public attribute `sync_batchnorm` from strategies ([#11754](https://github.com/Lightning-AI/lightning/pull/11754)) +- Removed `AcceleratorConnector.root_gpu` property ([#12262](https://github.com/Lightning-AI/lightning/pull/12262)) +- Removed `AcceleratorConnector.tpu_id` property 
([#12387](https://github.com/Lightning-AI/lightning/pull/12387)) +- Removed `AcceleratorConnector.num_gpus` property ([#12384](https://github.com/Lightning-AI/lightning/pull/12384)) +- Removed `AcceleratorConnector.num_ipus` property ([#12386](https://github.com/Lightning-AI/lightning/pull/12386)) +- Removed `AcceleratorConnector.num_processes` property ([#12388](https://github.com/Lightning-AI/lightning/pull/12388)) +- Removed `AcceleratorConnector.parallel_device_ids` property ([#12072](https://github.com/Lightning-AI/lightning/pull/12072)) +- Removed `AcceleratorConnector.devices` property ([#12435](https://github.com/Lightning-AI/lightning/pull/12435)) +- Removed `AcceleratorConnector.parallel_devices` property ([#12075](https://github.com/Lightning-AI/lightning/pull/12075)) +- Removed `AcceleratorConnector.tpu_cores` property ([#12437](https://github.com/Lightning-AI/lightning/pull/12437)) + +### Fixed + +- Fixed an issue where `ModelCheckpoint` could delete last checkpoint from the old directory when `dirpath` has changed during resumed training ([#12225](https://github.com/Lightning-AI/lightning/pull/12225)) +- Fixed an issue where `ModelCheckpoint` could delete older checkpoints when `dirpath` has changed during resumed training ([#12045](https://github.com/Lightning-AI/lightning/pull/12045)) +- Fixed an issue where `HorovodStrategy.teardown()` did not complete gracefully if an exception was thrown during callback setup [#11752](https://github.com/Lightning-AI/lightning/pull/11752) +- Fixed security vulnerabilities CVE-2020-1747 and CVE-2020-14343 caused by the `PyYAML` dependency ([#11099](https://github.com/Lightning-AI/lightning/pull/11099)) +- Fixed security vulnerability "CWE-94: Improper Control of Generation of Code (Code Injection)" ([#12212](https://github.com/Lightning-AI/lightning/pull/12212)) +- Fixed logging on `{test,validation}_epoch_end` with multiple dataloaders ([#11132](https://github.com/Lightning-AI/lightning/pull/11132)) +- Reset the validation progress tracking state after sanity checking ([#11218](https://github.com/Lightning-AI/lightning/pull/11218)) +- Fixed double evaluation bug with fault-tolerance enabled where the second call was completely skipped ([#11119](https://github.com/Lightning-AI/lightning/pull/11119)) +- Fixed an issue with the `TPUSpawnPlugin` handling the `XLA_USE_BF16` environment variable incorrectly ([#10990](https://github.com/Lightning-AI/lightning/pull/10990)) +- Fixed wrong typehint for `Trainer.lightning_optimizers` ([#11155](https://github.com/Lightning-AI/lightning/pull/11155)) +- Fixed the lr-scheduler state not being dumped to checkpoint when using the deepspeed strategy ([#11307](https://github.com/Lightning-AI/lightning/pull/11307)) +- Fixed bug that forced overriding `configure_optimizers` with the CLI ([#11672](https://github.com/Lightning-AI/lightning/pull/11672)) +- Fixed type promotion when tensors of higher category than float are logged ([#11401](https://github.com/Lightning-AI/lightning/pull/11401)) +- Fixed `SimpleProfiler` summary ([#11414](https://github.com/Lightning-AI/lightning/pull/11414)) +- No longer set a `DistributedSampler` to the `poptorch.DataLoader` when IPUs are used ([#12114](https://github.com/Lightning-AI/lightning/pull/12114)) +- Fixed bug where progress bar was not being disabled when not in rank zero during predict ([#11377](https://github.com/Lightning-AI/lightning/pull/11377)) +- Fixed the mid-epoch warning call while resuming training 
+- Fixed `LightningModule.{un,}toggle_model` when only 1 optimizer is used ([#12088](https://github.com/Lightning-AI/lightning/pull/12088))
+- Fixed an issue in `RichProgressbar` to display the metrics logged only on main progress bar ([#11690](https://github.com/Lightning-AI/lightning/pull/11690))
+- Fixed `RichProgressBar` progress when refresh rate does not evenly divide the total counter ([#11668](https://github.com/Lightning-AI/lightning/pull/11668))
+- Fixed `RichProgressBar` progress validation bar total when using multiple validation runs within a single training epoch ([#11668](https://github.com/Lightning-AI/lightning/pull/11668))
+- Configure native Deepspeed schedulers with interval='step' ([#11788](https://github.com/Lightning-AI/lightning/pull/11788)), ([#12031](https://github.com/Lightning-AI/lightning/pull/12031))
+- Update `RichProgressBarTheme` styles after detecting light theme on colab ([#10993](https://github.com/Lightning-AI/lightning/pull/10993))
+- Fixed passing `_ddp_params_and_buffers_to_ignore` ([#11949](https://github.com/Lightning-AI/lightning/pull/11949))
+- Fixed an `AttributeError` when calling `save_hyperparameters` and no parameters need saving ([#11827](https://github.com/Lightning-AI/lightning/pull/11827))
+- Fixed environment variable priority for global rank determination ([#11406](https://github.com/Lightning-AI/lightning/pull/11406))
+- Fixed an issue that caused the Trainer to produce identical results on subsequent runs without explicit re-seeding ([#11870](https://github.com/Lightning-AI/lightning/pull/11870))
+- Fixed an issue that caused the Tuner to affect the random state ([#11870](https://github.com/Lightning-AI/lightning/pull/11870))
+- Fixed to avoid common hook warning if no hook is overridden ([#12131](https://github.com/Lightning-AI/lightning/pull/12131))
+- Fixed deepspeed keeping old sub-folders in same ckpt path ([#12194](https://github.com/Lightning-AI/lightning/pull/12194))
+- Fixed returning logged metrics instead of callback metrics during evaluation ([#12224](https://github.com/Lightning-AI/lightning/pull/12224))
+- Fixed the case where `logger=None` is passed to the Trainer ([#12249](https://github.com/Lightning-AI/lightning/pull/12249))
+- Fixed bug where the global step tracked by `ModelCheckpoint` was still set even if no checkpoint was saved ([#12418](https://github.com/Lightning-AI/lightning/pull/12418))
+- Fixed bug where `ModelCheckpoint` was overriding the `epoch` and `step` logged values ([#12418](https://github.com/Lightning-AI/lightning/pull/12418))
+- Fixed bug where monitoring the default `epoch` and `step` values with `ModelCheckpoint` would fail ([#12418](https://github.com/Lightning-AI/lightning/pull/12418))
+- Fixed initializing optimizers unnecessarily in `DDPFullyShardedStrategy` ([#12267](https://github.com/Lightning-AI/lightning/pull/12267))
+- Fixed check for horovod module ([#12377](https://github.com/Lightning-AI/lightning/pull/12377))
+- Fixed logging to loggers with multiple eval dataloaders ([#12454](https://github.com/Lightning-AI/lightning/pull/12454))
+- Fixed an issue with resuming from a checkpoint trained with QAT ([#11346](https://github.com/Lightning-AI/lightning/pull/11346))

## [1.5.10] - 2022-02-08

### Fixed

-- Fixed an issue to avoid validation loop run on restart ([#11552](https://github.com/PyTorchLightning/pytorch-lightning/pull/11552))
-- The `RichProgressBar` now correctly shows the `on_epoch` logged values on train epoch end ([#11689](https://github.com/PyTorchLightning/pytorch-lightning/pull/11689))
-- Fixed an issue to make the `step` argument in `WandbLogger.log_image` work ([#11716](https://github.com/PyTorchLightning/pytorch-lightning/pull/11716))
-- Fixed `restore_optimizers` for mapping states ([#11757](https://github.com/PyTorchLightning/pytorch-lightning/pull/11757))
-- With `DPStrategy`, the batch is not explicitly moved to the device ([#11780](https://github.com/PyTorchLightning/pytorch-lightning/pull/11780))
-- Fixed an issue to avoid val bar disappear after `trainer.validate()` ([#11700](https://github.com/PyTorchLightning/pytorch-lightning/pull/11700))
-- Fixed supporting remote filesystems with `Trainer.weights_save_path` for fault-tolerant training ([#11776](https://github.com/PyTorchLightning/pytorch-lightning/pull/11776))
-- Fixed check for available modules ([#11526](https://github.com/PyTorchLightning/pytorch-lightning/pull/11526))
-- Fixed bug where the path for "last" checkpoints was not getting saved correctly which caused newer runs to not remove the previous "last" checkpoint ([#11481](https://github.com/PyTorchLightning/pytorch-lightning/pull/11481))
-- Fixed bug where the path for best checkpoints was not getting saved correctly when no metric was monitored which caused newer runs to not use the best checkpoint ([#11481](https://github.com/PyTorchLightning/pytorch-lightning/pull/11481))
+- Fixed an issue to avoid validation loop run on restart ([#11552](https://github.com/Lightning-AI/lightning/pull/11552))
+- The `RichProgressBar` now correctly shows the `on_epoch` logged values on train epoch end ([#11689](https://github.com/Lightning-AI/lightning/pull/11689))
+- Fixed an issue to make the `step` argument in `WandbLogger.log_image` work ([#11716](https://github.com/Lightning-AI/lightning/pull/11716))
+- Fixed `restore_optimizers` for mapping states ([#11757](https://github.com/Lightning-AI/lightning/pull/11757))
+- With `DPStrategy`, the batch is not explicitly moved to the device ([#11780](https://github.com/Lightning-AI/lightning/pull/11780))
+- Fixed an issue to avoid val bar disappear after `trainer.validate()` ([#11700](https://github.com/Lightning-AI/lightning/pull/11700))
+- Fixed supporting remote filesystems with `Trainer.weights_save_path` for fault-tolerant training ([#11776](https://github.com/Lightning-AI/lightning/pull/11776))
+- Fixed check for available modules ([#11526](https://github.com/Lightning-AI/lightning/pull/11526))
+- Fixed bug where the path for "last" checkpoints was not getting saved correctly which caused newer runs to not remove the previous "last" checkpoint ([#11481](https://github.com/Lightning-AI/lightning/pull/11481))
+- Fixed bug where the path for best checkpoints was not getting saved correctly when no metric was monitored which caused newer runs to not use the best checkpoint ([#11481](https://github.com/Lightning-AI/lightning/pull/11481))

## [1.5.9] - 2022-01-20

### Fixed

-- Pinned sphinx-autodoc-typehints with 0` ([#10870](https://github.com/PyTorchLightning/pytorch-lightning/pull/10870))
-- Fixed an issue with item assignment on the logger on rank > 0 for those who support it ([#10917](https://github.com/PyTorchLightning/pytorch-lightning/pull/10917))
-- Fixed importing `torch_xla.debug` for `torch-xla<1.8` ([#10836](https://github.com/PyTorchLightning/pytorch-lightning/pull/10836))
-- Fixed an issue with `DDPSpawnPlugin` and related plugins leaving a temporary checkpoint behind
([#10934](https://github.com/PyTorchLightning/pytorch-lightning/pull/10934)) -- Fixed a `TypeError` occurring in the `SingalConnector.teardown()` method ([#10961](https://github.com/PyTorchLightning/pytorch-lightning/pull/10961)) +- Disabled batch_size extraction for torchmetric instances because they accumulate the metrics internally ([#10815](https://github.com/Lightning-AI/lightning/pull/10815)) +- Fixed an issue with `SignalConnector` not restoring the default signal handlers on teardown when running on SLURM or with fault-tolerant training enabled ([#10611](https://github.com/Lightning-AI/lightning/pull/10611)) +- Fixed `SignalConnector._has_already_handler` check for callable type ([#10483](https://github.com/Lightning-AI/lightning/pull/10483)) +- Fixed an issue to return the results for each dataloader separately instead of duplicating them for each ([#10810](https://github.com/Lightning-AI/lightning/pull/10810)) +- Improved exception message if `rich` version is less than `10.2.2` ([#10839](https://github.com/Lightning-AI/lightning/pull/10839)) +- Fixed uploading best model checkpoint in NeptuneLogger ([#10369](https://github.com/Lightning-AI/lightning/pull/10369)) +- Fixed early schedule reset logic in PyTorch profiler that was causing data leak ([#10837](https://github.com/Lightning-AI/lightning/pull/10837)) +- Fixed a bug that caused incorrect batch indices to be passed to the `BasePredictionWriter` hooks when using a dataloader with `num_workers > 0` ([#10870](https://github.com/Lightning-AI/lightning/pull/10870)) +- Fixed an issue with item assignment on the logger on rank > 0 for those who support it ([#10917](https://github.com/Lightning-AI/lightning/pull/10917)) +- Fixed importing `torch_xla.debug` for `torch-xla<1.8` ([#10836](https://github.com/Lightning-AI/lightning/pull/10836)) +- Fixed an issue with `DDPSpawnPlugin` and related plugins leaving a temporary checkpoint behind ([#10934](https://github.com/Lightning-AI/lightning/pull/10934)) +- Fixed a `TypeError` occurring in the `SingalConnector.teardown()` method ([#10961](https://github.com/Lightning-AI/lightning/pull/10961)) ## [1.5.4] - 2021-11-30 ### Fixed -- Fixed support for `--key.help=class` with the `LightningCLI` ([#10767](https://github.com/PyTorchLightning/pytorch-lightning/pull/10767)) -- Fixed `_compare_version` for python packages ([#10762](https://github.com/PyTorchLightning/pytorch-lightning/pull/10762)) -- Fixed TensorBoardLogger `SummaryWriter` not close before spawning the processes ([#10777](https://github.com/PyTorchLightning/pytorch-lightning/pull/10777)) -- Fixed a consolidation error in Lite when attempting to save the state dict of a sharded optimizer ([#10746](https://github.com/PyTorchLightning/pytorch-lightning/pull/10746)) -- Fixed the default logging level for batch hooks associated with training from `on_step=False, on_epoch=True` to `on_step=True, on_epoch=False` ([#10756](https://github.com/PyTorchLightning/pytorch-lightning/pull/10756)) +- Fixed support for `--key.help=class` with the `LightningCLI` ([#10767](https://github.com/Lightning-AI/lightning/pull/10767)) +- Fixed `_compare_version` for python packages ([#10762](https://github.com/Lightning-AI/lightning/pull/10762)) +- Fixed TensorBoardLogger `SummaryWriter` not close before spawning the processes ([#10777](https://github.com/Lightning-AI/lightning/pull/10777)) +- Fixed a consolidation error in Lite when attempting to save the state dict of a sharded optimizer ([#10746](https://github.com/Lightning-AI/lightning/pull/10746)) +- 
Fixed the default logging level for batch hooks associated with training from `on_step=False, on_epoch=True` to `on_step=True, on_epoch=False` ([#10756](https://github.com/Lightning-AI/lightning/pull/10756)) ### Removed -- Removed PyTorch 1.6 support ([#10367](https://github.com/PyTorchLightning/pytorch-lightning/pull/10367), [#10738](https://github.com/PyTorchLightning/pytorch-lightning/pull/10738)) +- Removed PyTorch 1.6 support ([#10367](https://github.com/Lightning-AI/lightning/pull/10367), [#10738](https://github.com/Lightning-AI/lightning/pull/10738)) ## [1.5.3] - 2021-11-24 ### Fixed -- Fixed `ShardedTensor` state dict hook registration to check if torch distributed is available ([#10621](https://github.com/PyTorchLightning/pytorch-lightning/pull/10621)) -- Fixed an issue with `self.log` not respecting a tensor's `dtype` when applying computations ([#10076](https://github.com/PyTorchLightning/pytorch-lightning/pull/10076)) -- Fixed LigtningLite `_wrap_init` popping unexisting keys from DataLoader signature parameters ([#10613](https://github.com/PyTorchLightning/pytorch-lightning/pull/10613)) -- Fixed signals being registered within threads ([#10610](https://github.com/PyTorchLightning/pytorch-lightning/pull/10610)) -- Fixed an issue that caused Lightning to extract the batch size even though it was set by the user in `LightningModule.log` ([#10408](https://github.com/PyTorchLightning/pytorch-lightning/pull/10408)) -- Fixed `Trainer(move_metrics_to_cpu=True)` not moving the evaluation logged results to CPU ([#10631](https://github.com/PyTorchLightning/pytorch-lightning/pull/10631)) -- Fixed the `{validation,test}_step` outputs getting moved to CPU with `Trainer(move_metrics_to_cpu=True)` ([#10631](https://github.com/PyTorchLightning/pytorch-lightning/pull/10631)) -- Fixed an issue with collecting logged test results with multiple dataloaders ([#10522](https://github.com/PyTorchLightning/pytorch-lightning/pull/10522)) +- Fixed `ShardedTensor` state dict hook registration to check if torch distributed is available ([#10621](https://github.com/Lightning-AI/lightning/pull/10621)) +- Fixed an issue with `self.log` not respecting a tensor's `dtype` when applying computations ([#10076](https://github.com/Lightning-AI/lightning/pull/10076)) +- Fixed LigtningLite `_wrap_init` popping unexisting keys from DataLoader signature parameters ([#10613](https://github.com/Lightning-AI/lightning/pull/10613)) +- Fixed signals being registered within threads ([#10610](https://github.com/Lightning-AI/lightning/pull/10610)) +- Fixed an issue that caused Lightning to extract the batch size even though it was set by the user in `LightningModule.log` ([#10408](https://github.com/Lightning-AI/lightning/pull/10408)) +- Fixed `Trainer(move_metrics_to_cpu=True)` not moving the evaluation logged results to CPU ([#10631](https://github.com/Lightning-AI/lightning/pull/10631)) +- Fixed the `{validation,test}_step` outputs getting moved to CPU with `Trainer(move_metrics_to_cpu=True)` ([#10631](https://github.com/Lightning-AI/lightning/pull/10631)) +- Fixed an issue with collecting logged test results with multiple dataloaders ([#10522](https://github.com/Lightning-AI/lightning/pull/10522)) ## [1.5.2] - 2021-11-16 ### Fixed -- Fixed `CombinedLoader` and `max_size_cycle` didn't receive a `DistributedSampler` ([#10374](https://github.com/PyTorchLightning/pytorch-lightning/pull/10374)) -- Fixed an issue where class or init-only variables of dataclasses were passed to the dataclass constructor in 
`utilities.apply_to_collection` ([#9702](https://github.com/PyTorchLightning/pytorch-lightning/pull/9702)) +- Fixed `CombinedLoader` and `max_size_cycle` didn't receive a `DistributedSampler` ([#10374](https://github.com/Lightning-AI/lightning/pull/10374)) +- Fixed an issue where class or init-only variables of dataclasses were passed to the dataclass constructor in `utilities.apply_to_collection` ([#9702](https://github.com/Lightning-AI/lightning/pull/9702)) - Fixed `isinstance` not working with `init_meta_context`, materialized model not being moved to the device ([#10493](https://github.com/PyTorchLightning/metrics/pull/10493)) -- Fixed an issue that prevented the Trainer to shutdown workers when execution is interrupted due to failure([#10463](https://github.com/PyTorchLightning/pytorch-lightning/pull/10463)) -- Squeeze the early stopping monitor to remove empty tensor dimensions ([#10461](https://github.com/PyTorchLightning/pytorch-lightning/pull/10461)) -- Fixed sampler replacement logic with `overfit_batches` to only replace the sample when `SequentialSampler` is not used ([#10486](https://github.com/PyTorchLightning/pytorch-lightning/pull/10486)) -- Fixed scripting causing false positive deprecation warnings ([#10470](https://github.com/PyTorchLightning/pytorch-lightning/pull/10470), [#10555](https://github.com/PyTorchLightning/pytorch-lightning/pull/10555)) -- Do not fail if batch size could not be inferred for logging when using DeepSpeed ([#10438](https://github.com/PyTorchLightning/pytorch-lightning/pull/10438)) -- Fixed propagation of device and dtype information to submodules of LightningLite when they inherit from `DeviceDtypeModuleMixin` ([#10559](https://github.com/PyTorchLightning/pytorch-lightning/pull/10559)) +- Fixed an issue that prevented the Trainer to shutdown workers when execution is interrupted due to failure([#10463](https://github.com/Lightning-AI/lightning/pull/10463)) +- Squeeze the early stopping monitor to remove empty tensor dimensions ([#10461](https://github.com/Lightning-AI/lightning/pull/10461)) +- Fixed sampler replacement logic with `overfit_batches` to only replace the sample when `SequentialSampler` is not used ([#10486](https://github.com/Lightning-AI/lightning/pull/10486)) +- Fixed scripting causing false positive deprecation warnings ([#10470](https://github.com/Lightning-AI/lightning/pull/10470), [#10555](https://github.com/Lightning-AI/lightning/pull/10555)) +- Do not fail if batch size could not be inferred for logging when using DeepSpeed ([#10438](https://github.com/Lightning-AI/lightning/pull/10438)) +- Fixed propagation of device and dtype information to submodules of LightningLite when they inherit from `DeviceDtypeModuleMixin` ([#10559](https://github.com/Lightning-AI/lightning/pull/10559)) ## [1.5.1] - 2021-11-09 ### Fixed -- Fixed `apply_to_collection(defaultdict)` ([#10316](https://github.com/PyTorchLightning/pytorch-lightning/pull/10316)) -- Fixed failure when `DataLoader(batch_size=None)` is passed ([#10345](https://github.com/PyTorchLightning/pytorch-lightning/pull/10345)) -- Fixed interception of `__init__` arguments for sub-classed DataLoader re-instantiation in Lite ([#10334](https://github.com/PyTorchLightning/pytorch-lightning/pull/10334)) -- Fixed issue with pickling `CSVLogger` after a call to `CSVLogger.save` ([#10388](https://github.com/PyTorchLightning/pytorch-lightning/pull/10388)) -- Fixed an import error being caused by `PostLocalSGD` when `torch.distributed` not available 
([#10359](https://github.com/PyTorchLightning/pytorch-lightning/pull/10359)) -- Fixed the logging with `on_step=True` in epoch-level hooks causing unintended side-effects. Logging with `on_step=True` in epoch-level hooks will now correctly raise an error ([#10409](https://github.com/PyTorchLightning/pytorch-lightning/pull/10409)) -- Fixed deadlocks for distributed training with `RichProgressBar` ([#10428](https://github.com/PyTorchLightning/pytorch-lightning/pull/10428)) -- Fixed an issue where the model wrapper in Lite converted non-floating point tensors to float ([#10429](https://github.com/PyTorchLightning/pytorch-lightning/pull/10429)) -- Fixed an issue with inferring the dataset type in fault-tolerant training ([#10432](https://github.com/PyTorchLightning/pytorch-lightning/pull/10432)) -- Fixed dataloader workers with `persistent_workers` being deleted on every iteration ([#10434](https://github.com/PyTorchLightning/pytorch-lightning/pull/10434)) +- Fixed `apply_to_collection(defaultdict)` ([#10316](https://github.com/Lightning-AI/lightning/pull/10316)) +- Fixed failure when `DataLoader(batch_size=None)` is passed ([#10345](https://github.com/Lightning-AI/lightning/pull/10345)) +- Fixed interception of `__init__` arguments for sub-classed DataLoader re-instantiation in Lite ([#10334](https://github.com/Lightning-AI/lightning/pull/10334)) +- Fixed issue with pickling `CSVLogger` after a call to `CSVLogger.save` ([#10388](https://github.com/Lightning-AI/lightning/pull/10388)) +- Fixed an import error being caused by `PostLocalSGD` when `torch.distributed` not available ([#10359](https://github.com/Lightning-AI/lightning/pull/10359)) +- Fixed the logging with `on_step=True` in epoch-level hooks causing unintended side-effects. Logging with `on_step=True` in epoch-level hooks will now correctly raise an error ([#10409](https://github.com/Lightning-AI/lightning/pull/10409)) +- Fixed deadlocks for distributed training with `RichProgressBar` ([#10428](https://github.com/Lightning-AI/lightning/pull/10428)) +- Fixed an issue where the model wrapper in Lite converted non-floating point tensors to float ([#10429](https://github.com/Lightning-AI/lightning/pull/10429)) +- Fixed an issue with inferring the dataset type in fault-tolerant training ([#10432](https://github.com/Lightning-AI/lightning/pull/10432)) +- Fixed dataloader workers with `persistent_workers` being deleted on every iteration ([#10434](https://github.com/Lightning-AI/lightning/pull/10434)) ## [1.5.0] - 2021-11-02 ### Added -- Added support for monitoring the learning rate without schedulers in `LearningRateMonitor` ([#9786](https://github.com/PyTorchLightning/pytorch-lightning/pull/9786)) -- Added registration of `ShardedTensor` state dict hooks in `LightningModule.__init__` if the PyTorch version supports `ShardedTensor` ([#8944](https://github.com/PyTorchLightning/pytorch-lightning/pull/8944)) -- Added error handling including calling of `on_keyboard_interrupt()` and `on_exception()` for all entrypoints (fit, validate, test, predict) ([#8819](https://github.com/PyTorchLightning/pytorch-lightning/pull/8819)) -- Added a flavor of `training_step` that takes `dataloader_iter` as an argument ([#8807](https://github.com/PyTorchLightning/pytorch-lightning/pull/8807)) -- Added a `state_key` property to the `Callback` base class ([#6886](https://github.com/PyTorchLightning/pytorch-lightning/pull/6886)) +- Added support for monitoring the learning rate without schedulers in `LearningRateMonitor` 
([#9786](https://github.com/Lightning-AI/lightning/pull/9786)) +- Added registration of `ShardedTensor` state dict hooks in `LightningModule.__init__` if the PyTorch version supports `ShardedTensor` ([#8944](https://github.com/Lightning-AI/lightning/pull/8944)) +- Added error handling including calling of `on_keyboard_interrupt()` and `on_exception()` for all entrypoints (fit, validate, test, predict) ([#8819](https://github.com/Lightning-AI/lightning/pull/8819)) +- Added a flavor of `training_step` that takes `dataloader_iter` as an argument ([#8807](https://github.com/Lightning-AI/lightning/pull/8807)) +- Added a `state_key` property to the `Callback` base class ([#6886](https://github.com/Lightning-AI/lightning/pull/6886)) - Added progress tracking to loops: - * Integrated `TrainingEpochLoop.total_batch_idx` ([#8598](https://github.com/PyTorchLightning/pytorch-lightning/pull/8598)) - * Added `BatchProgress` and integrated `TrainingEpochLoop.is_last_batch` ([#9657](https://github.com/PyTorchLightning/pytorch-lightning/pull/9657)) - * Avoid optional `Tracker` attributes ([#9320](https://github.com/PyTorchLightning/pytorch-lightning/pull/9320)) - * Reset `current` progress counters when restarting an epoch loop that had already finished ([#9371](https://github.com/PyTorchLightning/pytorch-lightning/pull/9371)) - * Call `reset_on_restart` in the loop's `reset` hook instead of when loading a checkpoint ([#9561](https://github.com/PyTorchLightning/pytorch-lightning/pull/9561)) - * Use `completed` over `processed` in `reset_on_restart` ([#9656](https://github.com/PyTorchLightning/pytorch-lightning/pull/9656)) - * Renamed `reset_on_epoch` to `reset_on_run` ([#9658](https://github.com/PyTorchLightning/pytorch-lightning/pull/9658)) -- Added `batch_size` and `rank_zero_only` arguments for `log_dict` to match `log` ([#8628](https://github.com/PyTorchLightning/pytorch-lightning/pull/8628)) -- Added a check for unique GPU ids ([#8666](https://github.com/PyTorchLightning/pytorch-lightning/pull/8666)) -- Added `ResultCollection` state_dict to the Loop `state_dict` and added support for distributed reload ([#8641](https://github.com/PyTorchLightning/pytorch-lightning/pull/8641)) -- Added DeepSpeed collate checkpoint utility function ([#8701](https://github.com/PyTorchLightning/pytorch-lightning/pull/8701)) -- Added a `handles_accumulate_grad_batches` property to the training type plugins ([#8856](https://github.com/PyTorchLightning/pytorch-lightning/pull/8856)) -- Added a warning to `WandbLogger` when reusing a wandb run ([#8714](https://github.com/PyTorchLightning/pytorch-lightning/pull/8714)) -- Added `log_graph` argument for `watch` method of `WandbLogger` ([#8662](https://github.com/PyTorchLightning/pytorch-lightning/pull/8662)) + * Integrated `TrainingEpochLoop.total_batch_idx` ([#8598](https://github.com/Lightning-AI/lightning/pull/8598)) + * Added `BatchProgress` and integrated `TrainingEpochLoop.is_last_batch` ([#9657](https://github.com/Lightning-AI/lightning/pull/9657)) + * Avoid optional `Tracker` attributes ([#9320](https://github.com/Lightning-AI/lightning/pull/9320)) + * Reset `current` progress counters when restarting an epoch loop that had already finished ([#9371](https://github.com/Lightning-AI/lightning/pull/9371)) + * Call `reset_on_restart` in the loop's `reset` hook instead of when loading a checkpoint ([#9561](https://github.com/Lightning-AI/lightning/pull/9561)) + * Use `completed` over `processed` in `reset_on_restart` 
([#9656](https://github.com/Lightning-AI/lightning/pull/9656)) + * Renamed `reset_on_epoch` to `reset_on_run` ([#9658](https://github.com/Lightning-AI/lightning/pull/9658)) +- Added `batch_size` and `rank_zero_only` arguments for `log_dict` to match `log` ([#8628](https://github.com/Lightning-AI/lightning/pull/8628)) +- Added a check for unique GPU ids ([#8666](https://github.com/Lightning-AI/lightning/pull/8666)) +- Added `ResultCollection` state_dict to the Loop `state_dict` and added support for distributed reload ([#8641](https://github.com/Lightning-AI/lightning/pull/8641)) +- Added DeepSpeed collate checkpoint utility function ([#8701](https://github.com/Lightning-AI/lightning/pull/8701)) +- Added a `handles_accumulate_grad_batches` property to the training type plugins ([#8856](https://github.com/Lightning-AI/lightning/pull/8856)) +- Added a warning to `WandbLogger` when reusing a wandb run ([#8714](https://github.com/Lightning-AI/lightning/pull/8714)) +- Added `log_graph` argument for `watch` method of `WandbLogger` ([#8662](https://github.com/Lightning-AI/lightning/pull/8662)) - `LightningCLI` additions: - * Added `LightningCLI(run=False|True)` to choose whether to run a `Trainer` subcommand ([#8751](https://github.com/PyTorchLightning/pytorch-lightning/pull/8751)) - * Added support to call any trainer function from the `LightningCLI` via subcommands ([#7508](https://github.com/PyTorchLightning/pytorch-lightning/pull/7508)) - * Allow easy trainer re-instantiation ([#7508](https://github.com/PyTorchLightning/pytorch-lightning/pull/9241)) - * Automatically register all optimizers and learning rate schedulers ([#9565](https://github.com/PyTorchLightning/pytorch-lightning/pull/9565)) - * Allow registering custom optimizers and learning rate schedulers without subclassing the CLI ([#9565](https://github.com/PyTorchLightning/pytorch-lightning/pull/9565)) - * Support shorthand notation to instantiate optimizers and learning rate schedulers ([#9565](https://github.com/PyTorchLightning/pytorch-lightning/pull/9565)) - * Support passing lists of callbacks via command line ([#8815](https://github.com/PyTorchLightning/pytorch-lightning/pull/8815)) - * Support shorthand notation to instantiate models ([#9588](https://github.com/PyTorchLightning/pytorch-lightning/pull/9588)) - * Support shorthand notation to instantiate datamodules ([#10011](https://github.com/PyTorchLightning/pytorch-lightning/pull/10011)) - * Added `multifile` option to `LightningCLI` to enable/disable config saving to preserve multiple files structure ([#9073](https://github.com/PyTorchLightning/pytorch-lightning/pull/9073)) + * Added `LightningCLI(run=False|True)` to choose whether to run a `Trainer` subcommand ([#8751](https://github.com/Lightning-AI/lightning/pull/8751)) + * Added support to call any trainer function from the `LightningCLI` via subcommands ([#7508](https://github.com/Lightning-AI/lightning/pull/7508)) + * Allow easy trainer re-instantiation ([#7508](https://github.com/Lightning-AI/lightning/pull/9241)) + * Automatically register all optimizers and learning rate schedulers ([#9565](https://github.com/Lightning-AI/lightning/pull/9565)) + * Allow registering custom optimizers and learning rate schedulers without subclassing the CLI ([#9565](https://github.com/Lightning-AI/lightning/pull/9565)) + * Support shorthand notation to instantiate optimizers and learning rate schedulers ([#9565](https://github.com/Lightning-AI/lightning/pull/9565)) + * Support passing lists of callbacks via command line 
([#8815](https://github.com/Lightning-AI/lightning/pull/8815)) + * Support shorthand notation to instantiate models ([#9588](https://github.com/Lightning-AI/lightning/pull/9588)) + * Support shorthand notation to instantiate datamodules ([#10011](https://github.com/Lightning-AI/lightning/pull/10011)) + * Added `multifile` option to `LightningCLI` to enable/disable config saving to preserve multiple files structure ([#9073](https://github.com/Lightning-AI/lightning/pull/9073)) - Fault-tolerant training: - * Added `FastForwardSampler` and `CaptureIterableDataset` injection to data loading utilities ([#8366](https://github.com/PyTorchLightning/pytorch-lightning/pull/8366)) - * Added `DataFetcher` to control fetching flow ([#8890](https://github.com/PyTorchLightning/pytorch-lightning/pull/8890)) - * Added `SharedCycleIteratorState` to prevent infinite loop ([#8889](https://github.com/PyTorchLightning/pytorch-lightning/pull/8889)) - * Added `CaptureMapDataset` for state management in map-style datasets ([#8891](https://github.com/PyTorchLightning/pytorch-lightning/pull/8891)) - * Added Fault Tolerant Training to `DataFetcher` ([#8891](https://github.com/PyTorchLightning/pytorch-lightning/pull/8891)) - * Replaced old prefetch iterator with new `DataFetcher` in training loop ([#8953](https://github.com/PyTorchLightning/pytorch-lightning/pull/8953)) - * Added partial support for global random state fault-tolerance in map-style datasets ([#8950](https://github.com/PyTorchLightning/pytorch-lightning/pull/8950)) - * Converted state to tuple explicitly when setting Python random state ([#9401](https://github.com/PyTorchLightning/pytorch-lightning/pull/9401)) - * Added support for restarting an optimizer loop (multiple optimizers) ([#9537](https://github.com/PyTorchLightning/pytorch-lightning/pull/9537)) - * Added support for restarting within Evaluation Loop ([#9563](https://github.com/PyTorchLightning/pytorch-lightning/pull/9563)) - * Added mechanism to detect that a signal has been sent so the Trainer can gracefully exit ([#9566](https://github.com/PyTorchLightning/pytorch-lightning/pull/9566)) - * Added support for skipping ahead to validation during the auto-restart of fitting ([#9681](https://github.com/PyTorchLightning/pytorch-lightning/pull/9681)) - * Added support for auto-restart if a fault-tolerant checkpoint is available ([#9722](https://github.com/PyTorchLightning/pytorch-lightning/pull/9722)) + * Added `FastForwardSampler` and `CaptureIterableDataset` injection to data loading utilities ([#8366](https://github.com/Lightning-AI/lightning/pull/8366)) + * Added `DataFetcher` to control fetching flow ([#8890](https://github.com/Lightning-AI/lightning/pull/8890)) + * Added `SharedCycleIteratorState` to prevent infinite loop ([#8889](https://github.com/Lightning-AI/lightning/pull/8889)) + * Added `CaptureMapDataset` for state management in map-style datasets ([#8891](https://github.com/Lightning-AI/lightning/pull/8891)) + * Added Fault Tolerant Training to `DataFetcher` ([#8891](https://github.com/Lightning-AI/lightning/pull/8891)) + * Replaced old prefetch iterator with new `DataFetcher` in training loop ([#8953](https://github.com/Lightning-AI/lightning/pull/8953)) + * Added partial support for global random state fault-tolerance in map-style datasets ([#8950](https://github.com/Lightning-AI/lightning/pull/8950)) + * Converted state to tuple explicitly when setting Python random state ([#9401](https://github.com/Lightning-AI/lightning/pull/9401)) + * Added support for restarting an optimizer 
loop (multiple optimizers) ([#9537](https://github.com/Lightning-AI/lightning/pull/9537)) + * Added support for restarting within Evaluation Loop ([#9563](https://github.com/Lightning-AI/lightning/pull/9563)) + * Added mechanism to detect that a signal has been sent so the Trainer can gracefully exit ([#9566](https://github.com/Lightning-AI/lightning/pull/9566)) + * Added support for skipping ahead to validation during the auto-restart of fitting ([#9681](https://github.com/Lightning-AI/lightning/pull/9681)) + * Added support for auto-restart if a fault-tolerant checkpoint is available ([#9722](https://github.com/Lightning-AI/lightning/pull/9722)) - Checkpoint saving and loading extensibility: - * Added `CheckpointIO` plugin to expose checkpoint IO from training type plugin ([#8743](https://github.com/PyTorchLightning/pytorch-lightning/pull/8743)) - * Refactored `CheckpointConnector` to offload validation logic to the `CheckpointIO` plugin ([#9045](https://github.com/PyTorchLightning/pytorch-lightning/pull/9045)) - * Added `remove_checkpoint` to `CheckpointIO` plugin by moving the responsibility out of the `ModelCheckpoint` callback ([#9373](https://github.com/PyTorchLightning/pytorch-lightning/pull/9373)) - * Added `XLACheckpointIO` plugin ([#9972](https://github.com/PyTorchLightning/pytorch-lightning/pull/9972)) + * Added `CheckpointIO` plugin to expose checkpoint IO from training type plugin ([#8743](https://github.com/Lightning-AI/lightning/pull/8743)) + * Refactored `CheckpointConnector` to offload validation logic to the `CheckpointIO` plugin ([#9045](https://github.com/Lightning-AI/lightning/pull/9045)) + * Added `remove_checkpoint` to `CheckpointIO` plugin by moving the responsibility out of the `ModelCheckpoint` callback ([#9373](https://github.com/Lightning-AI/lightning/pull/9373)) + * Added `XLACheckpointIO` plugin ([#9972](https://github.com/Lightning-AI/lightning/pull/9972)) - Loop customization: - * Added `Closure` and `AbstractClosure` classes ([#8642](https://github.com/PyTorchLightning/pytorch-lightning/pull/8642)) - * Refactored `TrainingBatchLoop` and extracted `OptimizerLoop`, splitting off automatic optimization into its own loop ([#9191](https://github.com/PyTorchLightning/pytorch-lightning/pull/9191)) - * Removed `TrainingBatchLoop.backward()`; manual optimization now calls directly into `Accelerator.backward()` and automatic optimization handles backward in new `OptimizerLoop` ([#9265](https://github.com/PyTorchLightning/pytorch-lightning/pull/9265)) - * Extracted `ManualOptimization` logic from `TrainingBatchLoop` into its own separate loop class ([#9266](https://github.com/PyTorchLightning/pytorch-lightning/pull/9266)) - * Added `OutputResult` and `ManualResult` classes ([#9437](https://github.com/PyTorchLightning/pytorch-lightning/pull/9437), [#9424](https://github.com/PyTorchLightning/pytorch-lightning/pull/9424)) - * Marked `OptimizerLoop.backward` as protected ([#9514](https://github.com/PyTorchLightning/pytorch-lightning/pull/9514)) - * Marked `FitLoop.should_accumulate` as protected ([#9515](https://github.com/PyTorchLightning/pytorch-lightning/pull/9515)) - * Marked several methods in `PredictionLoop` as protected: `on_predict_start`, `on_predict_epoch_end`, `on_predict_end`, `on_predict_model_eval` ([#9516](https://github.com/PyTorchLightning/pytorch-lightning/pull/9516)) - * Marked several methods in `EvaluationLoop` as protected: `get_max_batches`, `on_evaluation_model_eval`, `on_evaluation_model_train`, `on_evaluation_start`, 
`on_evaluation_epoch_start`, `on_evaluation_epoch_end`, `on_evaluation_end`, `reload_evaluation_dataloaders` ([#9516](https://github.com/PyTorchLightning/pytorch-lightning/pull/9516)) - * Marked several methods in `EvaluationEpochLoop` as protected: `on_evaluation_batch_start`, `evaluation_step`, `evaluation_step_end` ([#9516](https://github.com/PyTorchLightning/pytorch-lightning/pull/9516)) - * Added `yielding_training_step` example ([#9983](https://github.com/PyTorchLightning/pytorch-lightning/pull/9983)) -- Added support for saving and loading state of multiple callbacks of the same type ([#7187](https://github.com/PyTorchLightning/pytorch-lightning/pull/7187)) -- Added DeepSpeed Stage 1 support ([#8974](https://github.com/PyTorchLightning/pytorch-lightning/pull/8974)) -- Added `Python dataclass` support for `LightningDataModule` ([#8272](https://github.com/PyTorchLightning/pytorch-lightning/pull/8272)) -- Added sanitization of tensors when they get logged as hyperparameters in `TensorBoardLogger` ([#9031](https://github.com/PyTorchLightning/pytorch-lightning/pull/9031)) -- Added `InterBatchParallelDataFetcher` ([#9020](https://github.com/PyTorchLightning/pytorch-lightning/pull/9020)) -- Added `DataLoaderIterDataFetcher` ([#9020](https://github.com/PyTorchLightning/pytorch-lightning/pull/9020)) -- Added `DataFetcher` within `Fit / Evaluation` Loop ([#9047](https://github.com/PyTorchLightning/pytorch-lightning/pull/9047)) -- Added a friendly error message when DDP attempts to spawn new distributed processes with rank > 0 ([#9005](https://github.com/PyTorchLightning/pytorch-lightning/pull/9005)) + * Added `Closure` and `AbstractClosure` classes ([#8642](https://github.com/Lightning-AI/lightning/pull/8642)) + * Refactored `TrainingBatchLoop` and extracted `OptimizerLoop`, splitting off automatic optimization into its own loop ([#9191](https://github.com/Lightning-AI/lightning/pull/9191)) + * Removed `TrainingBatchLoop.backward()`; manual optimization now calls directly into `Accelerator.backward()` and automatic optimization handles backward in new `OptimizerLoop` ([#9265](https://github.com/Lightning-AI/lightning/pull/9265)) + * Extracted `ManualOptimization` logic from `TrainingBatchLoop` into its own separate loop class ([#9266](https://github.com/Lightning-AI/lightning/pull/9266)) + * Added `OutputResult` and `ManualResult` classes ([#9437](https://github.com/Lightning-AI/lightning/pull/9437), [#9424](https://github.com/Lightning-AI/lightning/pull/9424)) + * Marked `OptimizerLoop.backward` as protected ([#9514](https://github.com/Lightning-AI/lightning/pull/9514)) + * Marked `FitLoop.should_accumulate` as protected ([#9515](https://github.com/Lightning-AI/lightning/pull/9515)) + * Marked several methods in `PredictionLoop` as protected: `on_predict_start`, `on_predict_epoch_end`, `on_predict_end`, `on_predict_model_eval` ([#9516](https://github.com/Lightning-AI/lightning/pull/9516)) + * Marked several methods in `EvaluationLoop` as protected: `get_max_batches`, `on_evaluation_model_eval`, `on_evaluation_model_train`, `on_evaluation_start`, `on_evaluation_epoch_start`, `on_evaluation_epoch_end`, `on_evaluation_end`, `reload_evaluation_dataloaders` ([#9516](https://github.com/Lightning-AI/lightning/pull/9516)) + * Marked several methods in `EvaluationEpochLoop` as protected: `on_evaluation_batch_start`, `evaluation_step`, `evaluation_step_end` ([#9516](https://github.com/Lightning-AI/lightning/pull/9516)) + * Added `yielding_training_step` example 
([#9983](https://github.com/Lightning-AI/lightning/pull/9983)) +- Added support for saving and loading state of multiple callbacks of the same type ([#7187](https://github.com/Lightning-AI/lightning/pull/7187)) +- Added DeepSpeed Stage 1 support ([#8974](https://github.com/Lightning-AI/lightning/pull/8974)) +- Added `Python dataclass` support for `LightningDataModule` ([#8272](https://github.com/Lightning-AI/lightning/pull/8272)) +- Added sanitization of tensors when they get logged as hyperparameters in `TensorBoardLogger` ([#9031](https://github.com/Lightning-AI/lightning/pull/9031)) +- Added `InterBatchParallelDataFetcher` ([#9020](https://github.com/Lightning-AI/lightning/pull/9020)) +- Added `DataLoaderIterDataFetcher` ([#9020](https://github.com/Lightning-AI/lightning/pull/9020)) +- Added `DataFetcher` within `Fit / Evaluation` Loop ([#9047](https://github.com/Lightning-AI/lightning/pull/9047)) +- Added a friendly error message when DDP attempts to spawn new distributed processes with rank > 0 ([#9005](https://github.com/Lightning-AI/lightning/pull/9005)) - Added Rich integration: - * Added Rich progress bar ([#8929](https://github.com/PyTorchLightning/pytorch-lightning/pull/8929), [#9559](https://github.com/PyTorchLightning/pytorch-lightning/pull/9559)) - * Added Support for iterable datasets ([#9734](https://github.com/PyTorchLightning/pytorch-lightning/pull/9734)) - * Added `RichModelSummary` callback ([#9546](https://github.com/PyTorchLightning/pytorch-lightning/pull/9546)) - * Added `configure_columns` method to `RichProgressBar` ([#10288](https://github.com/PyTorchLightning/pytorch-lightning/pull/10288)) - * Added `leave` argument to `RichProgressBar` ([#10301](https://github.com/PyTorchLightning/pytorch-lightning/pull/10301)) -- Added input validation logic for precision ([#9080](https://github.com/PyTorchLightning/pytorch-lightning/pull/9080)) -- Added support for CPU AMP autocast ([#9084](https://github.com/PyTorchLightning/pytorch-lightning/pull/9084)) -- Added `on_exception` callback hook ([#9183](https://github.com/PyTorchLightning/pytorch-lightning/pull/9183)) -- Added a warning to DeepSpeed when inferring batch size ([#9221](https://github.com/PyTorchLightning/pytorch-lightning/pull/9221)) -- Added `ModelSummary` callback ([#9344](https://github.com/PyTorchLightning/pytorch-lightning/pull/9344)) -- Added `log_images`, `log_text` and `log_table` to `WandbLogger` ([#9545](https://github.com/PyTorchLightning/pytorch-lightning/pull/9545)) -- Added `PL_RECONCILE_PROCESS` environment variable to enable process reconciliation regardless of cluster environment settings ([#9389](https://github.com/PyTorchLightning/pytorch-lightning/pull/9389)) -- Added `get_device_stats` to the Accelerator interface and added its implementation for GPU and TPU ([#9586](https://github.com/PyTorchLightning/pytorch-lightning/pull/9586)) -- Added a warning when an unknown key is encountered in the optimizer configuration, and when `OneCycleLR` is used with `"interval": "epoch"` ([#9666](https://github.com/PyTorchLightning/pytorch-lightning/pull/9666)) -- Added `DeviceStatsMonitor` callback ([#9712](https://github.com/PyTorchLightning/pytorch-lightning/pull/9712)) -- Added `enable_progress_bar` to the Trainer constructor ([#9664](https://github.com/PyTorchLightning/pytorch-lightning/pull/9664)) -- Added `pl_legacy_patch` load utility for loading old checkpoints that have pickled legacy Lightning attributes ([#9166](https://github.com/PyTorchLightning/pytorch-lightning/pull/9166)) -- Added support for 
`torch.use_deterministic_algorithms` ([#9121](https://github.com/PyTorchLightning/pytorch-lightning/pull/9121)) -- Added automatic parameters tying for TPUs ([#9525](https://github.com/PyTorchLightning/pytorch-lightning/pull/9525)) -- Added support for `torch.autograd.set_detect_anomaly` through `Trainer` constructor argument `detect_anomaly` ([#9848](https://github.com/PyTorchLightning/pytorch-lightning/pull/9848)) -- Added `enable_model_summary` flag to Trainer ([#9699](https://github.com/PyTorchLightning/pytorch-lightning/pull/9699)) -- Added `strategy` argument to Trainer ([#8597](https://github.com/PyTorchLightning/pytorch-lightning/pull/8597)) -- Added `init_meta_context`, `materialize_module` utilities ([#9920](https://github.com/PyTorchLightning/pytorch-lightning/pull/9920)) -- Added `TPUPrecisionPlugin` ([#10020](https://github.com/PyTorchLightning/pytorch-lightning/pull/#10020)) + * Added Rich progress bar ([#8929](https://github.com/Lightning-AI/lightning/pull/8929), [#9559](https://github.com/Lightning-AI/lightning/pull/9559)) + * Added Support for iterable datasets ([#9734](https://github.com/Lightning-AI/lightning/pull/9734)) + * Added `RichModelSummary` callback ([#9546](https://github.com/Lightning-AI/lightning/pull/9546)) + * Added `configure_columns` method to `RichProgressBar` ([#10288](https://github.com/Lightning-AI/lightning/pull/10288)) + * Added `leave` argument to `RichProgressBar` ([#10301](https://github.com/Lightning-AI/lightning/pull/10301)) +- Added input validation logic for precision ([#9080](https://github.com/Lightning-AI/lightning/pull/9080)) +- Added support for CPU AMP autocast ([#9084](https://github.com/Lightning-AI/lightning/pull/9084)) +- Added `on_exception` callback hook ([#9183](https://github.com/Lightning-AI/lightning/pull/9183)) +- Added a warning to DeepSpeed when inferring batch size ([#9221](https://github.com/Lightning-AI/lightning/pull/9221)) +- Added `ModelSummary` callback ([#9344](https://github.com/Lightning-AI/lightning/pull/9344)) +- Added `log_images`, `log_text` and `log_table` to `WandbLogger` ([#9545](https://github.com/Lightning-AI/lightning/pull/9545)) +- Added `PL_RECONCILE_PROCESS` environment variable to enable process reconciliation regardless of cluster environment settings ([#9389](https://github.com/Lightning-AI/lightning/pull/9389)) +- Added `get_device_stats` to the Accelerator interface and added its implementation for GPU and TPU ([#9586](https://github.com/Lightning-AI/lightning/pull/9586)) +- Added a warning when an unknown key is encountered in the optimizer configuration, and when `OneCycleLR` is used with `"interval": "epoch"` ([#9666](https://github.com/Lightning-AI/lightning/pull/9666)) +- Added `DeviceStatsMonitor` callback ([#9712](https://github.com/Lightning-AI/lightning/pull/9712)) +- Added `enable_progress_bar` to the Trainer constructor ([#9664](https://github.com/Lightning-AI/lightning/pull/9664)) +- Added `pl_legacy_patch` load utility for loading old checkpoints that have pickled legacy Lightning attributes ([#9166](https://github.com/Lightning-AI/lightning/pull/9166)) +- Added support for `torch.use_deterministic_algorithms` ([#9121](https://github.com/Lightning-AI/lightning/pull/9121)) +- Added automatic parameters tying for TPUs ([#9525](https://github.com/Lightning-AI/lightning/pull/9525)) +- Added support for `torch.autograd.set_detect_anomaly` through `Trainer` constructor argument `detect_anomaly` ([#9848](https://github.com/Lightning-AI/lightning/pull/9848)) +- Added `enable_model_summary` 
flag to Trainer ([#9699](https://github.com/Lightning-AI/lightning/pull/9699)) +- Added `strategy` argument to Trainer ([#8597](https://github.com/Lightning-AI/lightning/pull/8597)) +- Added `init_meta_context`, `materialize_module` utilities ([#9920](https://github.com/Lightning-AI/lightning/pull/9920)) +- Added `TPUPrecisionPlugin` ([#10020](https://github.com/Lightning-AI/lightning/pull/#10020)) - Added `torch.bfloat16` support: - * Added bfloat16 support for Lightning Trainer ([#9049](https://github.com/PyTorchLightning/pytorch-lightning/pull/9049)) - * Renamed `TPUHalfPrecisionPlugin` to `TPUBf16PrecisionPlugin` ([#10026](https://github.com/PyTorchLightning/pytorch-lightning/pull/10026)) - * Default to `precision=bf16` on CPU when `precision=16` is passed ([#10033](https://github.com/PyTorchLightning/pytorch-lightning/pull/10033)) - * Added support for `torch.autocast` ([#10053](https://github.com/PyTorchLightning/pytorch-lightning/pull/10053)) -- Added `kfold` example for loop customization ([#9965](https://github.com/PyTorchLightning/pytorch-lightning/pull/9965)) + * Added bfloat16 support for Lightning Trainer ([#9049](https://github.com/Lightning-AI/lightning/pull/9049)) + * Renamed `TPUHalfPrecisionPlugin` to `TPUBf16PrecisionPlugin` ([#10026](https://github.com/Lightning-AI/lightning/pull/10026)) + * Default to `precision=bf16` on CPU when `precision=16` is passed ([#10033](https://github.com/Lightning-AI/lightning/pull/10033)) + * Added support for `torch.autocast` ([#10053](https://github.com/Lightning-AI/lightning/pull/10053)) +- Added `kfold` example for loop customization ([#9965](https://github.com/Lightning-AI/lightning/pull/9965)) - LightningLite: - * Added `PrecisionPlugin.forward_context`, making it the default implementation for all `{train,val,test,predict}_step_context()` methods ([#9988](https://github.com/PyTorchLightning/pytorch-lightning/pull/9988)) - * Added `DDPSpawnPlugin.spawn()` for spawning new processes of a given function ([#10018](https://github.com/PyTorchLightning/pytorch-lightning/pull/10018), [#10022](https://github.com/PyTorchLightning/pytorch-lightning/pull/10022)) - * Added `TrainingTypePlugin.{_setup_model, _setup_optimizer}` methods ([#9994](https://github.com/PyTorchLightning/pytorch-lightning/pull/9994), [#10064](https://github.com/PyTorchLightning/pytorch-lightning/pull/10064)) - * Implemented `DataParallelPlugin._setup_model` ([#10010](https://github.com/PyTorchLightning/pytorch-lightning/pull/10010)) - * Implemented `DeepSpeedPlugin._setup_model_and_optimizers` ([#10009](https://github.com/PyTorchLightning/pytorch-lightning/pull/10009), [#10064](https://github.com/PyTorchLightning/pytorch-lightning/pull/10064)) - * Implemented `{DDPShardedPlugin,DDPShardedSpawnPlugin}._setup_model_and_optimizers` ([#10028](https://github.com/PyTorchLightning/pytorch-lightning/pull/10028), [#10064](https://github.com/PyTorchLightning/pytorch-lightning/pull/10064)) - * Added optional `model` argument to the `optimizer_step` methods in accelerators and plugins ([#10023](https://github.com/PyTorchLightning/pytorch-lightning/pull/10023)) - * Updated precision attributes in `DeepSpeedPlugin` ([#10164](https://github.com/PyTorchLightning/pytorch-lightning/pull/10164)) - * Added the ability to return a result from rank 0 in `DDPSpawnPlugin.spawn` ([#10162](https://github.com/PyTorchLightning/pytorch-lightning/pull/10162)) - * Added `pytorch_lightning.lite` package ([#10175](https://github.com/PyTorchLightning/pytorch-lightning/pull/10175)) - * Added 
`LightningLite` documentation ([#10043](https://github.com/PyTorchLightning/pytorch-lightning/pull/10043)) - * Added `LightningLite` examples ([#9987](https://github.com/PyTorchLightning/pytorch-lightning/pull/9987)) - * Make the `_LiteDataLoader` an iterator and add supports for custom dataloader ([#10279](https://github.com/PyTorchLightning/pytorch-lightning/pull/10279)) -- Added `use_omegaconf` argument to `save_hparams_to_yaml` plugin ([#9170](https://github.com/PyTorchLightning/pytorch-lightning/pull/9170)) -- Added `ckpt_path` argument for `Trainer.fit()` ([#10061](https://github.com/PyTorchLightning/pytorch-lightning/pull/10061)) -- Added `auto_device_count` method to `Accelerators` ([#10222](https://github.com/PyTorchLightning/pytorch-lightning/pull/10222)) -- Added support for `devices="auto"` ([#10264](https://github.com/PyTorchLightning/pytorch-lightning/pull/10264)) -- Added a `filename` argument in `ModelCheckpoint.format_checkpoint_name` ([#9818](https://github.com/PyTorchLightning/pytorch-lightning/pull/9818)) -- Added support for empty `gpus` list to run on CPU ([#10246](https://github.com/PyTorchLightning/pytorch-lightning/pull/10246)) -- Added a warning if multiple batch sizes are found from ambiguous batch ([#10247](https://github.com/PyTorchLightning/pytorch-lightning/pull/10247)) + * Added `PrecisionPlugin.forward_context`, making it the default implementation for all `{train,val,test,predict}_step_context()` methods ([#9988](https://github.com/Lightning-AI/lightning/pull/9988)) + * Added `DDPSpawnPlugin.spawn()` for spawning new processes of a given function ([#10018](https://github.com/Lightning-AI/lightning/pull/10018), [#10022](https://github.com/Lightning-AI/lightning/pull/10022)) + * Added `TrainingTypePlugin.{_setup_model, _setup_optimizer}` methods ([#9994](https://github.com/Lightning-AI/lightning/pull/9994), [#10064](https://github.com/Lightning-AI/lightning/pull/10064)) + * Implemented `DataParallelPlugin._setup_model` ([#10010](https://github.com/Lightning-AI/lightning/pull/10010)) + * Implemented `DeepSpeedPlugin._setup_model_and_optimizers` ([#10009](https://github.com/Lightning-AI/lightning/pull/10009), [#10064](https://github.com/Lightning-AI/lightning/pull/10064)) + * Implemented `{DDPShardedPlugin,DDPShardedSpawnPlugin}._setup_model_and_optimizers` ([#10028](https://github.com/Lightning-AI/lightning/pull/10028), [#10064](https://github.com/Lightning-AI/lightning/pull/10064)) + * Added optional `model` argument to the `optimizer_step` methods in accelerators and plugins ([#10023](https://github.com/Lightning-AI/lightning/pull/10023)) + * Updated precision attributes in `DeepSpeedPlugin` ([#10164](https://github.com/Lightning-AI/lightning/pull/10164)) + * Added the ability to return a result from rank 0 in `DDPSpawnPlugin.spawn` ([#10162](https://github.com/Lightning-AI/lightning/pull/10162)) + * Added `pytorch_lightning.lite` package ([#10175](https://github.com/Lightning-AI/lightning/pull/10175)) + * Added `LightningLite` documentation ([#10043](https://github.com/Lightning-AI/lightning/pull/10043)) + * Added `LightningLite` examples ([#9987](https://github.com/Lightning-AI/lightning/pull/9987)) + * Make the `_LiteDataLoader` an iterator and add supports for custom dataloader ([#10279](https://github.com/Lightning-AI/lightning/pull/10279)) +- Added `use_omegaconf` argument to `save_hparams_to_yaml` plugin ([#9170](https://github.com/Lightning-AI/lightning/pull/9170)) +- Added `ckpt_path` argument for `Trainer.fit()` 
([#10061](https://github.com/Lightning-AI/lightning/pull/10061)) +- Added `auto_device_count` method to `Accelerators` ([#10222](https://github.com/Lightning-AI/lightning/pull/10222)) +- Added support for `devices="auto"` ([#10264](https://github.com/Lightning-AI/lightning/pull/10264)) +- Added a `filename` argument in `ModelCheckpoint.format_checkpoint_name` ([#9818](https://github.com/Lightning-AI/lightning/pull/9818)) +- Added support for empty `gpus` list to run on CPU ([#10246](https://github.com/Lightning-AI/lightning/pull/10246)) +- Added a warning if multiple batch sizes are found from ambiguous batch ([#10247](https://github.com/Lightning-AI/lightning/pull/10247)) ### Changed -- Trainer now raises a `MisconfigurationException` when its methods are called with `ckpt_path="best"` but a checkpoint callback isn't configured ([#9841](https://github.com/PyTorchLightning/pytorch-lightning/pull/9841)) -- Setting `Trainer(accelerator="ddp_cpu")` now does not spawn a subprocess if `num_processes` is kept `1` along with `num_nodes > 1` ([#9603](https://github.com/PyTorchLightning/pytorch-lightning/pull/9603)) -- Module imports are now catching `ModuleNotFoundError` instead of `ImportError` ([#9867](https://github.com/PyTorchLightning/pytorch-lightning/pull/9867)) -- `pytorch_lightning.loggers.neptune.NeptuneLogger` is now consistent with the new [neptune-client](https://github.com/neptune-ai/neptune-client) API; the old [neptune-client](https://github.com/neptune-ai/neptune-client) API is supported by `NeptuneClient` from the [neptune-contrib](https://github.com/neptune-ai/neptune-contrib) repo ([#6867](https://github.com/PyTorchLightning/pytorch-lightning/pull/6867)) -- Parsing of `enums` type hyperparameters to be saved in the `haprams.yaml` file by TensorBoard and CSV loggers has been fixed and made in line with how OmegaConf parses it ([#9170](https://github.com/PyTorchLightning/pytorch-lightning/pull/9170)) -- Parsing of the `gpus` Trainer argument has changed: `gpus="n"` (str) no longer selects the GPU index n and instead selects the first n devices ([#8770](https://github.com/PyTorchLightning/pytorch-lightning/pull/8770)) -- `iteration_count` and other index attributes in the loops has been replaced with progress dataclasses ([#8477](https://github.com/PyTorchLightning/pytorch-lightning/pull/8477)) -- The `trainer.lightning_module` reference is now properly set at the very beginning of a run ([#8536](https://github.com/PyTorchLightning/pytorch-lightning/pull/8536)) -- The model weights now get loaded in all cases when the checkpoint path gets provided in validate/test/predict, regardless of whether the model instance is provided or not ([#8352](https://github.com/PyTorchLightning/pytorch-lightning/pull/8352)) -- The `Trainer` functions `reset_{train,val,test,predict}_dataloader`, `reset_train_val_dataloaders`, and `request_dataloader` `model` argument is now optional ([#8536](https://github.com/PyTorchLightning/pytorch-lightning/pull/8536)) -- Saved checkpoints will no longer use the type of a `Callback` as the key to avoid issues with unpickling ([#6886](https://github.com/PyTorchLightning/pytorch-lightning/pull/6886)) -- Improved string conversion for `ResultCollection` ([#8622](https://github.com/PyTorchLightning/pytorch-lightning/pull/8622)) +- Trainer now raises a `MisconfigurationException` when its methods are called with `ckpt_path="best"` but a checkpoint callback isn't configured ([#9841](https://github.com/Lightning-AI/lightning/pull/9841)) +- Setting 
`Trainer(accelerator="ddp_cpu")` now does not spawn a subprocess if `num_processes` is kept `1` along with `num_nodes > 1` ([#9603](https://github.com/Lightning-AI/lightning/pull/9603)) +- Module imports are now catching `ModuleNotFoundError` instead of `ImportError` ([#9867](https://github.com/Lightning-AI/lightning/pull/9867)) +- `pytorch_lightning.loggers.neptune.NeptuneLogger` is now consistent with the new [neptune-client](https://github.com/neptune-ai/neptune-client) API; the old [neptune-client](https://github.com/neptune-ai/neptune-client) API is supported by `NeptuneClient` from the [neptune-contrib](https://github.com/neptune-ai/neptune-contrib) repo ([#6867](https://github.com/Lightning-AI/lightning/pull/6867)) +- Parsing of `enums` type hyperparameters to be saved in the `haprams.yaml` file by TensorBoard and CSV loggers has been fixed and made in line with how OmegaConf parses it ([#9170](https://github.com/Lightning-AI/lightning/pull/9170)) +- Parsing of the `gpus` Trainer argument has changed: `gpus="n"` (str) no longer selects the GPU index n and instead selects the first n devices ([#8770](https://github.com/Lightning-AI/lightning/pull/8770)) +- `iteration_count` and other index attributes in the loops has been replaced with progress dataclasses ([#8477](https://github.com/Lightning-AI/lightning/pull/8477)) +- The `trainer.lightning_module` reference is now properly set at the very beginning of a run ([#8536](https://github.com/Lightning-AI/lightning/pull/8536)) +- The model weights now get loaded in all cases when the checkpoint path gets provided in validate/test/predict, regardless of whether the model instance is provided or not ([#8352](https://github.com/Lightning-AI/lightning/pull/8352)) +- The `Trainer` functions `reset_{train,val,test,predict}_dataloader`, `reset_train_val_dataloaders`, and `request_dataloader` `model` argument is now optional ([#8536](https://github.com/Lightning-AI/lightning/pull/8536)) +- Saved checkpoints will no longer use the type of a `Callback` as the key to avoid issues with unpickling ([#6886](https://github.com/Lightning-AI/lightning/pull/6886)) +- Improved string conversion for `ResultCollection` ([#8622](https://github.com/Lightning-AI/lightning/pull/8622)) - `LightningCLI` changes: - * `LightningCLI.init_parser` now returns the parser instance ([#8721](https://github.com/PyTorchLightning/pytorch-lightning/pull/8721)) - * `LightningCLI.add_core_arguments_to_parser`, `LightningCLI.parse_arguments` now take a `parser` argument ([#8721](https://github.com/PyTorchLightning/pytorch-lightning/pull/8721)) - * `LightningCLI.instantiate_trainer` now takes a config and a list of callbacks ([#8721](https://github.com/PyTorchLightning/pytorch-lightning/pull/8721)) - * Split `LightningCLI.add_core_arguments_to_parser` into `LightningCLI.add_default_arguments_to_parser` + `LightningCLI.add_core_arguments_to_parser` ([#8721](https://github.com/PyTorchLightning/pytorch-lightning/pull/8721)) -- The accelerator and training type plugin `setup` hooks no longer have a `model` argument ([#8536](https://github.com/PyTorchLightning/pytorch-lightning/pull/8536)) -- The accelerator and training type plugin `update_global_step` hook has been removed ([#8856](https://github.com/PyTorchLightning/pytorch-lightning/pull/8856)) -- The coverage of `self.log`-ing in any `LightningModule` or `Callback` hook has been improved ([#8498](https://github.com/PyTorchLightning/pytorch-lightning/pull/8498)) -- `self.log`-ing without a `Trainer` reference now raises a warning 
instead of an exception ([#9733](https://github.com/PyTorchLightning/pytorch-lightning/pull/9733)) -- Removed restrictions in the Trainer that loggers can only log from rank 0; the existing logger behavior has not changed ([#8608](https://github.com/PyTorchLightning/pytorch-lightning/pull/8608)) -- `Trainer.request_dataloader` now takes a `RunningStage` enum instance ([#8858](https://github.com/PyTorchLightning/pytorch-lightning/pull/8858)) -- Changed `rank_zero_warn` to `NotImplementedError` in the `{train, val, test, predict}_dataloader` hooks that `Lightning(Data)Module` uses ([#9161](https://github.com/PyTorchLightning/pytorch-lightning/pull/9161)) -- Moved `block_ddp_sync_behaviour` out of `TrainingBatchLoop` to loop utilities ([#9192](https://github.com/PyTorchLightning/pytorch-lightning/pull/9192)) -- Executing the `optimizer_closure` is now required when overriding the `optimizer_step` hook ([#9360](https://github.com/PyTorchLightning/pytorch-lightning/pull/9360)) -- Changed logging of `LightningModule` and `LightningDataModule` hyperparameters to raise an exception only if there are colliding keys with different values ([#9496](https://github.com/PyTorchLightning/pytorch-lightning/pull/9496)) -- `seed_everything` now fails when an invalid seed value is passed instead of selecting a random seed ([#8787](https://github.com/PyTorchLightning/pytorch-lightning/pull/8787)) -- The Trainer now calls `TrainingTypePlugin` collective APIs directly instead of going through the Accelerator reference ([#9677](https://github.com/PyTorchLightning/pytorch-lightning/pull/9677), [#9901](https://github.com/PyTorchLightning/pytorch-lightning/pull/9901)) -- The tuner now uses a unique filename to save a temporary checkpoint ([#9682](https://github.com/PyTorchLightning/pytorch-lightning/pull/9682)) -- Changed `HorovodPlugin.all_gather` to return a `torch.Tensor` instead of a list ([#9696](https://github.com/PyTorchLightning/pytorch-lightning/pull/9696)) + * `LightningCLI.init_parser` now returns the parser instance ([#8721](https://github.com/Lightning-AI/lightning/pull/8721)) + * `LightningCLI.add_core_arguments_to_parser`, `LightningCLI.parse_arguments` now take a `parser` argument ([#8721](https://github.com/Lightning-AI/lightning/pull/8721)) + * `LightningCLI.instantiate_trainer` now takes a config and a list of callbacks ([#8721](https://github.com/Lightning-AI/lightning/pull/8721)) + * Split `LightningCLI.add_core_arguments_to_parser` into `LightningCLI.add_default_arguments_to_parser` + `LightningCLI.add_core_arguments_to_parser` ([#8721](https://github.com/Lightning-AI/lightning/pull/8721)) +- The accelerator and training type plugin `setup` hooks no longer have a `model` argument ([#8536](https://github.com/Lightning-AI/lightning/pull/8536)) +- The accelerator and training type plugin `update_global_step` hook has been removed ([#8856](https://github.com/Lightning-AI/lightning/pull/8856)) +- The coverage of `self.log`-ing in any `LightningModule` or `Callback` hook has been improved ([#8498](https://github.com/Lightning-AI/lightning/pull/8498)) +- `self.log`-ing without a `Trainer` reference now raises a warning instead of an exception ([#9733](https://github.com/Lightning-AI/lightning/pull/9733)) +- Removed restrictions in the Trainer that loggers can only log from rank 0; the existing logger behavior has not changed ([#8608](https://github.com/Lightning-AI/lightning/pull/8608)) +- `Trainer.request_dataloader` now takes a `RunningStage` enum instance 
([#8858](https://github.com/Lightning-AI/lightning/pull/8858)) +- Changed `rank_zero_warn` to `NotImplementedError` in the `{train, val, test, predict}_dataloader` hooks that `Lightning(Data)Module` uses ([#9161](https://github.com/Lightning-AI/lightning/pull/9161)) +- Moved `block_ddp_sync_behaviour` out of `TrainingBatchLoop` to loop utilities ([#9192](https://github.com/Lightning-AI/lightning/pull/9192)) +- Executing the `optimizer_closure` is now required when overriding the `optimizer_step` hook ([#9360](https://github.com/Lightning-AI/lightning/pull/9360)) +- Changed logging of `LightningModule` and `LightningDataModule` hyperparameters to raise an exception only if there are colliding keys with different values ([#9496](https://github.com/Lightning-AI/lightning/pull/9496)) +- `seed_everything` now fails when an invalid seed value is passed instead of selecting a random seed ([#8787](https://github.com/Lightning-AI/lightning/pull/8787)) +- The Trainer now calls `TrainingTypePlugin` collective APIs directly instead of going through the Accelerator reference ([#9677](https://github.com/Lightning-AI/lightning/pull/9677), [#9901](https://github.com/Lightning-AI/lightning/pull/9901)) +- The tuner now uses a unique filename to save a temporary checkpoint ([#9682](https://github.com/Lightning-AI/lightning/pull/9682)) +- Changed `HorovodPlugin.all_gather` to return a `torch.Tensor` instead of a list ([#9696](https://github.com/Lightning-AI/lightning/pull/9696)) - Changed Trainer connectors to be protected attributes: - * Configuration Validator ([#9779](https://github.com/PyTorchLightning/pytorch-lightning/pull/9779)) -- The `current_epoch` and `global_step` attributes now get restored irrespective of the Trainer task ([#9413](https://github.com/PyTorchLightning/pytorch-lightning/pull/9413)) -- Trainer now raises an exception when requesting `amp_level` with native `amp_backend` ([#9755](https://github.com/PyTorchLightning/pytorch-lightning/pull/9755)) -- Update the logic to check for accumulation steps with deepspeed ([#9826](https://github.com/PyTorchLightning/pytorch-lightning/pull/9826)) -- `pytorch_lightning.utilities.grads.grad_norm` now raises an exception if parameter `norm_type <= 0` ([#9765](https://github.com/PyTorchLightning/pytorch-lightning/pull/9765)) -- Updated error message for interactive incompatible plugins ([#9896](https://github.com/PyTorchLightning/pytorch-lightning/pull/9896)) -- Moved the `optimizer_step` and `clip_gradients` hook from the `Accelerator` and `TrainingTypePlugin` into the `PrecisionPlugin` ([#10143](https://github.com/PyTorchLightning/pytorch-lightning/pull/10143), [#10029](https://github.com/PyTorchLightning/pytorch-lightning/pull/10029)) -- `NativeMixedPrecisionPlugin` and its subclasses now take an optional `GradScaler` instance ([#10055](https://github.com/PyTorchLightning/pytorch-lightning/pull/10055)) -- Trainer is now raising a `MisconfigurationException` instead of a warning if `Trainer.{validate/test}` is missing required methods ([#10016](https://github.com/PyTorchLightning/pytorch-lightning/pull/10016)) -- Changed default value of the `max_steps` Trainer argument from `None` to -1 ([#9460](https://github.com/PyTorchLightning/pytorch-lightning/pull/9460)) -- LightningModule now raises an error when calling `log(on_step=False, on_epoch=False)` ([#10227](https://github.com/PyTorchLightning/pytorch-lightning/pull/10227)) -- Quantization aware training observers are now disabled by default during validating/testing/predicting stages 
([#8540](https://github.com/PyTorchLightning/pytorch-lightning/pull/8540)) -- Raised `MisconfigurationException` when total length of `dataloader` across ranks is zero, and give warning when total length is non-zero, but only local rank length is zero. ([#9827](https://github.com/PyTorchLightning/pytorch-lightning/pull/9827)) -- Changed the model size calculation using `ByteCounter` ([#10123](https://github.com/PyTorchLightning/pytorch-lightning/pull/10123)) -- Enabled `on_load_checkpoint` for `LightningDataModule` for all `trainer_fn` ([#10238](https://github.com/PyTorchLightning/pytorch-lightning/pull/10238)) -- Allowed separate config files for parameters with class type when LightningCLI is in `subclass_mode=False` ([#10286](https://github.com/PyTorchLightning/pytorch-lightning/pull/10286)) + * Configuration Validator ([#9779](https://github.com/Lightning-AI/lightning/pull/9779)) +- The `current_epoch` and `global_step` attributes now get restored irrespective of the Trainer task ([#9413](https://github.com/Lightning-AI/lightning/pull/9413)) +- Trainer now raises an exception when requesting `amp_level` with native `amp_backend` ([#9755](https://github.com/Lightning-AI/lightning/pull/9755)) +- Update the logic to check for accumulation steps with deepspeed ([#9826](https://github.com/Lightning-AI/lightning/pull/9826)) +- `pytorch_lightning.utilities.grads.grad_norm` now raises an exception if parameter `norm_type <= 0` ([#9765](https://github.com/Lightning-AI/lightning/pull/9765)) +- Updated error message for interactive incompatible plugins ([#9896](https://github.com/Lightning-AI/lightning/pull/9896)) +- Moved the `optimizer_step` and `clip_gradients` hook from the `Accelerator` and `TrainingTypePlugin` into the `PrecisionPlugin` ([#10143](https://github.com/Lightning-AI/lightning/pull/10143), [#10029](https://github.com/Lightning-AI/lightning/pull/10029)) +- `NativeMixedPrecisionPlugin` and its subclasses now take an optional `GradScaler` instance ([#10055](https://github.com/Lightning-AI/lightning/pull/10055)) +- Trainer is now raising a `MisconfigurationException` instead of a warning if `Trainer.{validate/test}` is missing required methods ([#10016](https://github.com/Lightning-AI/lightning/pull/10016)) +- Changed default value of the `max_steps` Trainer argument from `None` to -1 ([#9460](https://github.com/Lightning-AI/lightning/pull/9460)) +- LightningModule now raises an error when calling `log(on_step=False, on_epoch=False)` ([#10227](https://github.com/Lightning-AI/lightning/pull/10227)) +- Quantization aware training observers are now disabled by default during validating/testing/predicting stages ([#8540](https://github.com/Lightning-AI/lightning/pull/8540)) +- Raised `MisconfigurationException` when total length of `dataloader` across ranks is zero, and give warning when total length is non-zero, but only local rank length is zero. 
([#9827](https://github.com/Lightning-AI/lightning/pull/9827)) +- Changed the model size calculation using `ByteCounter` ([#10123](https://github.com/Lightning-AI/lightning/pull/10123)) +- Enabled `on_load_checkpoint` for `LightningDataModule` for all `trainer_fn` ([#10238](https://github.com/Lightning-AI/lightning/pull/10238)) +- Allowed separate config files for parameters with class type when LightningCLI is in `subclass_mode=False` ([#10286](https://github.com/Lightning-AI/lightning/pull/10286)) ### Deprecated -- Deprecated Trainer argument `terminate_on_nan` in favor of `detect_anomaly`([#9175](https://github.com/PyTorchLightning/pytorch-lightning/pull/9175)) -- Deprecated `Trainer.terminate_on_nan` public attribute access ([#9849](https://github.com/PyTorchLightning/pytorch-lightning/pull/9849)) -- Deprecated `LightningModule.summarize()` in favor of `pytorch_lightning.utilities.model_summary.summarize()` ([#8513](https://github.com/PyTorchLightning/pytorch-lightning/pull/8513)) -- Deprecated `LightningModule.model_size` ([#8343](https://github.com/PyTorchLightning/pytorch-lightning/pull/8343)) -- Deprecated `DataModule` properties: `train_transforms`, `val_transforms`, `test_transforms`, `size`, `dims` ([#8851](https://github.com/PyTorchLightning/pytorch-lightning/pull/8851)) -- Deprecated `add_to_queue`, `get_from_queue` from `LightningModule` in favor of corresponding methods in the `DDPSpawnPlugin` ([#9118](https://github.com/PyTorchLightning/pytorch-lightning/pull/9118)) -- Deprecated `LightningModule.get_progress_bar_dict` and `Trainer.progress_bar_dict` in favor of `pytorch_lightning.callbacks.progress.base.get_standard_metrics` and `ProgressBarBase.get_metrics` ([#8985](https://github.com/PyTorchLightning/pytorch-lightning/pull/8985)) -- Deprecated `prepare_data_per_node` flag on Trainer and set it as a property of `DataHooks`, accessible in the `LightningModule` and `LightningDataModule` ([#8958](https://github.com/PyTorchLightning/pytorch-lightning/pull/8958)) -- Deprecated the `TestTubeLogger` ([#9065](https://github.com/PyTorchLightning/pytorch-lightning/pull/9065)) -- Deprecated `on_{train/val/test/predict}_dataloader()` from `LightningModule` and `LightningDataModule` ([#9098](https://github.com/PyTorchLightning/pytorch-lightning/pull/9098)) -- Deprecated `on_keyboard_interrupt` callback hook in favor of new `on_exception` hook ([#9260](https://github.com/PyTorchLightning/pytorch-lightning/pull/9260)) -- Deprecated passing `process_position` to the `Trainer` constructor in favor of adding the `ProgressBar` callback with `process_position` directly to the list of callbacks ([#9222](https://github.com/PyTorchLightning/pytorch-lightning/pull/9222)) -- Deprecated passing `flush_logs_every_n_steps` as a Trainer argument, instead pass it to the logger init if supported ([#9366](https://github.com/PyTorchLightning/pytorch-lightning/pull/9366)) -- Deprecated `LightningLoggerBase.close`, `LoggerCollection.close` in favor of `LightningLoggerBase.finalize`, `LoggerCollection.finalize` ([#9422](https://github.com/PyTorchLightning/pytorch-lightning/pull/9422)) -- Deprecated passing `progress_bar_refresh_rate` to the `Trainer` constructor in favor of adding the `ProgressBar` callback with `refresh_rate` directly to the list of callbacks, or passing `enable_progress_bar=False` to disable the progress bar ([#9616](https://github.com/PyTorchLightning/pytorch-lightning/pull/9616)) -- Deprecated `LightningDistributed` and moved the broadcast logic to `DDPPlugin` and `DDPSpawnPlugin` 
directly ([#9691](https://github.com/PyTorchLightning/pytorch-lightning/pull/9691))
-- Deprecated passing `stochastic_weight_avg` to the `Trainer` constructor in favor of adding the `StochasticWeightAveraging` callback directly to the list of callbacks ([#8989](https://github.com/PyTorchLightning/pytorch-lightning/pull/8989))
-- Deprecated Accelerator collective API `barrier`, `broadcast`, and `all_gather` in favor of calling the `TrainingTypePlugin` collective API directly ([#9677](https://github.com/PyTorchLightning/pytorch-lightning/pull/9677))
-- Deprecated `checkpoint_callback` from the `Trainer` constructor in favor of `enable_checkpointing` ([#9754](https://github.com/PyTorchLightning/pytorch-lightning/pull/9754))
-- Deprecated the `LightningModule.on_post_move_to_device` method ([#9525](https://github.com/PyTorchLightning/pytorch-lightning/pull/9525))
-- Deprecated `pytorch_lightning.core.decorators.parameter_validation` in favor of `pytorch_lightning.utilities.parameter_tying.set_shared_parameters` ([#9525](https://github.com/PyTorchLightning/pytorch-lightning/pull/9525))
-- Deprecated passing `weights_summary` to the `Trainer` constructor in favor of adding the `ModelSummary` callback with `max_depth` directly to the list of callbacks ([#9699](https://github.com/PyTorchLightning/pytorch-lightning/pull/9699))
-- Deprecated `log_gpu_memory`, `gpu_metrics`, and util funcs in favor of `DeviceStatsMonitor` callback ([#9921](https://github.com/PyTorchLightning/pytorch-lightning/pull/9921))
-- Deprecated `GPUStatsMonitor` and `XLAStatsMonitor` in favor of `DeviceStatsMonitor` callback ([#9924](https://github.com/PyTorchLightning/pytorch-lightning/pull/9924))
-- Deprecated setting `Trainer(max_steps=None)`; To turn off the limit, set `Trainer(max_steps=-1)` (default) ([#9460](https://github.com/PyTorchLightning/pytorch-lightning/pull/9460))
-- Deprecated access to the `AcceleratorConnector.is_slurm_managing_tasks` attribute and marked it as protected ([#10101](https://github.com/PyTorchLightning/pytorch-lightning/pull/10101))
-- Deprecated access to the `AcceleratorConnector.configure_slurm_ddp` method and marked it as protected ([#10101](https://github.com/PyTorchLightning/pytorch-lightning/pull/10101))
-- Deprecated passing `resume_from_checkpoint` to the `Trainer` constructor in favor of `trainer.fit(ckpt_path=)` ([#10061](https://github.com/PyTorchLightning/pytorch-lightning/pull/10061))
-- Deprecated `ClusterEnvironment.creates_children()` in favor of `ClusterEnvironment.creates_processes_externally` (property) ([#10106](https://github.com/PyTorchLightning/pytorch-lightning/pull/10106))
-- Deprecated `PrecisionPlugin.master_params()` in favor of `PrecisionPlugin.main_params()` ([#10105](https://github.com/PyTorchLightning/pytorch-lightning/pull/10105))
-- Deprecated `lr_sch_names` from `LearningRateMonitor` ([#10066](https://github.com/PyTorchLightning/pytorch-lightning/pull/10066))
-- Deprecated `ProgressBar` callback in favor of `TQDMProgressBar` ([#10134](https://github.com/PyTorchLightning/pytorch-lightning/pull/10134))
+- Deprecated Trainer argument `terminate_on_nan` in favor of `detect_anomaly`([#9175](https://github.com/Lightning-AI/lightning/pull/9175))
+- Deprecated `Trainer.terminate_on_nan` public attribute access ([#9849](https://github.com/Lightning-AI/lightning/pull/9849))
+- Deprecated `LightningModule.summarize()` in favor of `pytorch_lightning.utilities.model_summary.summarize()` ([#8513](https://github.com/Lightning-AI/lightning/pull/8513))
+- Deprecated `LightningModule.model_size` ([#8343](https://github.com/Lightning-AI/lightning/pull/8343))
+- Deprecated `DataModule` properties: `train_transforms`, `val_transforms`, `test_transforms`, `size`, `dims` ([#8851](https://github.com/Lightning-AI/lightning/pull/8851))
+- Deprecated `add_to_queue`, `get_from_queue` from `LightningModule` in favor of corresponding methods in the `DDPSpawnPlugin` ([#9118](https://github.com/Lightning-AI/lightning/pull/9118))
+- Deprecated `LightningModule.get_progress_bar_dict` and `Trainer.progress_bar_dict` in favor of `pytorch_lightning.callbacks.progress.base.get_standard_metrics` and `ProgressBarBase.get_metrics` ([#8985](https://github.com/Lightning-AI/lightning/pull/8985))
+- Deprecated `prepare_data_per_node` flag on Trainer and set it as a property of `DataHooks`, accessible in the `LightningModule` and `LightningDataModule` ([#8958](https://github.com/Lightning-AI/lightning/pull/8958))
+- Deprecated the `TestTubeLogger` ([#9065](https://github.com/Lightning-AI/lightning/pull/9065))
+- Deprecated `on_{train/val/test/predict}_dataloader()` from `LightningModule` and `LightningDataModule` ([#9098](https://github.com/Lightning-AI/lightning/pull/9098))
+- Deprecated `on_keyboard_interrupt` callback hook in favor of new `on_exception` hook ([#9260](https://github.com/Lightning-AI/lightning/pull/9260))
+- Deprecated passing `process_position` to the `Trainer` constructor in favor of adding the `ProgressBar` callback with `process_position` directly to the list of callbacks ([#9222](https://github.com/Lightning-AI/lightning/pull/9222))
+- Deprecated passing `flush_logs_every_n_steps` as a Trainer argument, instead pass it to the logger init if supported ([#9366](https://github.com/Lightning-AI/lightning/pull/9366))
+- Deprecated `LightningLoggerBase.close`, `LoggerCollection.close` in favor of `LightningLoggerBase.finalize`, `LoggerCollection.finalize` ([#9422](https://github.com/Lightning-AI/lightning/pull/9422))
+- Deprecated passing `progress_bar_refresh_rate` to the `Trainer` constructor in favor of adding the `ProgressBar` callback with `refresh_rate` directly to the list of callbacks, or passing `enable_progress_bar=False` to disable the progress bar ([#9616](https://github.com/Lightning-AI/lightning/pull/9616))
+- Deprecated `LightningDistributed` and moved the broadcast logic to `DDPPlugin` and `DDPSpawnPlugin` directly ([#9691](https://github.com/Lightning-AI/lightning/pull/9691))
+- Deprecated passing `stochastic_weight_avg` to the `Trainer` constructor in favor of adding the `StochasticWeightAveraging` callback directly to the list of callbacks ([#8989](https://github.com/Lightning-AI/lightning/pull/8989))
+- Deprecated Accelerator collective API `barrier`, `broadcast`, and `all_gather` in favor of calling the `TrainingTypePlugin` collective API directly ([#9677](https://github.com/Lightning-AI/lightning/pull/9677))
+- Deprecated `checkpoint_callback` from the `Trainer` constructor in favor of `enable_checkpointing` ([#9754](https://github.com/Lightning-AI/lightning/pull/9754))
+- Deprecated the `LightningModule.on_post_move_to_device` method ([#9525](https://github.com/Lightning-AI/lightning/pull/9525))
+- Deprecated `pytorch_lightning.core.decorators.parameter_validation` in favor of `pytorch_lightning.utilities.parameter_tying.set_shared_parameters` ([#9525](https://github.com/Lightning-AI/lightning/pull/9525))
+- Deprecated passing `weights_summary` to the `Trainer` constructor in favor of adding the `ModelSummary` callback with `max_depth`
directly to the list of callbacks ([#9699](https://github.com/Lightning-AI/lightning/pull/9699)) +- Deprecated `log_gpu_memory`, `gpu_metrics`, and util funcs in favor of `DeviceStatsMonitor` callback ([#9921](https://github.com/Lightning-AI/lightning/pull/9921)) +- Deprecated `GPUStatsMonitor` and `XLAStatsMonitor` in favor of `DeviceStatsMonitor` callback ([#9924](https://github.com/Lightning-AI/lightning/pull/9924)) +- Deprecated setting `Trainer(max_steps=None)`; To turn off the limit, set `Trainer(max_steps=-1)` (default) ([#9460](https://github.com/Lightning-AI/lightning/pull/9460)) +- Deprecated access to the `AcceleratorConnector.is_slurm_managing_tasks` attribute and marked it as protected ([#10101](https://github.com/Lightning-AI/lightning/pull/10101)) +- Deprecated access to the `AcceleratorConnector.configure_slurm_ddp` method and marked it as protected ([#10101](https://github.com/Lightning-AI/lightning/pull/10101)) +- Deprecated passing `resume_from_checkpoint` to the `Trainer` constructor in favor of `trainer.fit(ckpt_path=)` ([#10061](https://github.com/Lightning-AI/lightning/pull/10061)) +- Deprecated `ClusterEnvironment.creates_children()` in favor of `ClusterEnvironment.creates_processes_externally` (property) ([#10106](https://github.com/Lightning-AI/lightning/pull/10106)) +- Deprecated `PrecisionPlugin.master_params()` in favor of `PrecisionPlugin.main_params()` ([#10105](https://github.com/Lightning-AI/lightning/pull/10105)) +- Deprecated `lr_sch_names` from `LearningRateMonitor` ([#10066](https://github.com/Lightning-AI/lightning/pull/10066)) +- Deprecated `ProgressBar` callback in favor of `TQDMProgressBar` ([#10134](https://github.com/Lightning-AI/lightning/pull/10134)) ### Removed -- Removed deprecated `metrics` ([#8586](https://github.com/PyTorchLightning/pytorch-lightning/pull/8586/)) -- Removed the deprecated `outputs` argument in both the `LightningModule.on_train_epoch_end` and `Callback.on_train_epoch_end` hooks ([#8587](https://github.com/PyTorchLightning/pytorch-lightning/pull/8587)) -- Removed the deprecated `TrainerLoggingMixin` class ([#8609](https://github.com/PyTorchLightning/pytorch-lightning/pull/8609)) -- Removed the deprecated `TrainerTrainingTricksMixin` class ([#8679](https://github.com/PyTorchLightning/pytorch-lightning/pull/8679)) -- Removed the deprecated `optimizer_idx` from `training_step` as an accepted argument in manual optimization ([#8576](https://github.com/PyTorchLightning/pytorch-lightning/pull/8576)) -- Removed support for the deprecated `on_save_checkpoint` signature. The hook now takes a `checkpoint` positional parameter ([#8697](https://github.com/PyTorchLightning/pytorch-lightning/pull/8697)) -- Removed support for the deprecated `on_load_checkpoint` signature. 
The hook now takes a `pl_module` positional parameter ([#8697](https://github.com/PyTorchLightning/pytorch-lightning/pull/8697)) -- Removed the deprecated `save_function` property in `ModelCheckpoint` ([#8680](https://github.com/PyTorchLightning/pytorch-lightning/pull/8680)) -- Removed the deprecated `model` argument from `ModelCheckpoint.save_checkpoint` ([#8688](https://github.com/PyTorchLightning/pytorch-lightning/pull/8688)) -- Removed the deprecated `sync_step` argument from `WandbLogger` ([#8763](https://github.com/PyTorchLightning/pytorch-lightning/pull/8763)) -- Removed the deprecated `Trainer.truncated_bptt_steps` in favor of `LightningModule.truncated_bptt_steps` ([#8826](https://github.com/PyTorchLightning/pytorch-lightning/pull/8826)) -- Removed `LightningModule.write_predictions` and `LightningModule.write_predictions_dict` ([#8850](https://github.com/PyTorchLightning/pytorch-lightning/pull/8850)) -- Removed `on_reset_*_dataloader` hooks in TrainingType Plugins and Accelerators ([#8858](https://github.com/PyTorchLightning/pytorch-lightning/pull/8858)) -- Removed deprecated `GradInformation` module in favor of `pytorch_lightning.utilities.grads` ([#8831](https://github.com/PyTorchLightning/pytorch-lightning/pull/8831/)) -- Removed `TrainingTypePlugin.on_save` and `Accelerator.on_save` ([#9023](https://github.com/PyTorchLightning/pytorch-lightning/pull/9023)) -- Removed `{Accelerator,TrainingTypePlugin,PrecisionPlugin}.post_optimizer_step` ([#9746](https://github.com/PyTorchLightning/pytorch-lightning/pull/9746)) -- Removed deprecated `connect_precision_plugin` and `connect_training_type_plugin` from `Accelerator` ([#9019](https://github.com/PyTorchLightning/pytorch-lightning/pull/9019)) -- Removed `on_train_epoch_end` from `Accelerator` ([#9035](https://github.com/PyTorchLightning/pytorch-lightning/pull/9035)) -- Removed `InterBatchProcessor` in favor of `DataLoaderIterDataFetcher` ([#9052](https://github.com/PyTorchLightning/pytorch-lightning/pull/9052)) -- Removed `Plugin` in `base_plugin.py` in favor of accessing `TrainingTypePlugin` and `PrecisionPlugin` directly instead ([#9066](https://github.com/PyTorchLightning/pytorch-lightning/pull/9066)) -- Removed `teardown` from `ParallelPlugin` ([#8943](https://github.com/PyTorchLightning/pytorch-lightning/pull/8943)) -- Removed deprecated `profiled_functions` argument from `PyTorchProfiler` ([#9178](https://github.com/PyTorchLightning/pytorch-lightning/pull/9178)) -- Removed deprecated `pytorch_lighting.utilities.argparse_utils` module ([#9166](https://github.com/PyTorchLightning/pytorch-lightning/pull/9166)) -- Removed deprecated property `Trainer.running_sanity_check` in favor of `Trainer.sanity_checking` ([#9209](https://github.com/PyTorchLightning/pytorch-lightning/pull/9209)) -- Removed deprecated `BaseProfiler.output_filename` arg from it and its descendants in favor of `dirpath` and `filename` ([#9214](https://github.com/PyTorchLightning/pytorch-lightning/pull/9214)) -- Removed deprecated property `ModelCheckpoint.period` in favor of `ModelCheckpoint.every_n_epochs` ([#9213](https://github.com/PyTorchLightning/pytorch-lightning/pull/9213)) -- Removed deprecated `auto_move_data` decorator ([#9231](https://github.com/PyTorchLightning/pytorch-lightning/pull/9231)) -- Removed deprecated property `LightningModule.datamodule` in favor of `Trainer.datamodule` ([#9233](https://github.com/PyTorchLightning/pytorch-lightning/pull/9233)) -- Removed deprecated properties `DeepSpeedPlugin.cpu_offload*` in favor of `offload_optimizer`, 
`offload_parameters` and `pin_memory` ([#9244](https://github.com/PyTorchLightning/pytorch-lightning/pull/9244)) -- Removed deprecated property `AcceleratorConnector.is_using_torchelastic` in favor of `TorchElasticEnvironment.is_using_torchelastic()` ([#9729](https://github.com/PyTorchLightning/pytorch-lightning/pull/9729)) -- Removed `pytorch_lightning.utilities.debugging.InternalDebugger` ([#9680](https://github.com/PyTorchLightning/pytorch-lightning/pull/9680)) -- Removed `call_configure_sharded_model_hook` property from `Accelerator` and `TrainingTypePlugin` ([#9612](https://github.com/PyTorchLightning/pytorch-lightning/pull/9612)) -- Removed `TrainerProperties` mixin and moved property definitions directly into `Trainer` ([#9495](https://github.com/PyTorchLightning/pytorch-lightning/pull/9495)) -- Removed a redundant warning with `ModelCheckpoint(monitor=None)` callback ([#9875](https://github.com/PyTorchLightning/pytorch-lightning/pull/9875)) -- Remove `epoch` from `trainer.logged_metrics` ([#9904](https://github.com/PyTorchLightning/pytorch-lightning/pull/9904)) -- Remove deprecated `distributed_backend` from `Trainer` ([#10017](https://github.com/PyTorchLightning/pytorch-lightning/pull/10017)) -- Removed `process_idx` from the `{DDPSpawnPlugin,TPUSpawnPlugin}.new_process` methods ([#10022](https://github.com/PyTorchLightning/pytorch-lightning/pull/10022)) -- Removed automatic patching of `{train,val,test,predict}_dataloader()` on the `LightningModule` ([#9764](https://github.com/PyTorchLightning/pytorch-lightning/pull/9764)) -- Removed `pytorch_lightning.trainer.connectors.OptimizerConnector` ([#10120](https://github.com/PyTorchLightning/pytorch-lightning/pull/10120)) - -### Fixed - -- Fixed ImageNet evaluation in example ([#10179](https://github.com/PyTorchLightning/pytorch-lightning/pull/10179)) -- Fixed an issue with logger outputs not being finalized correctly after prediction runs ([#8685](https://github.com/PyTorchLightning/pytorch-lightning/pull/8685)) -- Fixed `move_metrics_to_cpu` moving the loss to CPU while training on device ([#9308](https://github.com/PyTorchLightning/pytorch-lightning/pull/9308)) -- Fixed incorrect main progress bar indicator when resuming training mid-epoch ([#9310](https://github.com/PyTorchLightning/pytorch-lightning/pull/9310)) -- Fixed an issue with freeing memory of datafetchers during teardown ([#9387](https://github.com/PyTorchLightning/pytorch-lightning/pull/9387)) -- Fixed a bug where the training step output needed to be `deepcopy`-ed ([#9349](https://github.com/PyTorchLightning/pytorch-lightning/pull/9349)) -- Fixed an issue with freeing memory allocated by the data iterators in `Loop.on_run_end` ([#9386](https://github.com/PyTorchLightning/pytorch-lightning/pull/9386), [#9915](https://github.com/PyTorchLightning/pytorch-lightning/pull/9915)) -- Fixed `BasePredictionWriter` not returning the batch indices in a non-distributed setting ([#9432](https://github.com/PyTorchLightning/pytorch-lightning/pull/9432)) -- Fixed an error when running in XLA environments with no TPU attached ([#9572](https://github.com/PyTorchLightning/pytorch-lightning/pull/9572)) -- Fixed check on torchmetrics logged whose `compute()` output is a multielement tensor ([#9582](https://github.com/PyTorchLightning/pytorch-lightning/pull/9582)) -- Fixed gradient accumulation for `DDPShardedPlugin` ([#9122](https://github.com/PyTorchLightning/pytorch-lightning/pull/9122)) -- Fixed missing DeepSpeed distributed call 
([#9540](https://github.com/PyTorchLightning/pytorch-lightning/pull/9540)) -- Fixed an issue with wrapped LightningModule during evaluation; The LightningModule no longer gets wrapped with data-parallel modules when not fitting in `DDPPlugin`, `DDPSpawnPlugin`, `DDPShardedPlugin`, `DDPSpawnShardedPlugin` ([#9096](https://github.com/PyTorchLightning/pytorch-lightning/pull/9096)) -- Fixed `trainer.accumulate_grad_batches` to be an int on init. The default value for it is now `None` inside Trainer ([#9652](https://github.com/PyTorchLightning/pytorch-lightning/pull/9652)) -- Fixed `broadcast` in `DDPPlugin` and `DDPSpawnPlugin` to respect the `src` input ([#9691](https://github.com/PyTorchLightning/pytorch-lightning/pull/9691)) -- Fixed `self.log(on_epoch=True, reduce_fx=sum))` for the `on_batch_start` and `on_train_batch_start` hooks ([#9791](https://github.com/PyTorchLightning/pytorch-lightning/pull/9791)) -- Fixed `self.log(on_epoch=True)` for the `on_batch_start` and `on_train_batch_start` hooks ([#9780](https://github.com/PyTorchLightning/pytorch-lightning/pull/9780)) -- Fixed restoring training state during `Trainer.fit` only ([#9413](https://github.com/PyTorchLightning/pytorch-lightning/pull/9413)) -- Fixed DeepSpeed and Lightning both calling the scheduler ([#9788](https://github.com/PyTorchLightning/pytorch-lightning/pull/9788)) -- Fixed missing arguments when saving hyperparameters from the parent class but not from the child class ([#9800](https://github.com/PyTorchLightning/pytorch-lightning/pull/9800)) -- Fixed DeepSpeed GPU device IDs ([#9847](https://github.com/PyTorchLightning/pytorch-lightning/pull/9847)) -- Reset `val_dataloader` in `tuner/batch_size_scaling` ([#9857](https://github.com/PyTorchLightning/pytorch-lightning/pull/9857)) -- Fixed use of `LightningCLI` in computer_vision_fine_tuning.py example ([#9934](https://github.com/PyTorchLightning/pytorch-lightning/pull/9934)) -- Fixed issue with non-init dataclass fields in `apply_to_collection` ([#9963](https://github.com/PyTorchLightning/pytorch-lightning/pull/9963)) -- Reset `val_dataloader` in `tuner/batch_size_scaling` for binsearch ([#9975](https://github.com/PyTorchLightning/pytorch-lightning/pull/9975)) -- Fixed logic to check for spawn in dataloader `TrainerDataLoadingMixin._worker_check` ([#9902](https://github.com/PyTorchLightning/pytorch-lightning/pull/9902)) -- Fixed `train_dataloader` getting loaded twice when resuming from a checkpoint during `Trainer.fit()` ([#9671](https://github.com/PyTorchLightning/pytorch-lightning/pull/9671)) -- Fixed `LearningRateMonitor` logging with multiple param groups optimizer with no scheduler ([#10044](https://github.com/PyTorchLightning/pytorch-lightning/pull/10044)) -- Fixed undesired side effects being caused by `Trainer` patching dataloader methods on the `LightningModule` ([#9764](https://github.com/PyTorchLightning/pytorch-lightning/pull/9764)) -- Fixed gradients not being unscaled when clipping or logging the gradient norm ([#9287](https://github.com/PyTorchLightning/pytorch-lightning/pull/9287)) -- Fixed `on_before_optimizer_step` getting called before the optimizer closure (including backward) has run ([#10167](https://github.com/PyTorchLightning/pytorch-lightning/pull/10167)) -- Fixed monitor value in `ModelCheckpoint` getting moved to the wrong device in a special case where it becomes NaN ([#10118](https://github.com/PyTorchLightning/pytorch-lightning/pull/10118)) -- Fixed creation of `dirpath` in `BaseProfiler` if it doesn't exist 
([#10073](https://github.com/PyTorchLightning/pytorch-lightning/pull/10073)) -- Fixed incorrect handling of sigterm ([#10189](https://github.com/PyTorchLightning/pytorch-lightning/pull/10189)) -- Fixed bug where `log(on_step=True, on_epoch=True, sync_dist=True)` wouldn't reduce the value on step ([#10227](https://github.com/PyTorchLightning/pytorch-lightning/pull/10227)) -- Fixed an issue with `pl.utilities.seed.reset_seed` converting the `PL_SEED_WORKERS` environment variable to `bool` ([#10099](https://github.com/PyTorchLightning/pytorch-lightning/pull/10099)) -- Fixed iterating over a logger collection when `fast_dev_run > 0` ([#10232](https://github.com/PyTorchLightning/pytorch-lightning/pull/10232)) -- Fixed `batch_size` in `ResultCollection` not being reset to 1 on epoch end ([#10242](https://github.com/PyTorchLightning/pytorch-lightning/pull/10242)) -- Fixed `distrib_type` not being set when training plugin instances are being passed to the Trainer ([#10251](https://github.com/PyTorchLightning/pytorch-lightning/pull/10251)) +- Removed deprecated `metrics` ([#8586](https://github.com/Lightning-AI/lightning/pull/8586/)) +- Removed the deprecated `outputs` argument in both the `LightningModule.on_train_epoch_end` and `Callback.on_train_epoch_end` hooks ([#8587](https://github.com/Lightning-AI/lightning/pull/8587)) +- Removed the deprecated `TrainerLoggingMixin` class ([#8609](https://github.com/Lightning-AI/lightning/pull/8609)) +- Removed the deprecated `TrainerTrainingTricksMixin` class ([#8679](https://github.com/Lightning-AI/lightning/pull/8679)) +- Removed the deprecated `optimizer_idx` from `training_step` as an accepted argument in manual optimization ([#8576](https://github.com/Lightning-AI/lightning/pull/8576)) +- Removed support for the deprecated `on_save_checkpoint` signature. The hook now takes a `checkpoint` positional parameter ([#8697](https://github.com/Lightning-AI/lightning/pull/8697)) +- Removed support for the deprecated `on_load_checkpoint` signature. 
The hook now takes a `pl_module` positional parameter ([#8697](https://github.com/Lightning-AI/lightning/pull/8697)) +- Removed the deprecated `save_function` property in `ModelCheckpoint` ([#8680](https://github.com/Lightning-AI/lightning/pull/8680)) +- Removed the deprecated `model` argument from `ModelCheckpoint.save_checkpoint` ([#8688](https://github.com/Lightning-AI/lightning/pull/8688)) +- Removed the deprecated `sync_step` argument from `WandbLogger` ([#8763](https://github.com/Lightning-AI/lightning/pull/8763)) +- Removed the deprecated `Trainer.truncated_bptt_steps` in favor of `LightningModule.truncated_bptt_steps` ([#8826](https://github.com/Lightning-AI/lightning/pull/8826)) +- Removed `LightningModule.write_predictions` and `LightningModule.write_predictions_dict` ([#8850](https://github.com/Lightning-AI/lightning/pull/8850)) +- Removed `on_reset_*_dataloader` hooks in TrainingType Plugins and Accelerators ([#8858](https://github.com/Lightning-AI/lightning/pull/8858)) +- Removed deprecated `GradInformation` module in favor of `pytorch_lightning.utilities.grads` ([#8831](https://github.com/Lightning-AI/lightning/pull/8831/)) +- Removed `TrainingTypePlugin.on_save` and `Accelerator.on_save` ([#9023](https://github.com/Lightning-AI/lightning/pull/9023)) +- Removed `{Accelerator,TrainingTypePlugin,PrecisionPlugin}.post_optimizer_step` ([#9746](https://github.com/Lightning-AI/lightning/pull/9746)) +- Removed deprecated `connect_precision_plugin` and `connect_training_type_plugin` from `Accelerator` ([#9019](https://github.com/Lightning-AI/lightning/pull/9019)) +- Removed `on_train_epoch_end` from `Accelerator` ([#9035](https://github.com/Lightning-AI/lightning/pull/9035)) +- Removed `InterBatchProcessor` in favor of `DataLoaderIterDataFetcher` ([#9052](https://github.com/Lightning-AI/lightning/pull/9052)) +- Removed `Plugin` in `base_plugin.py` in favor of accessing `TrainingTypePlugin` and `PrecisionPlugin` directly instead ([#9066](https://github.com/Lightning-AI/lightning/pull/9066)) +- Removed `teardown` from `ParallelPlugin` ([#8943](https://github.com/Lightning-AI/lightning/pull/8943)) +- Removed deprecated `profiled_functions` argument from `PyTorchProfiler` ([#9178](https://github.com/Lightning-AI/lightning/pull/9178)) +- Removed deprecated `pytorch_lighting.utilities.argparse_utils` module ([#9166](https://github.com/Lightning-AI/lightning/pull/9166)) +- Removed deprecated property `Trainer.running_sanity_check` in favor of `Trainer.sanity_checking` ([#9209](https://github.com/Lightning-AI/lightning/pull/9209)) +- Removed deprecated `BaseProfiler.output_filename` arg from it and its descendants in favor of `dirpath` and `filename` ([#9214](https://github.com/Lightning-AI/lightning/pull/9214)) +- Removed deprecated property `ModelCheckpoint.period` in favor of `ModelCheckpoint.every_n_epochs` ([#9213](https://github.com/Lightning-AI/lightning/pull/9213)) +- Removed deprecated `auto_move_data` decorator ([#9231](https://github.com/Lightning-AI/lightning/pull/9231)) +- Removed deprecated property `LightningModule.datamodule` in favor of `Trainer.datamodule` ([#9233](https://github.com/Lightning-AI/lightning/pull/9233)) +- Removed deprecated properties `DeepSpeedPlugin.cpu_offload*` in favor of `offload_optimizer`, `offload_parameters` and `pin_memory` ([#9244](https://github.com/Lightning-AI/lightning/pull/9244)) +- Removed deprecated property `AcceleratorConnector.is_using_torchelastic` in favor of `TorchElasticEnvironment.is_using_torchelastic()` 
([#9729](https://github.com/Lightning-AI/lightning/pull/9729)) +- Removed `pytorch_lightning.utilities.debugging.InternalDebugger` ([#9680](https://github.com/Lightning-AI/lightning/pull/9680)) +- Removed `call_configure_sharded_model_hook` property from `Accelerator` and `TrainingTypePlugin` ([#9612](https://github.com/Lightning-AI/lightning/pull/9612)) +- Removed `TrainerProperties` mixin and moved property definitions directly into `Trainer` ([#9495](https://github.com/Lightning-AI/lightning/pull/9495)) +- Removed a redundant warning with `ModelCheckpoint(monitor=None)` callback ([#9875](https://github.com/Lightning-AI/lightning/pull/9875)) +- Remove `epoch` from `trainer.logged_metrics` ([#9904](https://github.com/Lightning-AI/lightning/pull/9904)) +- Remove deprecated `distributed_backend` from `Trainer` ([#10017](https://github.com/Lightning-AI/lightning/pull/10017)) +- Removed `process_idx` from the `{DDPSpawnPlugin,TPUSpawnPlugin}.new_process` methods ([#10022](https://github.com/Lightning-AI/lightning/pull/10022)) +- Removed automatic patching of `{train,val,test,predict}_dataloader()` on the `LightningModule` ([#9764](https://github.com/Lightning-AI/lightning/pull/9764)) +- Removed `pytorch_lightning.trainer.connectors.OptimizerConnector` ([#10120](https://github.com/Lightning-AI/lightning/pull/10120)) + +### Fixed + +- Fixed ImageNet evaluation in example ([#10179](https://github.com/Lightning-AI/lightning/pull/10179)) +- Fixed an issue with logger outputs not being finalized correctly after prediction runs ([#8685](https://github.com/Lightning-AI/lightning/pull/8685)) +- Fixed `move_metrics_to_cpu` moving the loss to CPU while training on device ([#9308](https://github.com/Lightning-AI/lightning/pull/9308)) +- Fixed incorrect main progress bar indicator when resuming training mid-epoch ([#9310](https://github.com/Lightning-AI/lightning/pull/9310)) +- Fixed an issue with freeing memory of datafetchers during teardown ([#9387](https://github.com/Lightning-AI/lightning/pull/9387)) +- Fixed a bug where the training step output needed to be `deepcopy`-ed ([#9349](https://github.com/Lightning-AI/lightning/pull/9349)) +- Fixed an issue with freeing memory allocated by the data iterators in `Loop.on_run_end` ([#9386](https://github.com/Lightning-AI/lightning/pull/9386), [#9915](https://github.com/Lightning-AI/lightning/pull/9915)) +- Fixed `BasePredictionWriter` not returning the batch indices in a non-distributed setting ([#9432](https://github.com/Lightning-AI/lightning/pull/9432)) +- Fixed an error when running in XLA environments with no TPU attached ([#9572](https://github.com/Lightning-AI/lightning/pull/9572)) +- Fixed check on torchmetrics logged whose `compute()` output is a multielement tensor ([#9582](https://github.com/Lightning-AI/lightning/pull/9582)) +- Fixed gradient accumulation for `DDPShardedPlugin` ([#9122](https://github.com/Lightning-AI/lightning/pull/9122)) +- Fixed missing DeepSpeed distributed call ([#9540](https://github.com/Lightning-AI/lightning/pull/9540)) +- Fixed an issue with wrapped LightningModule during evaluation; The LightningModule no longer gets wrapped with data-parallel modules when not fitting in `DDPPlugin`, `DDPSpawnPlugin`, `DDPShardedPlugin`, `DDPSpawnShardedPlugin` ([#9096](https://github.com/Lightning-AI/lightning/pull/9096)) +- Fixed `trainer.accumulate_grad_batches` to be an int on init. 
The default value for it is now `None` inside Trainer ([#9652](https://github.com/Lightning-AI/lightning/pull/9652)) +- Fixed `broadcast` in `DDPPlugin` and `DDPSpawnPlugin` to respect the `src` input ([#9691](https://github.com/Lightning-AI/lightning/pull/9691)) +- Fixed `self.log(on_epoch=True, reduce_fx=sum))` for the `on_batch_start` and `on_train_batch_start` hooks ([#9791](https://github.com/Lightning-AI/lightning/pull/9791)) +- Fixed `self.log(on_epoch=True)` for the `on_batch_start` and `on_train_batch_start` hooks ([#9780](https://github.com/Lightning-AI/lightning/pull/9780)) +- Fixed restoring training state during `Trainer.fit` only ([#9413](https://github.com/Lightning-AI/lightning/pull/9413)) +- Fixed DeepSpeed and Lightning both calling the scheduler ([#9788](https://github.com/Lightning-AI/lightning/pull/9788)) +- Fixed missing arguments when saving hyperparameters from the parent class but not from the child class ([#9800](https://github.com/Lightning-AI/lightning/pull/9800)) +- Fixed DeepSpeed GPU device IDs ([#9847](https://github.com/Lightning-AI/lightning/pull/9847)) +- Reset `val_dataloader` in `tuner/batch_size_scaling` ([#9857](https://github.com/Lightning-AI/lightning/pull/9857)) +- Fixed use of `LightningCLI` in computer_vision_fine_tuning.py example ([#9934](https://github.com/Lightning-AI/lightning/pull/9934)) +- Fixed issue with non-init dataclass fields in `apply_to_collection` ([#9963](https://github.com/Lightning-AI/lightning/pull/9963)) +- Reset `val_dataloader` in `tuner/batch_size_scaling` for binsearch ([#9975](https://github.com/Lightning-AI/lightning/pull/9975)) +- Fixed logic to check for spawn in dataloader `TrainerDataLoadingMixin._worker_check` ([#9902](https://github.com/Lightning-AI/lightning/pull/9902)) +- Fixed `train_dataloader` getting loaded twice when resuming from a checkpoint during `Trainer.fit()` ([#9671](https://github.com/Lightning-AI/lightning/pull/9671)) +- Fixed `LearningRateMonitor` logging with multiple param groups optimizer with no scheduler ([#10044](https://github.com/Lightning-AI/lightning/pull/10044)) +- Fixed undesired side effects being caused by `Trainer` patching dataloader methods on the `LightningModule` ([#9764](https://github.com/Lightning-AI/lightning/pull/9764)) +- Fixed gradients not being unscaled when clipping or logging the gradient norm ([#9287](https://github.com/Lightning-AI/lightning/pull/9287)) +- Fixed `on_before_optimizer_step` getting called before the optimizer closure (including backward) has run ([#10167](https://github.com/Lightning-AI/lightning/pull/10167)) +- Fixed monitor value in `ModelCheckpoint` getting moved to the wrong device in a special case where it becomes NaN ([#10118](https://github.com/Lightning-AI/lightning/pull/10118)) +- Fixed creation of `dirpath` in `BaseProfiler` if it doesn't exist ([#10073](https://github.com/Lightning-AI/lightning/pull/10073)) +- Fixed incorrect handling of sigterm ([#10189](https://github.com/Lightning-AI/lightning/pull/10189)) +- Fixed bug where `log(on_step=True, on_epoch=True, sync_dist=True)` wouldn't reduce the value on step ([#10227](https://github.com/Lightning-AI/lightning/pull/10227)) +- Fixed an issue with `pl.utilities.seed.reset_seed` converting the `PL_SEED_WORKERS` environment variable to `bool` ([#10099](https://github.com/Lightning-AI/lightning/pull/10099)) +- Fixed iterating over a logger collection when `fast_dev_run > 0` ([#10232](https://github.com/Lightning-AI/lightning/pull/10232)) +- Fixed `batch_size` in `ResultCollection` not being 
reset to 1 on epoch end ([#10242](https://github.com/Lightning-AI/lightning/pull/10242)) +- Fixed `distrib_type` not being set when training plugin instances are being passed to the Trainer ([#10251](https://github.com/Lightning-AI/lightning/pull/10251)) ## [1.4.9] - 2021-09-30 -- Fixed `lr_find` to generate same results on multiple calls ([#9704](https://github.com/PyTorchLightning/pytorch-lightning/pull/9704)) -- Fixed `reset` metrics on validation epoch end ([#9717](https://github.com/PyTorchLightning/pytorch-lightning/pull/9717)) -- Fixed input validation for `gradient_clip_val`, `gradient_clip_algorithm`, `track_grad_norm` and `terminate_on_nan` Trainer arguments ([#9595](https://github.com/PyTorchLightning/pytorch-lightning/pull/9595)) -- Reset metrics before each task starts ([#9410](https://github.com/PyTorchLightning/pytorch-lightning/pull/9410)) +- Fixed `lr_find` to generate same results on multiple calls ([#9704](https://github.com/Lightning-AI/lightning/pull/9704)) +- Fixed `reset` metrics on validation epoch end ([#9717](https://github.com/Lightning-AI/lightning/pull/9717)) +- Fixed input validation for `gradient_clip_val`, `gradient_clip_algorithm`, `track_grad_norm` and `terminate_on_nan` Trainer arguments ([#9595](https://github.com/Lightning-AI/lightning/pull/9595)) +- Reset metrics before each task starts ([#9410](https://github.com/Lightning-AI/lightning/pull/9410)) ## [1.4.8] - 2021-09-22 -- Fixed error reporting in DDP process reconciliation when processes are launched by an external agent ([#9389](https://github.com/PyTorchLightning/pytorch-lightning/pull/9389)) -- Added PL_RECONCILE_PROCESS environment variable to enable process reconciliation regardless of cluster environment settings ([#9389](https://github.com/PyTorchLightning/pytorch-lightning/pull/9389)) -- Fixed `add_argparse_args` raising `TypeError` when args are typed as `typing.Generic` in Python 3.6 ([#9554](https://github.com/PyTorchLightning/pytorch-lightning/pull/9554)) -- Fixed back-compatibility for saving hyperparameters from a single container and inferring its argument name by reverting [#9125](https://github.com/PyTorchLightning/pytorch-lightning/pull/9125) ([#9642](https://github.com/PyTorchLightning/pytorch-lightning/pull/9642)) +- Fixed error reporting in DDP process reconciliation when processes are launched by an external agent ([#9389](https://github.com/Lightning-AI/lightning/pull/9389)) +- Added PL_RECONCILE_PROCESS environment variable to enable process reconciliation regardless of cluster environment settings ([#9389](https://github.com/Lightning-AI/lightning/pull/9389)) +- Fixed `add_argparse_args` raising `TypeError` when args are typed as `typing.Generic` in Python 3.6 ([#9554](https://github.com/Lightning-AI/lightning/pull/9554)) +- Fixed back-compatibility for saving hyperparameters from a single container and inferring its argument name by reverting [#9125](https://github.com/Lightning-AI/lightning/pull/9125) ([#9642](https://github.com/Lightning-AI/lightning/pull/9642)) ## [1.4.7] - 2021-09-14 -- Fixed logging of nan parameters ([#9364](https://github.com/PyTorchLightning/pytorch-lightning/pull/9364)) -- Fixed `replace_sampler` missing the batch size under specific conditions ([#9367](https://github.com/PyTorchLightning/pytorch-lightning/pull/9367)) -- Pass init args to ShardedDataParallel ([#9483](https://github.com/PyTorchLightning/pytorch-lightning/pull/9483)) -- Fixed collision of user argument when using ShardedDDP 
([#9512](https://github.com/PyTorchLightning/pytorch-lightning/pull/9512)) -- Fixed DeepSpeed crash for RNNs ([#9489](https://github.com/PyTorchLightning/pytorch-lightning/pull/9489)) +- Fixed logging of nan parameters ([#9364](https://github.com/Lightning-AI/lightning/pull/9364)) +- Fixed `replace_sampler` missing the batch size under specific conditions ([#9367](https://github.com/Lightning-AI/lightning/pull/9367)) +- Pass init args to ShardedDataParallel ([#9483](https://github.com/Lightning-AI/lightning/pull/9483)) +- Fixed collision of user argument when using ShardedDDP ([#9512](https://github.com/Lightning-AI/lightning/pull/9512)) +- Fixed DeepSpeed crash for RNNs ([#9489](https://github.com/Lightning-AI/lightning/pull/9489)) ## [1.4.6] - 2021-09-07 -- Fixed an issues with export to ONNX format when a model has multiple inputs ([#8800](https://github.com/PyTorchLightning/pytorch-lightning/pull/8800)) -- Removed deprecation warnings being called for `on_{task}_dataloader` ([#9279](https://github.com/PyTorchLightning/pytorch-lightning/pull/9279)) +- Fixed an issues with export to ONNX format when a model has multiple inputs ([#8800](https://github.com/Lightning-AI/lightning/pull/8800)) +- Removed deprecation warnings being called for `on_{task}_dataloader` ([#9279](https://github.com/Lightning-AI/lightning/pull/9279)) - Fixed save/load/resume from checkpoint for DeepSpeed Plugin ( - [#8397](https://github.com/PyTorchLightning/pytorch-lightning/pull/8397), - [#8644](https://github.com/PyTorchLightning/pytorch-lightning/pull/8644), - [#8627](https://github.com/PyTorchLightning/pytorch-lightning/pull/8627)) -- Fixed `EarlyStopping` running on train epoch end when `check_val_every_n_epoch>1` is set ([#9156](https://github.com/PyTorchLightning/pytorch-lightning/pull/9156)) -- Fixed an issue with logger outputs not being finalized correctly after prediction runs ([#8333](https://github.com/PyTorchLightning/pytorch-lightning/pull/8333)) -- Fixed the Apex and DeepSpeed plugin closure running after the `on_before_optimizer_step` hook ([#9288](https://github.com/PyTorchLightning/pytorch-lightning/pull/9288)) -- Fixed the Native AMP plugin closure not running with manual optimization ([#9288](https://github.com/PyTorchLightning/pytorch-lightning/pull/9288)) -- Fixed bug where data-loading functions where not getting the correct running stage passed ([#8858](https://github.com/PyTorchLightning/pytorch-lightning/pull/8858)) -- Fixed intra-epoch evaluation outputs staying in memory when the respective `*_epoch_end` hook wasn't overridden ([#9261](https://github.com/PyTorchLightning/pytorch-lightning/pull/9261)) -- Fixed error handling in DDP process reconciliation when `_sync_dir` was not initialized ([#9267](https://github.com/PyTorchLightning/pytorch-lightning/pull/9267)) -- Fixed PyTorch Profiler not enabled for manual optimization ([#9316](https://github.com/PyTorchLightning/pytorch-lightning/pull/9316)) -- Fixed inspection of other args when a container is specified in `save_hyperparameters` ([#9125](https://github.com/PyTorchLightning/pytorch-lightning/pull/9125)) -- Fixed signature of `Timer.on_train_epoch_end` and `StochasticWeightAveraging.on_train_epoch_end` to prevent unwanted deprecation warnings ([#9347](https://github.com/PyTorchLightning/pytorch-lightning/pull/9347)) + [#8397](https://github.com/Lightning-AI/lightning/pull/8397), + [#8644](https://github.com/Lightning-AI/lightning/pull/8644), + [#8627](https://github.com/Lightning-AI/lightning/pull/8627)) +- Fixed `EarlyStopping` 
running on train epoch end when `check_val_every_n_epoch>1` is set ([#9156](https://github.com/Lightning-AI/lightning/pull/9156))
+- Fixed an issue with logger outputs not being finalized correctly after prediction runs ([#8333](https://github.com/Lightning-AI/lightning/pull/8333))
+- Fixed the Apex and DeepSpeed plugin closure running after the `on_before_optimizer_step` hook ([#9288](https://github.com/Lightning-AI/lightning/pull/9288))
+- Fixed the Native AMP plugin closure not running with manual optimization ([#9288](https://github.com/Lightning-AI/lightning/pull/9288))
+- Fixed bug where data-loading functions where not getting the correct running stage passed ([#8858](https://github.com/Lightning-AI/lightning/pull/8858))
+- Fixed intra-epoch evaluation outputs staying in memory when the respective `*_epoch_end` hook wasn't overridden ([#9261](https://github.com/Lightning-AI/lightning/pull/9261))
+- Fixed error handling in DDP process reconciliation when `_sync_dir` was not initialized ([#9267](https://github.com/Lightning-AI/lightning/pull/9267))
+- Fixed PyTorch Profiler not enabled for manual optimization ([#9316](https://github.com/Lightning-AI/lightning/pull/9316))
+- Fixed inspection of other args when a container is specified in `save_hyperparameters` ([#9125](https://github.com/Lightning-AI/lightning/pull/9125))
+- Fixed signature of `Timer.on_train_epoch_end` and `StochasticWeightAveraging.on_train_epoch_end` to prevent unwanted deprecation warnings ([#9347](https://github.com/Lightning-AI/lightning/pull/9347))
 
 ## [1.4.5] - 2021-08-31
 
-- Fixed reduction using `self.log(sync_dict=True, reduce_fx={mean,max})` ([#9142](https://github.com/PyTorchLightning/pytorch-lightning/pull/9142))
-- Fixed not setting a default value for `max_epochs` if `max_time` was specified on the `Trainer` constructor ([#9072](https://github.com/PyTorchLightning/pytorch-lightning/pull/9072))
-- Fixed the CometLogger, no longer modifies the metrics in place. Instead creates a copy of metrics before performing any operations ([#9150](https://github.com/PyTorchLightning/pytorch-lightning/pull/9150))
-- Fixed `DDP` "CUDA error: initialization error" due to a `copy` instead of `deepcopy` on `ResultCollection` ([#9239](https://github.com/PyTorchLightning/pytorch-lightning/pull/9239))
+- Fixed reduction using `self.log(sync_dict=True, reduce_fx={mean,max})` ([#9142](https://github.com/Lightning-AI/lightning/pull/9142))
+- Fixed not setting a default value for `max_epochs` if `max_time` was specified on the `Trainer` constructor ([#9072](https://github.com/Lightning-AI/lightning/pull/9072))
+- Fixed the CometLogger, no longer modifies the metrics in place. Instead creates a copy of metrics before performing any operations ([#9150](https://github.com/Lightning-AI/lightning/pull/9150))
+- Fixed `DDP` "CUDA error: initialization error" due to a `copy` instead of `deepcopy` on `ResultCollection` ([#9239](https://github.com/Lightning-AI/lightning/pull/9239))
 
 ## [1.4.4] - 2021-08-24
 
-- Fixed a bug in the binary search mode of auto batch size scaling where exception was raised if the first trainer run resulted in OOM ([#8954](https://github.com/PyTorchLightning/pytorch-lightning/pull/8954))
-- Fixed a bug causing logging with `log_gpu_memory='min_max'` not working ([#9013](https://github.com/PyTorchLightning/pytorch-lightning/pull/9013))
+- Fixed a bug in the binary search mode of auto batch size scaling where exception was raised if the first trainer run resulted in OOM ([#8954](https://github.com/Lightning-AI/lightning/pull/8954))
+- Fixed a bug causing logging with `log_gpu_memory='min_max'` not working ([#9013](https://github.com/Lightning-AI/lightning/pull/9013))
 
 ## [1.4.3] - 2021-08-17
 
-- Fixed plateau scheduler stepping on incomplete epoch ([#8861](https://github.com/PyTorchLightning/pytorch-lightning/pull/8861))
-- Fixed infinite loop with `CycleIterator` and multiple loaders ([#8889](https://github.com/PyTorchLightning/pytorch-lightning/pull/8889))
-- Fixed `StochasticWeightAveraging` with a list of learning rates not applying them to each param group ([#8747](https://github.com/PyTorchLightning/pytorch-lightning/pull/8747))
-- Restore original loaders if replaced by entrypoint ([#8885](https://github.com/PyTorchLightning/pytorch-lightning/pull/8885))
-- Fixed lost reference to `_Metadata` object in `ResultMetricCollection` ([#8932](https://github.com/PyTorchLightning/pytorch-lightning/pull/8932))
-- Ensure the existence of `DDPPlugin._sync_dir` in `reconciliate_processes` ([#8939](https://github.com/PyTorchLightning/pytorch-lightning/pull/8939))
+- Fixed plateau scheduler stepping on incomplete epoch ([#8861](https://github.com/Lightning-AI/lightning/pull/8861))
+- Fixed infinite loop with `CycleIterator` and multiple loaders ([#8889](https://github.com/Lightning-AI/lightning/pull/8889))
+- Fixed `StochasticWeightAveraging` with a list of learning rates not applying them to each param group ([#8747](https://github.com/Lightning-AI/lightning/pull/8747))
+- Restore original loaders if replaced by entrypoint ([#8885](https://github.com/Lightning-AI/lightning/pull/8885))
+- Fixed lost reference to `_Metadata` object in `ResultMetricCollection` ([#8932](https://github.com/Lightning-AI/lightning/pull/8932))
+- Ensure the existence of `DDPPlugin._sync_dir` in `reconciliate_processes` ([#8939](https://github.com/Lightning-AI/lightning/pull/8939))
 
 ## [1.4.2] - 2021-08-10
 
-- Fixed recursive call for `apply_to_collection(include_none=False)` ([#8719](https://github.com/PyTorchLightning/pytorch-lightning/pull/8719))
-- Fixed truncated backprop through time enablement when set as a property on the LightningModule and not the Trainer ([#8804](https://github.com/PyTorchLightning/pytorch-lightning/pull/8804/))
-- Fixed comments and exception message for metrics_to_scalars ([#8782](https://github.com/PyTorchLightning/pytorch-lightning/pull/8782/))
-- Fixed typo error in LightningLoggerBase.after_save_checkpoint docstring ([#8737](https://github.com/PyTorchLightning/pytorch-lightning/pull/8737/))
+- Fixed recursive call for `apply_to_collection(include_none=False)` ([#8719](https://github.com/Lightning-AI/lightning/pull/8719))
+- Fixed truncated backprop
through time enablement when set as a property on the LightningModule and not the Trainer ([#8804](https://github.com/Lightning-AI/lightning/pull/8804/)) +- Fixed comments and exception message for metrics_to_scalars ([#8782](https://github.com/Lightning-AI/lightning/pull/8782/)) +- Fixed typo error in LightningLoggerBase.after_save_checkpoint docstring ([#8737](https://github.com/Lightning-AI/lightning/pull/8737/)) ## [1.4.1] - 2021-08-03 -- Fixed `trainer.fit_loop.split_idx` always returning `None` ([#8601](https://github.com/PyTorchLightning/pytorch-lightning/pull/8601)) -- Fixed references for `ResultCollection.extra` ([#8622](https://github.com/PyTorchLightning/pytorch-lightning/pull/8622)) -- Fixed reference issues during epoch end result collection ([#8621](https://github.com/PyTorchLightning/pytorch-lightning/pull/8621)) -- Fixed horovod auto-detection when horovod is not installed and the launcher is `mpirun` ([#8610](https://github.com/PyTorchLightning/pytorch-lightning/pull/8610)) -- Fixed an issue with `training_step` outputs not getting collected correctly for `training_epoch_end` ([#8613](https://github.com/PyTorchLightning/pytorch-lightning/pull/8613)) -- Fixed distributed types support for CPUs ([#8667](https://github.com/PyTorchLightning/pytorch-lightning/pull/8667)) -- Fixed a deadlock issue with DDP and torchelastic ([#8655](https://github.com/PyTorchLightning/pytorch-lightning/pull/8655)) -- Fixed `accelerator=ddp` choice for CPU ([#8645](https://github.com/PyTorchLightning/pytorch-lightning/pull/8645)) +- Fixed `trainer.fit_loop.split_idx` always returning `None` ([#8601](https://github.com/Lightning-AI/lightning/pull/8601)) +- Fixed references for `ResultCollection.extra` ([#8622](https://github.com/Lightning-AI/lightning/pull/8622)) +- Fixed reference issues during epoch end result collection ([#8621](https://github.com/Lightning-AI/lightning/pull/8621)) +- Fixed horovod auto-detection when horovod is not installed and the launcher is `mpirun` ([#8610](https://github.com/Lightning-AI/lightning/pull/8610)) +- Fixed an issue with `training_step` outputs not getting collected correctly for `training_epoch_end` ([#8613](https://github.com/Lightning-AI/lightning/pull/8613)) +- Fixed distributed types support for CPUs ([#8667](https://github.com/Lightning-AI/lightning/pull/8667)) +- Fixed a deadlock issue with DDP and torchelastic ([#8655](https://github.com/Lightning-AI/lightning/pull/8655)) +- Fixed `accelerator=ddp` choice for CPU ([#8645](https://github.com/Lightning-AI/lightning/pull/8645)) ## [1.4.0] - 2021-07-27 ### Added -- Added `extract_batch_size` utility and corresponding tests to extract batch dimension from multiple batch types ([#8357](https://github.com/PyTorchLightning/pytorch-lightning/pull/8357/)) -- Added support for named parameter groups in `LearningRateMonitor` ([#7987](https://github.com/PyTorchLightning/pytorch-lightning/pull/7987)) -- Added `dataclass` support for `pytorch_lightning.utilities.apply_to_collection` ([#7935](https://github.com/PyTorchLightning/pytorch-lightning/pull/7935)) -- Added support to `LightningModule.to_torchscript` for saving to custom filesystems with `fsspec` ([#7617](https://github.com/PyTorchLightning/pytorch-lightning/pull/7617)) +- Added `extract_batch_size` utility and corresponding tests to extract batch dimension from multiple batch types ([#8357](https://github.com/Lightning-AI/lightning/pull/8357/)) +- Added support for named parameter groups in `LearningRateMonitor` 
([#7987](https://github.com/Lightning-AI/lightning/pull/7987)) +- Added `dataclass` support for `pytorch_lightning.utilities.apply_to_collection` ([#7935](https://github.com/Lightning-AI/lightning/pull/7935)) +- Added support to `LightningModule.to_torchscript` for saving to custom filesystems with `fsspec` ([#7617](https://github.com/Lightning-AI/lightning/pull/7617)) - Added `KubeflowEnvironment` for use with the `PyTorchJob` operator in Kubeflow -- Added LightningCLI support for config files on object stores ([#7521](https://github.com/PyTorchLightning/pytorch-lightning/pull/7521)) -- Added `ModelPruning(prune_on_train_epoch_end=True|False)` to choose when to apply pruning ([#7704](https://github.com/PyTorchLightning/pytorch-lightning/pull/7704)) -- Added support for checkpointing based on a provided time interval during training ([#7515](https://github.com/PyTorchLightning/pytorch-lightning/pull/7515)) +- Added LightningCLI support for config files on object stores ([#7521](https://github.com/Lightning-AI/lightning/pull/7521)) +- Added `ModelPruning(prune_on_train_epoch_end=True|False)` to choose when to apply pruning ([#7704](https://github.com/Lightning-AI/lightning/pull/7704)) +- Added support for checkpointing based on a provided time interval during training ([#7515](https://github.com/Lightning-AI/lightning/pull/7515)) - Progress tracking - * Added dataclasses for progress tracking ([#6603](https://github.com/PyTorchLightning/pytorch-lightning/pull/6603), - [#7574](https://github.com/PyTorchLightning/pytorch-lightning/pull/7574), - [#8140](https://github.com/PyTorchLightning/pytorch-lightning/pull/8140), - [#8362](https://github.com/PyTorchLightning/pytorch-lightning/pull/8362)) - * Add `{,load_}state_dict` to the progress tracking dataclasses ([#8140](https://github.com/PyTorchLightning/pytorch-lightning/pull/8140)) - * Connect the progress tracking dataclasses to the loops ([#8244](https://github.com/PyTorchLightning/pytorch-lightning/pull/8244), - [#8362](https://github.com/PyTorchLightning/pytorch-lightning/pull/8362)) - * Do not reset the progress tracking dataclasses total counters ([#8475](https://github.com/PyTorchLightning/pytorch-lightning/pull/8475)) -- Added support for passing a `LightningDataModule` positionally as the second argument to `trainer.{validate,test,predict}` ([#7431](https://github.com/PyTorchLightning/pytorch-lightning/pull/7431)) -- Added argument `trainer.predict(ckpt_path)` ([#7430](https://github.com/PyTorchLightning/pytorch-lightning/pull/7430)) -- Added `clip_grad_by_value` support for TPUs ([#7025](https://github.com/PyTorchLightning/pytorch-lightning/pull/7025)) -- Added support for passing any class to `is_overridden` ([#7918](https://github.com/PyTorchLightning/pytorch-lightning/pull/7918)) -- Added `sub_dir` parameter to `TensorBoardLogger` ([#6195](https://github.com/PyTorchLightning/pytorch-lightning/pull/6195)) -- Added correct `dataloader_idx` to batch transfer hooks ([#6241](https://github.com/PyTorchLightning/pytorch-lightning/pull/6241)) -- Added `include_none=bool` argument to `apply_to_collection` ([#7769](https://github.com/PyTorchLightning/pytorch-lightning/pull/7769)) -- Added `apply_to_collections` to apply a function to two zipped collections ([#7769](https://github.com/PyTorchLightning/pytorch-lightning/pull/7769)) -- Added `ddp_fully_sharded` support ([#7487](https://github.com/PyTorchLightning/pytorch-lightning/pull/7487)) -- Added `should_rank_save_checkpoint` property to Training Plugins 
([#7684](https://github.com/PyTorchLightning/pytorch-lightning/pull/7684)) -- Added `log_grad_norm` hook to `LightningModule` to customize the logging of gradient norms ([#7873](https://github.com/PyTorchLightning/pytorch-lightning/pull/7873)) -- Added `save_config_filename` init argument to `LightningCLI` to ease resolving name conflicts ([#7741](https://github.com/PyTorchLightning/pytorch-lightning/pull/7741)) -- Added `save_config_overwrite` init argument to `LightningCLI` to ease overwriting existing config files ([#8059](https://github.com/PyTorchLightning/pytorch-lightning/pull/8059)) -- Added reset dataloader hooks to Training Plugins and Accelerators ([#7861](https://github.com/PyTorchLightning/pytorch-lightning/pull/7861)) -- Added trainer stage hooks for Training Plugins and Accelerators ([#7864](https://github.com/PyTorchLightning/pytorch-lightning/pull/7864)) -- Added the `on_before_optimizer_step` hook ([#8048](https://github.com/PyTorchLightning/pytorch-lightning/pull/8048)) -- Added IPU Accelerator ([#7867](https://github.com/PyTorchLightning/pytorch-lightning/pull/7867)) + * Added dataclasses for progress tracking ([#6603](https://github.com/Lightning-AI/lightning/pull/6603), + [#7574](https://github.com/Lightning-AI/lightning/pull/7574), + [#8140](https://github.com/Lightning-AI/lightning/pull/8140), + [#8362](https://github.com/Lightning-AI/lightning/pull/8362)) + * Add `{,load_}state_dict` to the progress tracking dataclasses ([#8140](https://github.com/Lightning-AI/lightning/pull/8140)) + * Connect the progress tracking dataclasses to the loops ([#8244](https://github.com/Lightning-AI/lightning/pull/8244), + [#8362](https://github.com/Lightning-AI/lightning/pull/8362)) + * Do not reset the progress tracking dataclasses total counters ([#8475](https://github.com/Lightning-AI/lightning/pull/8475)) +- Added support for passing a `LightningDataModule` positionally as the second argument to `trainer.{validate,test,predict}` ([#7431](https://github.com/Lightning-AI/lightning/pull/7431)) +- Added argument `trainer.predict(ckpt_path)` ([#7430](https://github.com/Lightning-AI/lightning/pull/7430)) +- Added `clip_grad_by_value` support for TPUs ([#7025](https://github.com/Lightning-AI/lightning/pull/7025)) +- Added support for passing any class to `is_overridden` ([#7918](https://github.com/Lightning-AI/lightning/pull/7918)) +- Added `sub_dir` parameter to `TensorBoardLogger` ([#6195](https://github.com/Lightning-AI/lightning/pull/6195)) +- Added correct `dataloader_idx` to batch transfer hooks ([#6241](https://github.com/Lightning-AI/lightning/pull/6241)) +- Added `include_none=bool` argument to `apply_to_collection` ([#7769](https://github.com/Lightning-AI/lightning/pull/7769)) +- Added `apply_to_collections` to apply a function to two zipped collections ([#7769](https://github.com/Lightning-AI/lightning/pull/7769)) +- Added `ddp_fully_sharded` support ([#7487](https://github.com/Lightning-AI/lightning/pull/7487)) +- Added `should_rank_save_checkpoint` property to Training Plugins ([#7684](https://github.com/Lightning-AI/lightning/pull/7684)) +- Added `log_grad_norm` hook to `LightningModule` to customize the logging of gradient norms ([#7873](https://github.com/Lightning-AI/lightning/pull/7873)) +- Added `save_config_filename` init argument to `LightningCLI` to ease resolving name conflicts ([#7741](https://github.com/Lightning-AI/lightning/pull/7741)) +- Added `save_config_overwrite` init argument to `LightningCLI` to ease overwriting existing config files 
([#8059](https://github.com/Lightning-AI/lightning/pull/8059)) +- Added reset dataloader hooks to Training Plugins and Accelerators ([#7861](https://github.com/Lightning-AI/lightning/pull/7861)) +- Added trainer stage hooks for Training Plugins and Accelerators ([#7864](https://github.com/Lightning-AI/lightning/pull/7864)) +- Added the `on_before_optimizer_step` hook ([#8048](https://github.com/Lightning-AI/lightning/pull/8048)) +- Added IPU Accelerator ([#7867](https://github.com/Lightning-AI/lightning/pull/7867)) - Fault-tolerant training - * Added `{,load_}state_dict` to `ResultCollection` ([#7948](https://github.com/PyTorchLightning/pytorch-lightning/pull/7948)) - * Added `{,load_}state_dict` to `Loops` ([#8197](https://github.com/PyTorchLightning/pytorch-lightning/pull/8197)) - * Added `FastForwardSampler` and `CaptureIterableDataset` ([#8307](https://github.com/PyTorchLightning/pytorch-lightning/pull/8307)) - * Set `Loop.restarting=False` at the end of the first iteration ([#8362](https://github.com/PyTorchLightning/pytorch-lightning/pull/8362)) - * Save the loops state with the checkpoint (opt-in) ([#8362](https://github.com/PyTorchLightning/pytorch-lightning/pull/8362)) - * Save a checkpoint to restore the state on exception (opt-in) ([#8362](https://github.com/PyTorchLightning/pytorch-lightning/pull/8362)) - * Added `state_dict` and `load_state_dict` utilities for `CombinedLoader` + utilities for dataloader ([#8364](https://github.com/PyTorchLightning/pytorch-lightning/pull/8364)) -- Added `rank_zero_only` to `LightningModule.log` function ([#7966](https://github.com/PyTorchLightning/pytorch-lightning/pull/7966)) -- Added `metric_attribute` to `LightningModule.log` function ([#7966](https://github.com/PyTorchLightning/pytorch-lightning/pull/7966)) -- Added a warning if `Trainer(log_every_n_steps)` is a value too high for the training dataloader ([#7734](https://github.com/PyTorchLightning/pytorch-lightning/pull/7734)) -- Added LightningCLI support for argument links applied on instantiation ([#7895](https://github.com/PyTorchLightning/pytorch-lightning/pull/7895)) -- Added LightningCLI support for configurable callbacks that should always be present ([#7964](https://github.com/PyTorchLightning/pytorch-lightning/pull/7964)) -- Added DeepSpeed Infinity Support, and updated to DeepSpeed 0.4.0 ([#7234](https://github.com/PyTorchLightning/pytorch-lightning/pull/7234)) -- Added support for `torch.nn.UninitializedParameter` in `ModelSummary` ([#7642](https://github.com/PyTorchLightning/pytorch-lightning/pull/7642)) -- Added support `LightningModule.save_hyperparameters` when `LightningModule` is a dataclass ([#7992](https://github.com/PyTorchLightning/pytorch-lightning/pull/7992)) -- Added support for overriding `optimizer_zero_grad` and `optimizer_step` when using accumulate_grad_batches ([#7980](https://github.com/PyTorchLightning/pytorch-lightning/pull/7980)) -- Added `logger` boolean flag to `save_hyperparameters` ([#7960](https://github.com/PyTorchLightning/pytorch-lightning/pull/7960)) -- Added support for calling scripts using the module syntax (`python -m package.script`) ([#8073](https://github.com/PyTorchLightning/pytorch-lightning/pull/8073)) -- Added support for optimizers and learning rate schedulers to `LightningCLI` ([#8093](https://github.com/PyTorchLightning/pytorch-lightning/pull/8093)) -- Added XLA Profiler ([#8014](https://github.com/PyTorchLightning/pytorch-lightning/pull/8014)) -- Added `PrecisionPlugin.{pre,post}_backward` 
([#8328](https://github.com/PyTorchLightning/pytorch-lightning/pull/8328)) -- Added `on_load_checkpoint` and `on_save_checkpoint` hooks to the `PrecisionPlugin` base class ([#7831](https://github.com/PyTorchLightning/pytorch-lightning/pull/7831)) -- Added `max_depth` parameter in `ModelSummary` ([#8062](https://github.com/PyTorchLightning/pytorch-lightning/pull/8062)) -- Added `XLAStatsMonitor` callback ([#8235](https://github.com/PyTorchLightning/pytorch-lightning/pull/8235)) -- Added `restore` function and `restarting` attribute to base `Loop` ([#8247](https://github.com/PyTorchLightning/pytorch-lightning/pull/8247)) -- Added support for `save_hyperparameters` in `LightningDataModule` ([#3792](https://github.com/PyTorchLightning/pytorch-lightning/pull/3792)) -- Added the `ModelCheckpoint(save_on_train_epoch_end)` to choose when to run the saving logic ([#8389](https://github.com/PyTorchLightning/pytorch-lightning/pull/8389)) -- Added `LSFEnvironment` for distributed training with the LSF resource manager `jsrun` ([#5102](https://github.com/PyTorchLightning/pytorch-lightning/pull/5102)) -- Added support for `accelerator='cpu'|'gpu'|'tpu'|'ipu'|'auto'` ([#7808](https://github.com/PyTorchLightning/pytorch-lightning/pull/7808)) -- Added `tpu_spawn_debug` to plugin registry ([#7933](https://github.com/PyTorchLightning/pytorch-lightning/pull/7933)) -- Enabled traditional/manual launching of DDP processes through `LOCAL_RANK` and `NODE_RANK` environment variable assignments ([#7480](https://github.com/PyTorchLightning/pytorch-lightning/pull/7480)) -- Added `quantize_on_fit_end` argument to `QuantizationAwareTraining` ([#8464](https://github.com/PyTorchLightning/pytorch-lightning/pull/8464)) -- Added experimental support for loop specialization ([#8226](https://github.com/PyTorchLightning/pytorch-lightning/pull/8226)) -- Added support for `devices` flag to Trainer ([#8440](https://github.com/PyTorchLightning/pytorch-lightning/pull/8440)) -- Added private `prevent_trainer_and_dataloaders_deepcopy` context manager on the `LightningModule` ([#8472](https://github.com/PyTorchLightning/pytorch-lightning/pull/8472)) -- Added support for providing callables to the Lightning CLI instead of types ([#8400](https://github.com/PyTorchLightning/pytorch-lightning/pull/8400)) + * Added `{,load_}state_dict` to `ResultCollection` ([#7948](https://github.com/Lightning-AI/lightning/pull/7948)) + * Added `{,load_}state_dict` to `Loops` ([#8197](https://github.com/Lightning-AI/lightning/pull/8197)) + * Added `FastForwardSampler` and `CaptureIterableDataset` ([#8307](https://github.com/Lightning-AI/lightning/pull/8307)) + * Set `Loop.restarting=False` at the end of the first iteration ([#8362](https://github.com/Lightning-AI/lightning/pull/8362)) + * Save the loops state with the checkpoint (opt-in) ([#8362](https://github.com/Lightning-AI/lightning/pull/8362)) + * Save a checkpoint to restore the state on exception (opt-in) ([#8362](https://github.com/Lightning-AI/lightning/pull/8362)) + * Added `state_dict` and `load_state_dict` utilities for `CombinedLoader` + utilities for dataloader ([#8364](https://github.com/Lightning-AI/lightning/pull/8364)) +- Added `rank_zero_only` to `LightningModule.log` function ([#7966](https://github.com/Lightning-AI/lightning/pull/7966)) +- Added `metric_attribute` to `LightningModule.log` function ([#7966](https://github.com/Lightning-AI/lightning/pull/7966)) +- Added a warning if `Trainer(log_every_n_steps)` is a value too high for the training dataloader 
([#7734](https://github.com/Lightning-AI/lightning/pull/7734)) +- Added LightningCLI support for argument links applied on instantiation ([#7895](https://github.com/Lightning-AI/lightning/pull/7895)) +- Added LightningCLI support for configurable callbacks that should always be present ([#7964](https://github.com/Lightning-AI/lightning/pull/7964)) +- Added DeepSpeed Infinity Support, and updated to DeepSpeed 0.4.0 ([#7234](https://github.com/Lightning-AI/lightning/pull/7234)) +- Added support for `torch.nn.UninitializedParameter` in `ModelSummary` ([#7642](https://github.com/Lightning-AI/lightning/pull/7642)) +- Added support `LightningModule.save_hyperparameters` when `LightningModule` is a dataclass ([#7992](https://github.com/Lightning-AI/lightning/pull/7992)) +- Added support for overriding `optimizer_zero_grad` and `optimizer_step` when using accumulate_grad_batches ([#7980](https://github.com/Lightning-AI/lightning/pull/7980)) +- Added `logger` boolean flag to `save_hyperparameters` ([#7960](https://github.com/Lightning-AI/lightning/pull/7960)) +- Added support for calling scripts using the module syntax (`python -m package.script`) ([#8073](https://github.com/Lightning-AI/lightning/pull/8073)) +- Added support for optimizers and learning rate schedulers to `LightningCLI` ([#8093](https://github.com/Lightning-AI/lightning/pull/8093)) +- Added XLA Profiler ([#8014](https://github.com/Lightning-AI/lightning/pull/8014)) +- Added `PrecisionPlugin.{pre,post}_backward` ([#8328](https://github.com/Lightning-AI/lightning/pull/8328)) +- Added `on_load_checkpoint` and `on_save_checkpoint` hooks to the `PrecisionPlugin` base class ([#7831](https://github.com/Lightning-AI/lightning/pull/7831)) +- Added `max_depth` parameter in `ModelSummary` ([#8062](https://github.com/Lightning-AI/lightning/pull/8062)) +- Added `XLAStatsMonitor` callback ([#8235](https://github.com/Lightning-AI/lightning/pull/8235)) +- Added `restore` function and `restarting` attribute to base `Loop` ([#8247](https://github.com/Lightning-AI/lightning/pull/8247)) +- Added support for `save_hyperparameters` in `LightningDataModule` ([#3792](https://github.com/Lightning-AI/lightning/pull/3792)) +- Added the `ModelCheckpoint(save_on_train_epoch_end)` to choose when to run the saving logic ([#8389](https://github.com/Lightning-AI/lightning/pull/8389)) +- Added `LSFEnvironment` for distributed training with the LSF resource manager `jsrun` ([#5102](https://github.com/Lightning-AI/lightning/pull/5102)) +- Added support for `accelerator='cpu'|'gpu'|'tpu'|'ipu'|'auto'` ([#7808](https://github.com/Lightning-AI/lightning/pull/7808)) +- Added `tpu_spawn_debug` to plugin registry ([#7933](https://github.com/Lightning-AI/lightning/pull/7933)) +- Enabled traditional/manual launching of DDP processes through `LOCAL_RANK` and `NODE_RANK` environment variable assignments ([#7480](https://github.com/Lightning-AI/lightning/pull/7480)) +- Added `quantize_on_fit_end` argument to `QuantizationAwareTraining` ([#8464](https://github.com/Lightning-AI/lightning/pull/8464)) +- Added experimental support for loop specialization ([#8226](https://github.com/Lightning-AI/lightning/pull/8226)) +- Added support for `devices` flag to Trainer ([#8440](https://github.com/Lightning-AI/lightning/pull/8440)) +- Added private `prevent_trainer_and_dataloaders_deepcopy` context manager on the `LightningModule` ([#8472](https://github.com/Lightning-AI/lightning/pull/8472)) +- Added support for providing callables to the Lightning CLI instead of types 
([#8400](https://github.com/Lightning-AI/lightning/pull/8400)) ### Changed -- Decoupled device parsing logic from Accelerator connector to Trainer ([#8180](https://github.com/PyTorchLightning/pytorch-lightning/pull/8180)) -- Changed the `Trainer`'s `checkpoint_callback` argument to allow only boolean values ([#7539](https://github.com/PyTorchLightning/pytorch-lightning/pull/7539)) -- Log epoch metrics before the `on_evaluation_end` hook ([#7272](https://github.com/PyTorchLightning/pytorch-lightning/pull/7272)) -- Explicitly disallow calling `self.log(on_epoch=False)` during epoch-only or single-call hooks ([#7874](https://github.com/PyTorchLightning/pytorch-lightning/pull/7874)) +- Decoupled device parsing logic from Accelerator connector to Trainer ([#8180](https://github.com/Lightning-AI/lightning/pull/8180)) +- Changed the `Trainer`'s `checkpoint_callback` argument to allow only boolean values ([#7539](https://github.com/Lightning-AI/lightning/pull/7539)) +- Log epoch metrics before the `on_evaluation_end` hook ([#7272](https://github.com/Lightning-AI/lightning/pull/7272)) +- Explicitly disallow calling `self.log(on_epoch=False)` during epoch-only or single-call hooks ([#7874](https://github.com/Lightning-AI/lightning/pull/7874)) - Changed these `Trainer` methods to be protected: `call_setup_hook`, `call_configure_sharded_model`, `pre_dispatch`, `dispatch`, `post_dispatch`, `call_teardown_hook`, `run_train`, `run_sanity_check`, `run_evaluate`, `run_evaluation`, `run_predict`, `track_output_for_epoch_end` -- Changed `metrics_to_scalars` to work with any collection or value ([#7888](https://github.com/PyTorchLightning/pytorch-lightning/pull/7888)) -- Changed `clip_grad_norm` to use `torch.nn.utils.clip_grad_norm_` ([#7025](https://github.com/PyTorchLightning/pytorch-lightning/pull/7025)) -- Validation is now always run inside the training epoch scope ([#7357](https://github.com/PyTorchLightning/pytorch-lightning/pull/7357)) -- `ModelCheckpoint` now runs at the end of the training epoch by default ([#8389](https://github.com/PyTorchLightning/pytorch-lightning/pull/8389)) -- `EarlyStopping` now runs at the end of the training epoch by default ([#8286](https://github.com/PyTorchLightning/pytorch-lightning/pull/8286)) +- Changed `metrics_to_scalars` to work with any collection or value ([#7888](https://github.com/Lightning-AI/lightning/pull/7888)) +- Changed `clip_grad_norm` to use `torch.nn.utils.clip_grad_norm_` ([#7025](https://github.com/Lightning-AI/lightning/pull/7025)) +- Validation is now always run inside the training epoch scope ([#7357](https://github.com/Lightning-AI/lightning/pull/7357)) +- `ModelCheckpoint` now runs at the end of the training epoch by default ([#8389](https://github.com/Lightning-AI/lightning/pull/8389)) +- `EarlyStopping` now runs at the end of the training epoch by default ([#8286](https://github.com/Lightning-AI/lightning/pull/8286)) - Refactored Loops - * Moved attributes `global_step`, `current_epoch`, `max/min_steps`, `max/min_epochs`, `batch_idx`, and `total_batch_idx` to TrainLoop ([#7437](https://github.com/PyTorchLightning/pytorch-lightning/pull/7437)) - * Refactored result handling in training loop ([#7506](https://github.com/PyTorchLightning/pytorch-lightning/pull/7506)) - * Moved attributes `hiddens` and `split_idx` to TrainLoop ([#7507](https://github.com/PyTorchLightning/pytorch-lightning/pull/7507)) - * Refactored the logic around manual and automatic optimization inside the optimizer loop 
([#7526](https://github.com/PyTorchLightning/pytorch-lightning/pull/7526)) - * Simplified "should run validation" logic ([#7682](https://github.com/PyTorchLightning/pytorch-lightning/pull/7682)) - * Simplified logic for updating the learning rate for schedulers ([#7682](https://github.com/PyTorchLightning/pytorch-lightning/pull/7682)) - * Removed the `on_epoch` guard from the "should stop" validation check ([#7701](https://github.com/PyTorchLightning/pytorch-lightning/pull/7701)) - * Refactored internal loop interface; added new classes `FitLoop`, `TrainingEpochLoop`, `TrainingBatchLoop` ([#7871](https://github.com/PyTorchLightning/pytorch-lightning/pull/7871), [#8077](https://github.com/PyTorchLightning/pytorch-lightning/pull/8077)) - * Removed `pytorch_lightning/trainer/training_loop.py` ([#7985](https://github.com/PyTorchLightning/pytorch-lightning/pull/7985)) - * Refactored evaluation loop interface; added new classes `DataLoaderLoop`, `EvaluationLoop`, `EvaluationEpochLoop` ([#7990](https://github.com/PyTorchLightning/pytorch-lightning/pull/7990), [#8077](https://github.com/PyTorchLightning/pytorch-lightning/pull/8077)) - * Removed `pytorch_lightning/trainer/evaluation_loop.py` ([#8056](https://github.com/PyTorchLightning/pytorch-lightning/pull/8056)) - * Restricted public access to several internal functions ([#8024](https://github.com/PyTorchLightning/pytorch-lightning/pull/8024)) - * Refactored trainer `_run_*` functions and separate evaluation loops ([#8065](https://github.com/PyTorchLightning/pytorch-lightning/pull/8065)) - * Refactored prediction loop interface; added new classes `PredictionLoop`, `PredictionEpochLoop` ([#7700](https://github.com/PyTorchLightning/pytorch-lightning/pull/7700), [#8077](https://github.com/PyTorchLightning/pytorch-lightning/pull/8077)) - * Removed `pytorch_lightning/trainer/predict_loop.py` ([#8094](https://github.com/PyTorchLightning/pytorch-lightning/pull/8094)) - * Moved result teardown to the loops ([#8245](https://github.com/PyTorchLightning/pytorch-lightning/pull/8245)) - * Improve `Loop` API to better handle children `state_dict` and `progress` ([#8334](https://github.com/PyTorchLightning/pytorch-lightning/pull/8334)) + * Moved attributes `global_step`, `current_epoch`, `max/min_steps`, `max/min_epochs`, `batch_idx`, and `total_batch_idx` to TrainLoop ([#7437](https://github.com/Lightning-AI/lightning/pull/7437)) + * Refactored result handling in training loop ([#7506](https://github.com/Lightning-AI/lightning/pull/7506)) + * Moved attributes `hiddens` and `split_idx` to TrainLoop ([#7507](https://github.com/Lightning-AI/lightning/pull/7507)) + * Refactored the logic around manual and automatic optimization inside the optimizer loop ([#7526](https://github.com/Lightning-AI/lightning/pull/7526)) + * Simplified "should run validation" logic ([#7682](https://github.com/Lightning-AI/lightning/pull/7682)) + * Simplified logic for updating the learning rate for schedulers ([#7682](https://github.com/Lightning-AI/lightning/pull/7682)) + * Removed the `on_epoch` guard from the "should stop" validation check ([#7701](https://github.com/Lightning-AI/lightning/pull/7701)) + * Refactored internal loop interface; added new classes `FitLoop`, `TrainingEpochLoop`, `TrainingBatchLoop` ([#7871](https://github.com/Lightning-AI/lightning/pull/7871), [#8077](https://github.com/Lightning-AI/lightning/pull/8077)) + * Removed `pytorch_lightning/trainer/training_loop.py` ([#7985](https://github.com/Lightning-AI/lightning/pull/7985)) + * Refactored evaluation loop 
interface; added new classes `DataLoaderLoop`, `EvaluationLoop`, `EvaluationEpochLoop` ([#7990](https://github.com/Lightning-AI/lightning/pull/7990), [#8077](https://github.com/Lightning-AI/lightning/pull/8077)) + * Removed `pytorch_lightning/trainer/evaluation_loop.py` ([#8056](https://github.com/Lightning-AI/lightning/pull/8056)) + * Restricted public access to several internal functions ([#8024](https://github.com/Lightning-AI/lightning/pull/8024)) + * Refactored trainer `_run_*` functions and separate evaluation loops ([#8065](https://github.com/Lightning-AI/lightning/pull/8065)) + * Refactored prediction loop interface; added new classes `PredictionLoop`, `PredictionEpochLoop` ([#7700](https://github.com/Lightning-AI/lightning/pull/7700), [#8077](https://github.com/Lightning-AI/lightning/pull/8077)) + * Removed `pytorch_lightning/trainer/predict_loop.py` ([#8094](https://github.com/Lightning-AI/lightning/pull/8094)) + * Moved result teardown to the loops ([#8245](https://github.com/Lightning-AI/lightning/pull/8245)) + * Improve `Loop` API to better handle children `state_dict` and `progress` ([#8334](https://github.com/Lightning-AI/lightning/pull/8334)) - Refactored logging - * Renamed and moved `core/step_result.py` to `trainer/connectors/logger_connector/result.py` ([#7736](https://github.com/PyTorchLightning/pytorch-lightning/pull/7736)) - * Dramatically simplify the `LoggerConnector` ([#7882](https://github.com/PyTorchLightning/pytorch-lightning/pull/7882)) - * `trainer.{logged,progress_bar,callback}_metrics` are now updated on-demand ([#7882](https://github.com/PyTorchLightning/pytorch-lightning/pull/7882)) - * Completely overhaul the `Result` object in favor of `ResultMetric` ([#7882](https://github.com/PyTorchLightning/pytorch-lightning/pull/7882)) - * Improve epoch-level reduction time and overall memory usage ([#7882](https://github.com/PyTorchLightning/pytorch-lightning/pull/7882)) - * Allow passing `self.log(batch_size=...)` ([#7891](https://github.com/PyTorchLightning/pytorch-lightning/pull/7891)) - * Each of the training loops now keeps its own results collection ([#7891](https://github.com/PyTorchLightning/pytorch-lightning/pull/7891)) - * Remove `EpochResultStore` and `HookResultStore` in favor of `ResultCollection` ([#7909](https://github.com/PyTorchLightning/pytorch-lightning/pull/7909)) - * Remove `MetricsHolder` ([#7909](https://github.com/PyTorchLightning/pytorch-lightning/pull/7909)) -- Moved `ignore_scalar_return_in_dp` warning suppression to the DataParallelPlugin class ([#7421](https://github.com/PyTorchLightning/pytorch-lightning/pull/7421/)) -- Changed the behaviour when logging evaluation step metrics to no longer append `/epoch_*` to the metric name ([#7351](https://github.com/PyTorchLightning/pytorch-lightning/pull/7351)) -- Raised `ValueError` when a `None` value is `self.log`-ed ([#7771](https://github.com/PyTorchLightning/pytorch-lightning/pull/7771)) -- Changed `resolve_training_type_plugins` to allow setting `num_nodes` and `sync_batchnorm` from `Trainer` setting ([#7026](https://github.com/PyTorchLightning/pytorch-lightning/pull/7026)) -- Default `seed_everything(workers=True)` in the `LightningCLI` ([#7504](https://github.com/PyTorchLightning/pytorch-lightning/pull/7504)) -- Changed `model.state_dict()` in `CheckpointConnector` to allow `training_type_plugin` to customize the model's `state_dict()` ([#7474](https://github.com/PyTorchLightning/pytorch-lightning/pull/7474)) -- `MLflowLogger` now uses the env variable `MLFLOW_TRACKING_URI` as default 
tracking URI ([#7457](https://github.com/PyTorchLightning/pytorch-lightning/pull/7457)) -- Changed `Trainer` arg and functionality from `reload_dataloaders_every_epoch` to `reload_dataloaders_every_n_epochs` ([#5043](https://github.com/PyTorchLightning/pytorch-lightning/pull/5043)) -- Changed `WandbLogger(log_model={True/'all'})` to log models as artifacts ([#6231](https://github.com/PyTorchLightning/pytorch-lightning/pull/6231)) -- MLFlowLogger now accepts `run_name` as an constructor argument ([#7622](https://github.com/PyTorchLightning/pytorch-lightning/pull/7622)) -- Changed `teardown()` in `Accelerator` to allow `training_type_plugin` to customize `teardown` logic ([#7579](https://github.com/PyTorchLightning/pytorch-lightning/pull/7579)) -- `Trainer.fit` now raises an error when using manual optimization with unsupported features such as `gradient_clip_val` or `accumulate_grad_batches` ([#7788](https://github.com/PyTorchLightning/pytorch-lightning/pull/7788)) -- Accelerator hooks are called regardless if `LightningModule` overrides the same hooks ([#7826](https://github.com/PyTorchLightning/pytorch-lightning/pull/7826)) -- Moved profilers to their own file ([#7822](https://github.com/PyTorchLightning/pytorch-lightning/pull/7822)) -- The `on_after_backward` hook is now called on accumulating iterations. Use the `on_before_optimizer_step` hook to mimic the old behaviour ([#8328](https://github.com/PyTorchLightning/pytorch-lightning/pull/8328)) -- The mixed precision loss is no longer unscaled before the `on_after_backward` hook. Use the `on_before_optimizer_step` hook to mimic the old behaviour ([#8328](https://github.com/PyTorchLightning/pytorch-lightning/pull/8328)) -- The `TrainingTypePlugin.{pre,post}_backward` hooks no longer take the `optimizer, opt_idx, should_accumulate` arguments ([#8328](https://github.com/PyTorchLightning/pytorch-lightning/pull/8328)) -- The `PrecisionPlugin.backward` hooks no longer returns a value ([#8328](https://github.com/PyTorchLightning/pytorch-lightning/pull/8328)) -- The `PrecisionPlugin.backward` hooks no longer takes a `should_accumulate` argument ([#8328](https://github.com/PyTorchLightning/pytorch-lightning/pull/8328)) -- Added the `on_before_backward` hook ([#7865](https://github.com/PyTorchLightning/pytorch-lightning/pull/7865)) -- `LightningCLI` now aborts with a clearer message if config already exists and disables save config during `fast_dev_run`([#7963](https://github.com/PyTorchLightning/pytorch-lightning/pull/7963)) -- Saved the `LightningCLI` config on `setup` and only on the main process ([#8017](https://github.com/PyTorchLightning/pytorch-lightning/pull/8017)) -- Dropped the `LightningCLI` `ArgumentParser` when pickling ([#8017](https://github.com/PyTorchLightning/pytorch-lightning/pull/8017)) -- Skip `broadcast` if distributed not initialized for the spawn plugins ([#8017](https://github.com/PyTorchLightning/pytorch-lightning/pull/8017)) -- `Trainer(resume_from_checkpoint=...)` now restores the model directly after `LightningModule.setup()`, which is before `LightningModule.configure_sharded_model()` ([#7652](https://github.com/PyTorchLightning/pytorch-lightning/pull/7652)) -- Moved `torch.cuda.set_device()` to enable collective calls earlier in setup ([#8312](https://github.com/PyTorchLightning/pytorch-lightning/pull/8312)) -- Used XLA utility API to move data to CPU (Single TPU core) ([#8078](https://github.com/PyTorchLightning/pytorch-lightning/pull/8078)) -- Improved error messages in `replace_sampler` when the `DataLoader` 
attributes are not included in the signature or the signature is missing optional arguments ([#8519](https://github.com/PyTorchLightning/pytorch-lightning/pull/8519)) -- Moved `DeviceDtypeModuleMixin` and `HyperparametersMixin` mixin to `core` ([#8396](https://github.com/PyTorchLightning/pytorch-lightning/pull/8396)) -- Return the `default_root_dir` as the `log_dir` when the logger is a `LoggerCollection` ([#8187](https://github.com/PyTorchLightning/pytorch-lightning/pull/8187)) + * Renamed and moved `core/step_result.py` to `trainer/connectors/logger_connector/result.py` ([#7736](https://github.com/Lightning-AI/lightning/pull/7736)) + * Dramatically simplify the `LoggerConnector` ([#7882](https://github.com/Lightning-AI/lightning/pull/7882)) + * `trainer.{logged,progress_bar,callback}_metrics` are now updated on-demand ([#7882](https://github.com/Lightning-AI/lightning/pull/7882)) + * Completely overhaul the `Result` object in favor of `ResultMetric` ([#7882](https://github.com/Lightning-AI/lightning/pull/7882)) + * Improve epoch-level reduction time and overall memory usage ([#7882](https://github.com/Lightning-AI/lightning/pull/7882)) + * Allow passing `self.log(batch_size=...)` ([#7891](https://github.com/Lightning-AI/lightning/pull/7891)) + * Each of the training loops now keeps its own results collection ([#7891](https://github.com/Lightning-AI/lightning/pull/7891)) + * Remove `EpochResultStore` and `HookResultStore` in favor of `ResultCollection` ([#7909](https://github.com/Lightning-AI/lightning/pull/7909)) + * Remove `MetricsHolder` ([#7909](https://github.com/Lightning-AI/lightning/pull/7909)) +- Moved `ignore_scalar_return_in_dp` warning suppression to the DataParallelPlugin class ([#7421](https://github.com/Lightning-AI/lightning/pull/7421/)) +- Changed the behaviour when logging evaluation step metrics to no longer append `/epoch_*` to the metric name ([#7351](https://github.com/Lightning-AI/lightning/pull/7351)) +- Raised `ValueError` when a `None` value is `self.log`-ed ([#7771](https://github.com/Lightning-AI/lightning/pull/7771)) +- Changed `resolve_training_type_plugins` to allow setting `num_nodes` and `sync_batchnorm` from `Trainer` setting ([#7026](https://github.com/Lightning-AI/lightning/pull/7026)) +- Default `seed_everything(workers=True)` in the `LightningCLI` ([#7504](https://github.com/Lightning-AI/lightning/pull/7504)) +- Changed `model.state_dict()` in `CheckpointConnector` to allow `training_type_plugin` to customize the model's `state_dict()` ([#7474](https://github.com/Lightning-AI/lightning/pull/7474)) +- `MLflowLogger` now uses the env variable `MLFLOW_TRACKING_URI` as default tracking URI ([#7457](https://github.com/Lightning-AI/lightning/pull/7457)) +- Changed `Trainer` arg and functionality from `reload_dataloaders_every_epoch` to `reload_dataloaders_every_n_epochs` ([#5043](https://github.com/Lightning-AI/lightning/pull/5043)) +- Changed `WandbLogger(log_model={True/'all'})` to log models as artifacts ([#6231](https://github.com/Lightning-AI/lightning/pull/6231)) +- MLFlowLogger now accepts `run_name` as an constructor argument ([#7622](https://github.com/Lightning-AI/lightning/pull/7622)) +- Changed `teardown()` in `Accelerator` to allow `training_type_plugin` to customize `teardown` logic ([#7579](https://github.com/Lightning-AI/lightning/pull/7579)) +- `Trainer.fit` now raises an error when using manual optimization with unsupported features such as `gradient_clip_val` or `accumulate_grad_batches` 
([#7788](https://github.com/Lightning-AI/lightning/pull/7788))
+- Accelerator hooks are called regardless if `LightningModule` overrides the same hooks ([#7826](https://github.com/Lightning-AI/lightning/pull/7826))
+- Moved profilers to their own file ([#7822](https://github.com/Lightning-AI/lightning/pull/7822))
+- The `on_after_backward` hook is now called on accumulating iterations. Use the `on_before_optimizer_step` hook to mimic the old behaviour ([#8328](https://github.com/Lightning-AI/lightning/pull/8328))
+- The mixed precision loss is no longer unscaled before the `on_after_backward` hook. Use the `on_before_optimizer_step` hook to mimic the old behaviour ([#8328](https://github.com/Lightning-AI/lightning/pull/8328))
+- The `TrainingTypePlugin.{pre,post}_backward` hooks no longer take the `optimizer, opt_idx, should_accumulate` arguments ([#8328](https://github.com/Lightning-AI/lightning/pull/8328))
+- The `PrecisionPlugin.backward` hooks no longer returns a value ([#8328](https://github.com/Lightning-AI/lightning/pull/8328))
+- The `PrecisionPlugin.backward` hooks no longer takes a `should_accumulate` argument ([#8328](https://github.com/Lightning-AI/lightning/pull/8328))
+- Added the `on_before_backward` hook ([#7865](https://github.com/Lightning-AI/lightning/pull/7865))
+- `LightningCLI` now aborts with a clearer message if config already exists and disables save config during `fast_dev_run`([#7963](https://github.com/Lightning-AI/lightning/pull/7963))
+- Saved the `LightningCLI` config on `setup` and only on the main process ([#8017](https://github.com/Lightning-AI/lightning/pull/8017))
+- Dropped the `LightningCLI` `ArgumentParser` when pickling ([#8017](https://github.com/Lightning-AI/lightning/pull/8017))
+- Skip `broadcast` if distributed not initialized for the spawn plugins ([#8017](https://github.com/Lightning-AI/lightning/pull/8017))
+- `Trainer(resume_from_checkpoint=...)` now restores the model directly after `LightningModule.setup()`, which is before `LightningModule.configure_sharded_model()` ([#7652](https://github.com/Lightning-AI/lightning/pull/7652))
+- Moved `torch.cuda.set_device()` to enable collective calls earlier in setup ([#8312](https://github.com/Lightning-AI/lightning/pull/8312))
+- Used XLA utility API to move data to CPU (Single TPU core) ([#8078](https://github.com/Lightning-AI/lightning/pull/8078))
+- Improved error messages in `replace_sampler` when the `DataLoader` attributes are not included in the signature or the signature is missing optional arguments ([#8519](https://github.com/Lightning-AI/lightning/pull/8519))
+- Moved `DeviceDtypeModuleMixin` and `HyperparametersMixin` mixin to `core` ([#8396](https://github.com/Lightning-AI/lightning/pull/8396))
+- Return the `default_root_dir` as the `log_dir` when the logger is a `LoggerCollection` ([#8187](https://github.com/Lightning-AI/lightning/pull/8187))

### Deprecated

-- Deprecated `LightningModule.loaded_optimizer_states_dict` ([#8229](https://github.com/PyTorchLightning/pytorch-lightning/pull/8229))
-- Standardized the dataloaders arguments of `trainer.{fit,validate,test,tune}` ([#7431](https://github.com/PyTorchLightning/pytorch-lightning/pull/7431))
-- Deprecated `DataModule` properties: `has_prepared_data`, `has_setup_fit`, `has_setup_validate`, `has_setup_test`, `has_setup_predict`, `has_teardown_fit`, `has_teardown_validate`, `has_teardown_test`, `has_teardown_predict` ([#7657](https://github.com/PyTorchLightning/pytorch-lightning/pull/7657/))
-- Deprecated `TrainerModelHooksMixin` in favor of `pytorch_lightning.utilities.signature_utils` ([#7422](https://github.com/PyTorchLightning/pytorch-lightning/pull/7422))
-- Deprecated `num_nodes` and `sync_batchnorm` arguments in `DDPPlugin` and `DDPSpawnPlugin` ([#7026](https://github.com/PyTorchLightning/pytorch-lightning/pull/7026))
-- Deprecated `self.log(sync_dist_op)` in favor of `self.log(reduce_fx)`. ([#7891](https://github.com/PyTorchLightning/pytorch-lightning/pull/7891))
-- Deprecated `is_overridden(model=...)` in favor of `is_overridden(instance=...)` ([#7918](https://github.com/PyTorchLightning/pytorch-lightning/pull/7918))
-- Deprecated automatically detaching returned extras with grads ([#7994](https://github.com/PyTorchLightning/pytorch-lightning/pull/7994))
-- Deprecated default value of `monitor` argument in EarlyStopping callback to enforce `monitor` as a required argument ([#7907](https://github.com/PyTorchLightning/pytorch-lightning/pull/7907))
-- Deprecated importing `rank_zero_{warn,deprecation}` directly from `pytorch_lightning.utilities.distributed` ([#8085](https://github.com/PyTorchLightning/pytorch-lightning/pull/8085))
-- Deprecated the use of `CheckpointConnector.hpc_load()` in favor of `CheckpointConnector.restore()` ([#7652](https://github.com/PyTorchLightning/pytorch-lightning/pull/7652))
-- Deprecated `ModelCheckpoint(every_n_val_epochs)` in favor of `ModelCheckpoint(every_n_epochs)` ([#8383](https://github.com/PyTorchLightning/pytorch-lightning/pull/8383))
-- Deprecated `DDPPlugin.task_idx` in favor of `DDPPlugin.local_rank` ([#8203](https://github.com/PyTorchLightning/pytorch-lightning/pull/8203))
-- Deprecated the `Trainer.train_loop` property in favor of `Trainer.fit_loop` ([#8025](https://github.com/PyTorchLightning/pytorch-lightning/pull/8025))
-- Deprecated the `Trainer.disable_validation` property in favor of `not Trainer.enable_validation` ([#8291](https://github.com/PyTorchLightning/pytorch-lightning/pull/8291))
-- Deprecated `mode` parameter in `ModelSummary` in favor of `max_depth` ([#8062](https://github.com/PyTorchLightning/pytorch-lightning/pull/8062))
-- Deprecated `reload_dataloaders_every_epoch` argument of `Trainer` in favor of `reload_dataloaders_every_n_epochs` ([#5043](https://github.com/PyTorchLightning/pytorch-lightning/pull/5043))
-- Deprecated `distributed_backend` argument for `Trainer` ([#8575](https://github.com/PyTorchLightning/pytorch-lightning/pull/8575))
+- Deprecated `LightningModule.loaded_optimizer_states_dict` ([#8229](https://github.com/Lightning-AI/lightning/pull/8229))
+- Standardized the dataloaders arguments of `trainer.{fit,validate,test,tune}` ([#7431](https://github.com/Lightning-AI/lightning/pull/7431))
+- Deprecated `DataModule` properties: `has_prepared_data`, `has_setup_fit`, `has_setup_validate`, `has_setup_test`, `has_setup_predict`, `has_teardown_fit`, `has_teardown_validate`, `has_teardown_test`, `has_teardown_predict` ([#7657](https://github.com/Lightning-AI/lightning/pull/7657/))
+- Deprecated `TrainerModelHooksMixin` in favor of `pytorch_lightning.utilities.signature_utils` ([#7422](https://github.com/Lightning-AI/lightning/pull/7422))
+- Deprecated `num_nodes` and `sync_batchnorm` arguments in `DDPPlugin` and `DDPSpawnPlugin` ([#7026](https://github.com/Lightning-AI/lightning/pull/7026))
+- Deprecated `self.log(sync_dist_op)` in favor of `self.log(reduce_fx)`.
([#7891](https://github.com/Lightning-AI/lightning/pull/7891)) +- Deprecated `is_overridden(model=...)` in favor of `is_overridden(instance=...)` ([#7918](https://github.com/Lightning-AI/lightning/pull/7918)) +- Deprecated automatically detaching returned extras with grads ([#7994](https://github.com/Lightning-AI/lightning/pull/7994)) +- Deprecated default value of `monitor` argument in EarlyStopping callback to enforce `monitor` as a required argument ([#7907](https://github.com/Lightning-AI/lightning/pull/7907)) +- Deprecated importing `rank_zero_{warn,deprecation}` directly from `pytorch_lightning.utilities.distributed` ([#8085](https://github.com/Lightning-AI/lightning/pull/8085)) +- Deprecated the use of `CheckpointConnector.hpc_load()` in favor of `CheckpointConnector.restore()` ([#7652](https://github.com/Lightning-AI/lightning/pull/7652)) +- Deprecated `ModelCheckpoint(every_n_val_epochs)` in favor of `ModelCheckpoint(every_n_epochs)` ([#8383](https://github.com/Lightning-AI/lightning/pull/8383)) +- Deprecated `DDPPlugin.task_idx` in favor of `DDPPlugin.local_rank` ([#8203](https://github.com/Lightning-AI/lightning/pull/8203)) +- Deprecated the `Trainer.train_loop` property in favor of `Trainer.fit_loop` ([#8025](https://github.com/Lightning-AI/lightning/pull/8025)) +- Deprecated the `Trainer.disable_validation` property in favor of `not Trainer.enable_validation` ([#8291](https://github.com/Lightning-AI/lightning/pull/8291)) +- Deprecated `mode` parameter in `ModelSummary` in favor of `max_depth` ([#8062](https://github.com/Lightning-AI/lightning/pull/8062)) +- Deprecated `reload_dataloaders_every_epoch` argument of `Trainer` in favor of `reload_dataloaders_every_n_epochs` ([#5043](https://github.com/Lightning-AI/lightning/pull/5043)) +- Deprecated `distributed_backend` argument for `Trainer` ([#8575](https://github.com/Lightning-AI/lightning/pull/8575)) ### Removed -- Dropped official support/testing for PyTorch <1.6 ([#8288](https://github.com/PyTorchLightning/pytorch-lightning/pull/8288)) -- Removed `ProfilerConnector` ([#7654](https://github.com/PyTorchLightning/pytorch-lightning/pull/7654)) -- Pruned deprecated classif. metrics from `pytorch_lightning.metrics.functional.classification` ([#7499](https://github.com/PyTorchLightning/pytorch-lightning/pull/7499)) -- Removed deprecated data parallel classes `LightningDataParallel` and `LightningDistributedDataParallel` from `pytorch_lightning.overrides.data_parallel` ([#7510](https://github.com/PyTorchLightning/pytorch-lightning/pull/7510)) -- Removed deprecated trainer attributes - `get_model` and `accelerator_backend` ([#7502](https://github.com/PyTorchLightning/pytorch-lightning/pull/7502)) -- Removed support for automatically monitoring the `val_loss` key with `ModelCheckpoint`. Pass your `monitor` of choice to the `ModelCheckpoint` instance instead ([#8293](https://github.com/PyTorchLightning/pytorch-lightning/pull/8293)) -- Removed support for `self.log(tbptt_reduce_fx)` and `self.log(tbptt_pad_token)`. Please, open a discussion explaining your use-case if you relied on these. ([#7644](https://github.com/PyTorchLightning/pytorch-lightning/pull/7644)) -- Removed deprecated utils modules `model_utils`, `warning_utils`, `xla_device_utils` and partially `argparse_utils` ([#7503](https://github.com/PyTorchLightning/pytorch-lightning/pull/7503)) -- Removed `RPCPlugin` and `RPCSequentialPlugin`. 
If you were successfully using these plugins, please open a GitHub discussion about your use case ([#8101](https://github.com/PyTorchLightning/pytorch-lightning/pull/8101)) -- Removed deprecated trainer attributes - `on_cpu`, `on_tpu`, `use_tpu`, `on_gpu`, `use_dp`, `use_ddp`, `use_ddp2`, `use_horovod`, `use_single_gpu` ([#7501](https://github.com/PyTorchLightning/pytorch-lightning/pull/7501)) -- Removed deprecated `optimizer` argument in `LightningModule.manual_backward()`; Toggling optimizers in manual optimization should be done using `LightningModule.{un}toggle_optimizer()` ([#8287](https://github.com/PyTorchLightning/pytorch-lightning/pull/8287)) -- Removed DeepSpeed FP16 Exception as FP32 is now supported ([#8462](https://github.com/PyTorchLightning/pytorch-lightning/pull/8462)) -- Removed environment variable `PL_EXP_VERSION` from DDP subprocesses ([7403](https://github.com/PyTorchLightning/pytorch-lightning/pull/7403)) - -### Fixed - -- Fixed the `GPUStatsMonitor` callbacks to use the correct GPU IDs if `CUDA_VISIBLE_DEVICES` set ([#8260](https://github.com/PyTorchLightning/pytorch-lightning/pull/8260)) -- Fixed `lr_scheduler` checkpointed state by calling `update_lr_schedulers` before saving checkpoints ([#7877](https://github.com/PyTorchLightning/pytorch-lightning/pull/7877)) -- Fixed ambiguous warning when both overfit and train dataloader shuffling are enabled ([#7685](https://github.com/PyTorchLightning/pytorch-lightning/pull/7685)) -- Fixed dev debugger memory growing due to tracking events even when disabled ([#7875](https://github.com/PyTorchLightning/pytorch-lightning/pull/7875)) -- Fixed `None` loss keys getting added in `training_epoch_end` when using manual optimization and not returning a loss ([#7772](https://github.com/PyTorchLightning/pytorch-lightning/pull/7772)) -- Fixed a bug where `precision=64` with `accelerator='ddp_spawn'` would throw a pickle error ([#6924](https://github.com/PyTorchLightning/pytorch-lightning/pull/6924)) -- Do not override the existing `epoch` value in `logged_metrics` when already logged by the user ([#7982](https://github.com/PyTorchLightning/pytorch-lightning/pull/7982)) -- Support for manual optimization with DeepSpeed ([#7970](https://github.com/PyTorchLightning/pytorch-lightning/pull/7970)) -- Fixed `dataloader_idx` argument value when predicting with only one `DataLoader` ([#7941](https://github.com/PyTorchLightning/pytorch-lightning/pull/7941)) -- Fixed passing the `stage` argument of `Callback.{setup,teardown}` as a keyword ([#7973](https://github.com/PyTorchLightning/pytorch-lightning/pull/7973)) -- Fixed metrics generated during `validation sanity checking` are cleaned on end ([#8171](https://github.com/PyTorchLightning/pytorch-lightning/pull/8171)) -- Fixed `log_gpu_memory` metrics not being added to `logging` when nothing else is logged ([#8174](https://github.com/PyTorchLightning/pytorch-lightning/pull/8174)) -- Fixed a bug where calling `log` with a `Metric` instance would raise an error if it was a nested attribute of the model ([#8181](https://github.com/PyTorchLightning/pytorch-lightning/pull/8181)) -- Fixed a bug where using `precision=64` would cause buffers with complex dtype to be cast to real ([#8208](https://github.com/PyTorchLightning/pytorch-lightning/pull/8208)) -- Fixed `is_overridden` returning true for wrapped functions with no changes ([#8296](https://github.com/PyTorchLightning/pytorch-lightning/pull/8296)) -- Fixed a bug where `truncated_bptt_steps` would throw an AttributeError when the target RNN has 
multiple hidden states ([#8145](https://github.com/PyTorchLightning/pytorch-lightning/pull/8145)) -- Fixed `self.optimizers()` not returning a single optimizer if it had been wrapped ([#8326](https://github.com/PyTorchLightning/pytorch-lightning/pull/8326)) -- Fixed the `on_after_backward` hook not getting called when using manual optimization and no plugins ([#8328](https://github.com/PyTorchLightning/pytorch-lightning/pull/8328)) -- Fixed the `LightningModule.backward` hook only getting called with the `apex` plugin when using manual optimization ([#8328](https://github.com/PyTorchLightning/pytorch-lightning/pull/8328)) -- Fixed moving batch to device before sending it to the `on_*_batch_start`/`on_*_batch_end` callbacks and model hooks ([#7378](https://github.com/PyTorchLightning/pytorch-lightning/pull/7378)) -- Fixed passing a custom `DDPPlugin` when choosing `accelerator="ddp_cpu"` for the accelerator ([#6208](https://github.com/PyTorchLightning/pytorch-lightning/pull/6208)) -- Fixed missing call to `LightningModule.untoggle_optimizer` in training loop when running gradient accumulation with multiple optimizers ([#8284](https://github.com/PyTorchLightning/pytorch-lightning/pull/8284)) -- Fixed hash of LightningEnum to work with value instead of name ([#8421](https://github.com/PyTorchLightning/pytorch-lightning/pull/8421)). -- Fixed a bug where an extra checkpoint was saved at the end of training if the `val_check_interval` did not align with the number of training batches ([#7724](https://github.com/PyTorchLightning/pytorch-lightning/pull/7724)) -- Fixed hash of LightningEnum to work with value instead of name ([#8421](https://github.com/PyTorchLightning/pytorch-lightning/pull/8421)). -- Fixed `move_data_to_device` to return the batch if the object `to` function didn't return `self` ([#8433](https://github.com/PyTorchLightning/pytorch-lightning/pull/8433)) -- Fixed progress bar updates for Pod Training ([#8258](https://github.com/PyTorchLightning/pytorch-lightning/pull/8258)) -- Fixed clearing dataloader references before attaching new dataloaders in consecutive `Trainer.{fit,validate,test,predict}` runs ([#8442](https://github.com/PyTorchLightning/pytorch-lightning/pull/8442)) -- Fixed memory leaks on GPU by moving `optimizer_states`, `ResultCollection.extra`, `ResultMetric` attributes, and `LoggerConnector` metrics to `cpu`.
Also, delete the DDP wrapper on `teardown` ([#8490](https://github.com/PyTorchLightning/pytorch-lightning/pull/8490)) -- Fixed `SWA` callback using LightningModule `prevent_trainer_and_dataloaders_deepcopy` to avoid OOM ([#8472](https://github.com/PyTorchLightning/pytorch-lightning/pull/8472)) -- Fixed `ModelPruning` callback `on_save_checkpoint` to avoid making a `deepcopy` potentially leading to OOM ([#8472](https://github.com/PyTorchLightning/pytorch-lightning/pull/8472)) -- Fixed the sampler replacement logic for `DataLoader`s which do not define all `DataLoader` attributes as `__init__` parameters ([#8519](https://github.com/PyTorchLightning/pytorch-lightning/pull/8519)) -- Fixed DeepSpeed Windows support ([#8488](https://github.com/PyTorchLightning/pytorch-lightning/pull/8488)) -- Fixed DeepSpeed not properly setting the trainer `lr_schedulers` attribute ([#8527](https://github.com/PyTorchLightning/pytorch-lightning/pull/8527)) -- Fixed experiment version and log-dir divergence in DDP when using multiple `Trainer` instances in sequence ([#7403](https://github.com/PyTorchLightning/pytorch-lightning/pull/7403)) -- Enabled manual optimization for TPUs ([#8458](https://github.com/PyTorchLightning/pytorch-lightning/pull/8458)) -- Fixed `accumulate_grad_batches` not being recomputed during model reload ([#5334](https://github.com/PyTorchLightning/pytorch-lightning/pull/5334)) -- Fixed a `TypeError` when wrapping optimizers in the `HorovodPlugin` and running `Trainer.test` ([#7840](https://github.com/PyTorchLightning/pytorch-lightning/pull/7840)) -- Fixed `BackboneFinetuning` restoration ([#8501](https://github.com/PyTorchLightning/pytorch-lightning/pull/8501)) -- Fixed `lr_scheduler` with metric (e.g. `torch.optim.lr_scheduler.ReduceLROnPlateau`) when using `automatic_optimization = False` ([#7643](https://github.com/PyTorchLightning/pytorch-lightning/pull/7643)) -- Fixed `DeepSpeed` breaking with no schedulers ([#8580](https://github.com/PyTorchLightning/pytorch-lightning/pull/8580)) +- Dropped official support/testing for PyTorch <1.6 ([#8288](https://github.com/Lightning-AI/lightning/pull/8288)) +- Removed `ProfilerConnector` ([#7654](https://github.com/Lightning-AI/lightning/pull/7654)) +- Pruned deprecated classif. metrics from `pytorch_lightning.metrics.functional.classification` ([#7499](https://github.com/Lightning-AI/lightning/pull/7499)) +- Removed deprecated data parallel classes `LightningDataParallel` and `LightningDistributedDataParallel` from `pytorch_lightning.overrides.data_parallel` ([#7510](https://github.com/Lightning-AI/lightning/pull/7510)) +- Removed deprecated trainer attributes - `get_model` and `accelerator_backend` ([#7502](https://github.com/Lightning-AI/lightning/pull/7502)) +- Removed support for automatically monitoring the `val_loss` key with `ModelCheckpoint`. Pass your `monitor` of choice to the `ModelCheckpoint` instance instead ([#8293](https://github.com/Lightning-AI/lightning/pull/8293)) +- Removed support for `self.log(tbptt_reduce_fx)` and `self.log(tbptt_pad_token)`. Please open a discussion explaining your use case if you relied on these. ([#7644](https://github.com/Lightning-AI/lightning/pull/7644)) +- Removed deprecated utils modules `model_utils`, `warning_utils`, `xla_device_utils` and partially `argparse_utils` ([#7503](https://github.com/Lightning-AI/lightning/pull/7503)) +- Removed `RPCPlugin` and `RPCSequentialPlugin`.
If you were successfully using these plugins, please open a GitHub discussion about your use case ([#8101](https://github.com/Lightning-AI/lightning/pull/8101)) +- Removed deprecated trainer attributes - `on_cpu`, `on_tpu`, `use_tpu`, `on_gpu`, `use_dp`, `use_ddp`, `use_ddp2`, `use_horovod`, `use_single_gpu` ([#7501](https://github.com/Lightning-AI/lightning/pull/7501)) +- Removed deprecated `optimizer` argument in `LightningModule.manual_backward()`; Toggling optimizers in manual optimization should be done using `LightningModule.{un}toggle_optimizer()` ([#8287](https://github.com/Lightning-AI/lightning/pull/8287)) +- Removed DeepSpeed FP16 Exception as FP32 is now supported ([#8462](https://github.com/Lightning-AI/lightning/pull/8462)) +- Removed environment variable `PL_EXP_VERSION` from DDP subprocesses ([#7403](https://github.com/Lightning-AI/lightning/pull/7403)) + +### Fixed + +- Fixed the `GPUStatsMonitor` callbacks to use the correct GPU IDs if `CUDA_VISIBLE_DEVICES` is set ([#8260](https://github.com/Lightning-AI/lightning/pull/8260)) +- Fixed `lr_scheduler` checkpointed state by calling `update_lr_schedulers` before saving checkpoints ([#7877](https://github.com/Lightning-AI/lightning/pull/7877)) +- Fixed ambiguous warning when both overfit and train dataloader shuffling are enabled ([#7685](https://github.com/Lightning-AI/lightning/pull/7685)) +- Fixed dev debugger memory growing due to tracking events even when disabled ([#7875](https://github.com/Lightning-AI/lightning/pull/7875)) +- Fixed `None` loss keys getting added in `training_epoch_end` when using manual optimization and not returning a loss ([#7772](https://github.com/Lightning-AI/lightning/pull/7772)) +- Fixed a bug where `precision=64` with `accelerator='ddp_spawn'` would throw a pickle error ([#6924](https://github.com/Lightning-AI/lightning/pull/6924)) +- Do not override the existing `epoch` value in `logged_metrics` when already logged by the user ([#7982](https://github.com/Lightning-AI/lightning/pull/7982)) +- Support for manual optimization with DeepSpeed ([#7970](https://github.com/Lightning-AI/lightning/pull/7970)) +- Fixed `dataloader_idx` argument value when predicting with only one `DataLoader` ([#7941](https://github.com/Lightning-AI/lightning/pull/7941)) +- Fixed passing the `stage` argument of `Callback.{setup,teardown}` as a keyword ([#7973](https://github.com/Lightning-AI/lightning/pull/7973)) +- Fixed metrics generated during `validation sanity checking` are cleaned on end ([#8171](https://github.com/Lightning-AI/lightning/pull/8171)) +- Fixed `log_gpu_memory` metrics not being added to `logging` when nothing else is logged ([#8174](https://github.com/Lightning-AI/lightning/pull/8174)) +- Fixed a bug where calling `log` with a `Metric` instance would raise an error if it was a nested attribute of the model ([#8181](https://github.com/Lightning-AI/lightning/pull/8181)) +- Fixed a bug where using `precision=64` would cause buffers with complex dtype to be cast to real ([#8208](https://github.com/Lightning-AI/lightning/pull/8208)) +- Fixed `is_overridden` returning true for wrapped functions with no changes ([#8296](https://github.com/Lightning-AI/lightning/pull/8296)) +- Fixed a bug where `truncated_bptt_steps` would throw an AttributeError when the target RNN has multiple hidden states ([#8145](https://github.com/Lightning-AI/lightning/pull/8145)) +- Fixed `self.optimizers()` not returning a single optimizer if it had been wrapped ([#8326](https://github.com/Lightning-AI/lightning/pull/8326)) +- Fixed
the `on_after_backward` hook not getting called when using manual optimization and no plugins ([#8328](https://github.com/Lightning-AI/lightning/pull/8328)) +- Fixed the `LightningModule.backward` hook only getting called with the `apex` plugin when using manual optimization ([#8328](https://github.com/Lightning-AI/lightning/pull/8328)) +- Fixed moving batch to device before sending it to the `on_*_batch_start`/`on_*_batch_end` callbacks and model hooks ([#7378](https://github.com/Lightning-AI/lightning/pull/7378)) +- Fixed passing a custom `DDPPlugin` when choosing `accelerator="ddp_cpu"` for the accelerator ([#6208](https://github.com/Lightning-AI/lightning/pull/6208)) +- Fixed missing call to `LightningModule.untoggle_optimizer` in training loop when running gradient accumulation with multiple optimizers ([#8284](https://github.com/Lightning-AI/lightning/pull/8284)) +- Fixed hash of LightningEnum to work with value instead of name ([#8421](https://github.com/Lightning-AI/lightning/pull/8421)). +- Fixed a bug where an extra checkpoint was saved at the end of training if the `val_check_interval` did not align with the number of training batches ([#7724](https://github.com/Lightning-AI/lightning/pull/7724)) +- Fixed hash of LightningEnum to work with value instead of name ([#8421](https://github.com/Lightning-AI/lightning/pull/8421)). +- Fixed `move_data_to_device` to return the batch if the object `to` function didn't return `self` ([#8433](https://github.com/Lightning-AI/lightning/pull/8433)) +- Fixed progress bar updates for Pod Training ([#8258](https://github.com/Lightning-AI/lightning/pull/8258)) +- Fixed clearing dataloader references before attaching new dataloaders in consecutive `Trainer.{fit,validate,test,predict}` runs ([#8442](https://github.com/Lightning-AI/lightning/pull/8442)) +- Fixed memory leaks on GPU by moving `optimizer_states`, `ResultCollection.extra`, `ResultMetric` attributes, and `LoggerConnector` metrics to `cpu`.
Also, delete the DDP wrapper on `teardown` ([#8490](https://github.com/Lightning-AI/lightning/pull/8490)) +- Fixed `SWA` callback using LightningModule `prevent_trainer_and_dataloaders_deepcopy` to avoid OOM ([#8472](https://github.com/Lightning-AI/lightning/pull/8472)) +- Fixed `ModelPruning` callback `on_save_checkpoint` to avoid making a `deepcopy` potentially leading to OOM ([#8472](https://github.com/Lightning-AI/lightning/pull/8472)) +- Fixed the sampler replacement logic for `DataLoader`s which do not define all `DataLoader` attributes as `__init__` parameters ([#8519](https://github.com/Lightning-AI/lightning/pull/8519)) +- Fixed DeepSpeed Windows support ([#8488](https://github.com/Lightning-AI/lightning/pull/8488)) +- Fixed DeepSpeed not properly setting the trainer `lr_schedulers` attribute ([#8527](https://github.com/Lightning-AI/lightning/pull/8527)) +- Fixed experiment version and log-dir divergence in DDP when using multiple `Trainer` instances in sequence ([#7403](https://github.com/Lightning-AI/lightning/pull/7403)) +- Enabled manual optimization for TPUs ([#8458](https://github.com/Lightning-AI/lightning/pull/8458)) +- Fixed `accumulate_grad_batches` not being recomputed during model reload ([#5334](https://github.com/Lightning-AI/lightning/pull/5334)) +- Fixed a `TypeError` when wrapping optimizers in the `HorovodPlugin` and running `Trainer.test` ([#7840](https://github.com/Lightning-AI/lightning/pull/7840)) +- Fixed `BackboneFinetuning` restoration ([#8501](https://github.com/Lightning-AI/lightning/pull/8501)) +- Fixed `lr_scheduler` with metric (e.g. `torch.optim.lr_scheduler.ReduceLROnPlateau`) when using `automatic_optimization = False` ([#7643](https://github.com/Lightning-AI/lightning/pull/7643)) +- Fixed `DeepSpeed` breaking with no schedulers ([#8580](https://github.com/Lightning-AI/lightning/pull/8580)) ## [1.3.8] - 2021-07-01 ### Fixed -- Fixed a sync deadlock when checkpointing a `LightningModule` that uses a torchmetrics 0.4 `Metric` ([#8218](https://github.com/PyTorchLightning/pytorch-lightning/pull/8218)) -- Fixed compatibility with TorchMetrics v0.4 ([#8206](https://github.com/PyTorchLightning/pytorch-lightning/pull/8206)) -- Added torchelastic check when sanitizing GPUs ([#8095](https://github.com/PyTorchLightning/pytorch-lightning/pull/8095)) -- Fixed a DDP info message that was never shown ([#8111](https://github.com/PyTorchLightning/pytorch-lightning/pull/8111)) -- Fixed metrics deprecation message at module import level ([#8163](https://github.com/PyTorchLightning/pytorch-lightning/pull/8163)) -- Fixed a bug where an infinite recursion would be triggered when using the `BaseFinetuning` callback on a model that contains a `ModuleDict` ([#8170](https://github.com/PyTorchLightning/pytorch-lightning/pull/8170)) -- Added a mechanism to detect `deadlock` for `DDP` when only 1 process triggers an `Exception`.
The mechanism will `kill the processes` when it happens ([#8167](https://github.com/PyTorchLightning/pytorch-lightning/pull/8167)) -- Fixed NCCL error when selecting non-consecutive device ids ([#8165](https://github.com/PyTorchLightning/pytorch-lightning/pull/8165)) -- Fixed SWA to also work with `IterableDataset` ([#8172](https://github.com/PyTorchLightning/pytorch-lightning/pull/8172)) +- Fixed a sync deadlock when checkpointing a `LightningModule` that uses a torchmetrics 0.4 `Metric` ([#8218](https://github.com/Lightning-AI/lightning/pull/8218)) +- Fixed compatibility with TorchMetrics v0.4 ([#8206](https://github.com/Lightning-AI/lightning/pull/8206)) +- Added torchelastic check when sanitizing GPUs ([#8095](https://github.com/Lightning-AI/lightning/pull/8095)) +- Fixed a DDP info message that was never shown ([#8111](https://github.com/Lightning-AI/lightning/pull/8111)) +- Fixed metrics deprecation message at module import level ([#8163](https://github.com/Lightning-AI/lightning/pull/8163)) +- Fixed a bug where an infinite recursion would be triggered when using the `BaseFinetuning` callback on a model that contains a `ModuleDict` ([#8170](https://github.com/Lightning-AI/lightning/pull/8170)) +- Added a mechanism to detect `deadlock` for `DDP` when only 1 process triggers an `Exception`. The mechanism will `kill the processes` when it happens ([#8167](https://github.com/Lightning-AI/lightning/pull/8167)) +- Fixed NCCL error when selecting non-consecutive device ids ([#8165](https://github.com/Lightning-AI/lightning/pull/8165)) +- Fixed SWA to also work with `IterableDataset` ([#8172](https://github.com/Lightning-AI/lightning/pull/8172)) ## [1.3.7] - 2021-06-22 ### Fixed -- Fixed a bug where skipping an optimizer while using amp causes amp to trigger an assertion error ([#7975](https://github.com/PyTorchLightning/pytorch-lightning/pull/7975)) -- Fixed deprecation messages not showing due to incorrect stacklevel ([#8002](https://github.com/PyTorchLightning/pytorch-lightning/pull/8002), [#8005](https://github.com/PyTorchLightning/pytorch-lightning/pull/8005)) -- Fixed setting a `DistributedSampler` when using a distributed plugin in a custom accelerator ([#7814](https://github.com/PyTorchLightning/pytorch-lightning/pull/7814)) -- Improved `PyTorchProfiler` chrome traces names ([#8009](https://github.com/PyTorchLightning/pytorch-lightning/pull/8009)) -- Fixed moving the best score to device in `EarlyStopping` callback for TPU devices ([#7959](https://github.com/PyTorchLightning/pytorch-lightning/pull/7959)) -- Fixes access to `callback_metrics` in ddp_spawn ([#7916](https://github.com/PyTorchLightning/pytorch-lightning/pull/7916)) +- Fixed a bug where skipping an optimizer while using amp causes amp to trigger an assertion error ([#7975](https://github.com/Lightning-AI/lightning/pull/7975)) +- Fixed deprecation messages not showing due to incorrect stacklevel ([#8002](https://github.com/Lightning-AI/lightning/pull/8002), [#8005](https://github.com/Lightning-AI/lightning/pull/8005)) +- Fixed setting a `DistributedSampler` when using a distributed plugin in a custom accelerator ([#7814](https://github.com/Lightning-AI/lightning/pull/7814)) +- Improved `PyTorchProfiler` chrome traces names ([#8009](https://github.com/Lightning-AI/lightning/pull/8009)) +- Fixed moving the best score to device in `EarlyStopping` callback for TPU devices ([#7959](https://github.com/Lightning-AI/lightning/pull/7959)) +- Fixes access to `callback_metrics` in ddp_spawn
([#7916](https://github.com/Lightning-AI/lightning/pull/7916)) ## [1.3.6] - 2021-06-15 ### Fixed -- Fixed logs overwriting issue for remote filesystems ([#7889](https://github.com/PyTorchLightning/pytorch-lightning/pull/7889)) -- Fixed `DataModule.prepare_data` could only be called on the global rank 0 process ([#7945](https://github.com/PyTorchLightning/pytorch-lightning/pull/7945)) -- Fixed setting `worker_init_fn` to seed dataloaders correctly when using DDP ([#7942](https://github.com/PyTorchLightning/pytorch-lightning/pull/7942)) -- Fixed `BaseFinetuning` callback to properly handle parent modules w/ parameters ([#7931](https://github.com/PyTorchLightning/pytorch-lightning/pull/7931)) +- Fixed logs overwriting issue for remote filesystems ([#7889](https://github.com/Lightning-AI/lightning/pull/7889)) +- Fixed `DataModule.prepare_data` could only be called on the global rank 0 process ([#7945](https://github.com/Lightning-AI/lightning/pull/7945)) +- Fixed setting `worker_init_fn` to seed dataloaders correctly when using DDP ([#7942](https://github.com/Lightning-AI/lightning/pull/7942)) +- Fixed `BaseFinetuning` callback to properly handle parent modules w/ parameters ([#7931](https://github.com/Lightning-AI/lightning/pull/7931)) ## [1.3.5] - 2021-06-08 ### Added -- Added warning to Training Step output ([#7779](https://github.com/PyTorchLightning/pytorch-lightning/pull/7779)) +- Added warning to Training Step output ([#7779](https://github.com/Lightning-AI/lightning/pull/7779)) ### Fixed -- Fixed `LearningRateMonitor` and `BackboneFinetuning` ([#7835](https://github.com/PyTorchLightning/pytorch-lightning/pull/7835)) -- Minor improvements to `apply_to_collection` and type signature of `log_dict` ([#7851](https://github.com/PyTorchLightning/pytorch-lightning/pull/7851)) -- Fixed docker versions ([#7834](https://github.com/PyTorchLightning/pytorch-lightning/pull/7834)) -- Fixed sharded training check for fp16 precision ([#7825](https://github.com/PyTorchLightning/pytorch-lightning/pull/7825)) -- Fixed support for torch Module type hints in LightningCLI ([#7807](https://github.com/PyTorchLightning/pytorch-lightning/pull/7807)) +- Fixed `LearningRateMonitor` and `BackboneFinetuning` ([#7835](https://github.com/Lightning-AI/lightning/pull/7835)) +- Minor improvements to `apply_to_collection` and type signature of `log_dict` ([#7851](https://github.com/Lightning-AI/lightning/pull/7851)) +- Fixed docker versions ([#7834](https://github.com/Lightning-AI/lightning/pull/7834)) +- Fixed sharded training check for fp16 precision ([#7825](https://github.com/Lightning-AI/lightning/pull/7825)) +- Fixed support for torch Module type hints in LightningCLI ([#7807](https://github.com/Lightning-AI/lightning/pull/7807)) ### Changed -- Move `training_output` validation to after `train_step_end` ([#7868](https://github.com/PyTorchLightning/pytorch-lightning/pull/7868)) +- Move `training_output` validation to after `train_step_end` ([#7868](https://github.com/Lightning-AI/lightning/pull/7868)) ## [1.3.4] - 2021-06-01 ### Fixed -- Fixed info message when max training time reached ([#7780](https://github.com/PyTorchLightning/pytorch-lightning/pull/7780)) -- Fixed missing `__len__` method to `IndexBatchSamplerWrapper` ([#7681](https://github.com/PyTorchLightning/pytorch-lightning/pull/7681)) +- Fixed info message when max training time reached ([#7780](https://github.com/Lightning-AI/lightning/pull/7780)) +- Fixed missing `__len__` method to `IndexBatchSamplerWrapper` 
([#7681](https://github.com/Lightning-AI/lightning/pull/7681)) ## [1.3.3] - 2021-05-27 ### Changed -- Changed calling of `untoggle_optimizer(opt_idx)` out of the closure function ([#7563](https://github.com/PyTorchLightning/pytorch-lightning/pull/7563)) +- Changed calling of `untoggle_optimizer(opt_idx)` out of the closure function ([#7563](https://github.com/Lightning-AI/lightning/pull/7563)) ### Fixed -- Fixed `ProgressBar` pickling after calling `trainer.predict` ([#7608](https://github.com/PyTorchLightning/pytorch-lightning/pull/7608)) -- Fixed broadcasting in multi-node, multi-gpu DDP using torch 1.7 ([#7592](https://github.com/PyTorchLightning/pytorch-lightning/pull/7592)) -- Fixed dataloaders are not reset when tuning the model ([#7566](https://github.com/PyTorchLightning/pytorch-lightning/pull/7566)) -- Fixed print errors in `ProgressBar` when `trainer.fit` is not called ([#7674](https://github.com/PyTorchLightning/pytorch-lightning/pull/7674)) -- Fixed global step update when the epoch is skipped ([#7677](https://github.com/PyTorchLightning/pytorch-lightning/pull/7677)) -- Fixed training loop total batch counter when accumulate grad batches was enabled ([#7692](https://github.com/PyTorchLightning/pytorch-lightning/pull/7692)) +- Fixed `ProgressBar` pickling after calling `trainer.predict` ([#7608](https://github.com/Lightning-AI/lightning/pull/7608)) +- Fixed broadcasting in multi-node, multi-gpu DDP using torch 1.7 ([#7592](https://github.com/Lightning-AI/lightning/pull/7592)) +- Fixed dataloaders are not reset when tuning the model ([#7566](https://github.com/Lightning-AI/lightning/pull/7566)) +- Fixed print errors in `ProgressBar` when `trainer.fit` is not called ([#7674](https://github.com/Lightning-AI/lightning/pull/7674)) +- Fixed global step update when the epoch is skipped ([#7677](https://github.com/Lightning-AI/lightning/pull/7677)) +- Fixed training loop total batch counter when accumulate grad batches was enabled ([#7692](https://github.com/Lightning-AI/lightning/pull/7692)) ## [1.3.2] - 2021-05-18 ### Changed -- `DataModule`s now avoid duplicate `{setup,teardown,prepare_data}` calls for the same stage ([#7238](https://github.com/PyTorchLightning/pytorch-lightning/pull/7238)) +- `DataModule`s now avoid duplicate `{setup,teardown,prepare_data}` calls for the same stage ([#7238](https://github.com/Lightning-AI/lightning/pull/7238)) ### Fixed -- Fixed parsing of multiple training dataloaders ([#7433](https://github.com/PyTorchLightning/pytorch-lightning/pull/7433)) -- Fixed recursive passing of `wrong_type` keyword argument in `pytorch_lightning.utilities.apply_to_collection` ([#7433](https://github.com/PyTorchLightning/pytorch-lightning/pull/7433)) -- Fixed setting correct `DistribType` for `ddp_cpu` (spawn) backend ([#7492](https://github.com/PyTorchLightning/pytorch-lightning/pull/7492)) -- Fixed incorrect number of calls to LR scheduler when `check_val_every_n_epoch > 1` ([#7032](https://github.com/PyTorchLightning/pytorch-lightning/pull/7032)) +- Fixed parsing of multiple training dataloaders ([#7433](https://github.com/Lightning-AI/lightning/pull/7433)) +- Fixed recursive passing of `wrong_type` keyword argument in `pytorch_lightning.utilities.apply_to_collection` ([#7433](https://github.com/Lightning-AI/lightning/pull/7433)) +- Fixed setting correct `DistribType` for `ddp_cpu` (spawn) backend ([#7492](https://github.com/Lightning-AI/lightning/pull/7492)) +- Fixed incorrect number of calls to LR scheduler when `check_val_every_n_epoch > 1` 
([#7032](https://github.com/Lightning-AI/lightning/pull/7032)) ## [1.3.1] - 2021-05-11 ### Fixed -- Fixed DeepSpeed with IterableDatasets ([#7362](https://github.com/PyTorchLightning/pytorch-lightning/pull/7362)) -- Fixed `Trainer.current_epoch` not getting restored after tuning ([#7434](https://github.com/PyTorchLightning/pytorch-lightning/pull/7434)) -- Fixed local rank displayed in console log ([#7395](https://github.com/PyTorchLightning/pytorch-lightning/pull/7395)) +- Fixed DeepSpeed with IterableDatasets ([#7362](https://github.com/Lightning-AI/lightning/pull/7362)) +- Fixed `Trainer.current_epoch` not getting restored after tuning ([#7434](https://github.com/Lightning-AI/lightning/pull/7434)) +- Fixed local rank displayed in console log ([#7395](https://github.com/Lightning-AI/lightning/pull/7395)) ## [1.3.0] - 2021-05-06 ### Added -- Added support for the `EarlyStopping` callback to run at the end of the training epoch ([#6944](https://github.com/PyTorchLightning/pytorch-lightning/pull/6944)) -- Added synchronization points before and after `setup` hooks are run ([#7202](https://github.com/PyTorchLightning/pytorch-lightning/pull/7202)) -- Added a `teardown` hook to `ClusterEnvironment` ([#6942](https://github.com/PyTorchLightning/pytorch-lightning/pull/6942)) -- Added utils for metrics to scalar conversions ([#7180](https://github.com/PyTorchLightning/pytorch-lightning/pull/7180)) -- Added utils for NaN/Inf detection for gradients and parameters ([#6834](https://github.com/PyTorchLightning/pytorch-lightning/pull/6834)) -- Added more explicit exception message when trying to execute `trainer.test()` or `trainer.validate()` with `fast_dev_run=True` ([#6667](https://github.com/PyTorchLightning/pytorch-lightning/pull/6667)) +- Added support for the `EarlyStopping` callback to run at the end of the training epoch ([#6944](https://github.com/Lightning-AI/lightning/pull/6944)) +- Added synchronization points before and after `setup` hooks are run ([#7202](https://github.com/Lightning-AI/lightning/pull/7202)) +- Added a `teardown` hook to `ClusterEnvironment` ([#6942](https://github.com/Lightning-AI/lightning/pull/6942)) +- Added utils for metrics to scalar conversions ([#7180](https://github.com/Lightning-AI/lightning/pull/7180)) +- Added utils for NaN/Inf detection for gradients and parameters ([#6834](https://github.com/Lightning-AI/lightning/pull/6834)) +- Added more explicit exception message when trying to execute `trainer.test()` or `trainer.validate()` with `fast_dev_run=True` ([#6667](https://github.com/Lightning-AI/lightning/pull/6667)) - Added `LightningCLI` class to provide simple reproducibility with minimum boilerplate training CLI ( - [#4492](https://github.com/PyTorchLightning/pytorch-lightning/pull/4492), - [#6862](https://github.com/PyTorchLightning/pytorch-lightning/pull/6862), - [#7156](https://github.com/PyTorchLightning/pytorch-lightning/pull/7156), - [#7299](https://github.com/PyTorchLightning/pytorch-lightning/pull/7299)) -- Added `gradient_clip_algorithm` argument to Trainer for gradient clipping by value ([#6123](https://github.com/PyTorchLightning/pytorch-lightning/pull/6123)). 
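For context on the `gradient_clip_algorithm` entry above, the following is a minimal sketch of how the option is passed to the `Trainer`; the clip value, `max_epochs`, and the `BoringModel` demo import are illustrative assumptions rather than part of the changelog:

```python
import pytorch_lightning as pl
from pytorch_lightning.demos.boring_classes import BoringModel  # demo model bundled with recent releases

# Clip each gradient element to the given threshold instead of rescaling by the total norm.
trainer = pl.Trainer(
    max_epochs=1,
    gradient_clip_val=0.5,            # clipping threshold (illustrative value)
    gradient_clip_algorithm="value",  # the default is "norm"
)
trainer.fit(BoringModel())
```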
-- Added a way to print to terminal without breaking up the progress bar ([#5470](https://github.com/PyTorchLightning/pytorch-lightning/pull/5470)) -- Added support to checkpoint after training steps in `ModelCheckpoint` callback ([#6146](https://github.com/PyTorchLightning/pytorch-lightning/pull/6146)) -- Added `TrainerStatus.{INITIALIZING,RUNNING,FINISHED,INTERRUPTED}` ([#7173](https://github.com/PyTorchLightning/pytorch-lightning/pull/7173)) -- Added `Trainer.validate()` method to perform one evaluation epoch over the validation set ([#4948](https://github.com/PyTorchLightning/pytorch-lightning/pull/4948)) -- Added `LightningEnvironment` for Lightning-specific DDP ([#5915](https://github.com/PyTorchLightning/pytorch-lightning/pull/5915)) -- Added `teardown()` hook to LightningDataModule ([#4673](https://github.com/PyTorchLightning/pytorch-lightning/pull/4673)) -- Added `auto_insert_metric_name` parameter to `ModelCheckpoint` ([#6277](https://github.com/PyTorchLightning/pytorch-lightning/pull/6277)) -- Added arg to `self.log` that enables users to give custom names when dealing with multiple dataloaders ([#6274](https://github.com/PyTorchLightning/pytorch-lightning/pull/6274)) -- Added `teardown` method to `BaseProfiler` to enable subclasses defining post-profiling steps outside of `__del__` ([#6370](https://github.com/PyTorchLightning/pytorch-lightning/pull/6370)) -- Added `setup` method to `BaseProfiler` to enable subclasses defining pre-profiling steps for every process ([#6633](https://github.com/PyTorchLightning/pytorch-lightning/pull/6633)) -- Added no return warning to predict ([#6139](https://github.com/PyTorchLightning/pytorch-lightning/pull/6139)) -- Added `Trainer.predict` config validation ([#6543](https://github.com/PyTorchLightning/pytorch-lightning/pull/6543)) -- Added `AbstractProfiler` interface ([#6621](https://github.com/PyTorchLightning/pytorch-lightning/pull/6621)) -- Added support for including module names for forward in the autograd trace of `PyTorchProfiler` ([#6349](https://github.com/PyTorchLightning/pytorch-lightning/pull/6349)) -- Added support for the PyTorch 1.8.1 autograd profiler ([#6618](https://github.com/PyTorchLightning/pytorch-lightning/pull/6618)) -- Added `outputs` parameter to callback's `on_validation_epoch_end` & `on_test_epoch_end` hooks ([#6120](https://github.com/PyTorchLightning/pytorch-lightning/pull/6120)) -- Added `configure_sharded_model` hook ([#6679](https://github.com/PyTorchLightning/pytorch-lightning/pull/6679)) -- Added support for `precision=64`, enabling training with double precision ([#6595](https://github.com/PyTorchLightning/pytorch-lightning/pull/6595)) -- Added support for DDP communication hooks ([#6736](https://github.com/PyTorchLightning/pytorch-lightning/pull/6736)) -- Added `artifact_location` argument to `MLFlowLogger` which will be passed to the `MlflowClient.create_experiment` call ([#6677](https://github.com/PyTorchLightning/pytorch-lightning/pull/6677)) + [#4492](https://github.com/Lightning-AI/lightning/pull/4492), + [#6862](https://github.com/Lightning-AI/lightning/pull/6862), + [#7156](https://github.com/Lightning-AI/lightning/pull/7156), + [#7299](https://github.com/Lightning-AI/lightning/pull/7299)) +- Added `gradient_clip_algorithm` argument to Trainer for gradient clipping by value ([#6123](https://github.com/Lightning-AI/lightning/pull/6123)). 
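Similarly, a minimal sketch of the `LightningCLI` usage referenced in the entries above; the script name, the `fit` subcommand shown in the comment, and the `BoringModel` demo import are assumptions for illustration (older releases expose the class under `pytorch_lightning.utilities.cli`):

```python
# Hypothetical training script, e.g. train.py
from pytorch_lightning.cli import LightningCLI  # pytorch_lightning.utilities.cli in older releases
from pytorch_lightning.demos.boring_classes import BoringModel

if __name__ == "__main__":
    # Builds a command-line interface from the model and Trainer signatures, e.g.:
    #   python train.py fit --trainer.max_epochs=1
    LightningCLI(BoringModel)
```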
+- Added a way to print to terminal without breaking up the progress bar ([#5470](https://github.com/Lightning-AI/lightning/pull/5470)) +- Added support to checkpoint after training steps in `ModelCheckpoint` callback ([#6146](https://github.com/Lightning-AI/lightning/pull/6146)) +- Added `TrainerStatus.{INITIALIZING,RUNNING,FINISHED,INTERRUPTED}` ([#7173](https://github.com/Lightning-AI/lightning/pull/7173)) +- Added `Trainer.validate()` method to perform one evaluation epoch over the validation set ([#4948](https://github.com/Lightning-AI/lightning/pull/4948)) +- Added `LightningEnvironment` for Lightning-specific DDP ([#5915](https://github.com/Lightning-AI/lightning/pull/5915)) +- Added `teardown()` hook to LightningDataModule ([#4673](https://github.com/Lightning-AI/lightning/pull/4673)) +- Added `auto_insert_metric_name` parameter to `ModelCheckpoint` ([#6277](https://github.com/Lightning-AI/lightning/pull/6277)) +- Added arg to `self.log` that enables users to give custom names when dealing with multiple dataloaders ([#6274](https://github.com/Lightning-AI/lightning/pull/6274)) +- Added `teardown` method to `BaseProfiler` to enable subclasses defining post-profiling steps outside of `__del__` ([#6370](https://github.com/Lightning-AI/lightning/pull/6370)) +- Added `setup` method to `BaseProfiler` to enable subclasses defining pre-profiling steps for every process ([#6633](https://github.com/Lightning-AI/lightning/pull/6633)) +- Added no return warning to predict ([#6139](https://github.com/Lightning-AI/lightning/pull/6139)) +- Added `Trainer.predict` config validation ([#6543](https://github.com/Lightning-AI/lightning/pull/6543)) +- Added `AbstractProfiler` interface ([#6621](https://github.com/Lightning-AI/lightning/pull/6621)) +- Added support for including module names for forward in the autograd trace of `PyTorchProfiler` ([#6349](https://github.com/Lightning-AI/lightning/pull/6349)) +- Added support for the PyTorch 1.8.1 autograd profiler ([#6618](https://github.com/Lightning-AI/lightning/pull/6618)) +- Added `outputs` parameter to callback's `on_validation_epoch_end` & `on_test_epoch_end` hooks ([#6120](https://github.com/Lightning-AI/lightning/pull/6120)) +- Added `configure_sharded_model` hook ([#6679](https://github.com/Lightning-AI/lightning/pull/6679)) +- Added support for `precision=64`, enabling training with double precision ([#6595](https://github.com/Lightning-AI/lightning/pull/6595)) +- Added support for DDP communication hooks ([#6736](https://github.com/Lightning-AI/lightning/pull/6736)) +- Added `artifact_location` argument to `MLFlowLogger` which will be passed to the `MlflowClient.create_experiment` call ([#6677](https://github.com/Lightning-AI/lightning/pull/6677)) - Added `model` parameter to precision plugins' `clip_gradients` signature ( - [#6764](https://github.com/PyTorchLightning/pytorch-lightning/pull/6764), - [#7231](https://github.com/PyTorchLightning/pytorch-lightning/pull/7231)) -- Added `is_last_batch` attribute to `Trainer` ([#6825](https://github.com/PyTorchLightning/pytorch-lightning/pull/6825)) -- Added `LightningModule.lr_schedulers()` for manual optimization ([#6567](https://github.com/PyTorchLightning/pytorch-lightning/pull/6567)) -- Added `MpModelWrapper` in TPU Spawn ([#7045](https://github.com/PyTorchLightning/pytorch-lightning/pull/7045)) -- Added `max_time` Trainer argument to limit training time ([#6823](https://github.com/PyTorchLightning/pytorch-lightning/pull/6823)) -- Added `on_predict_{batch,epoch}_{start,end}` hooks 
([#7141](https://github.com/PyTorchLightning/pytorch-lightning/pull/7141)) -- Added new `EarlyStopping` parameters `stopping_threshold` and `divergence_threshold` ([#6868](https://github.com/PyTorchLightning/pytorch-lightning/pull/6868)) -- Added `debug` flag to TPU Training Plugins (PT_XLA_DEBUG) ([#7219](https://github.com/PyTorchLightning/pytorch-lightning/pull/7219)) -- Added new `UnrepeatedDistributedSampler` and `IndexBatchSamplerWrapper` for tracking distributed predictions ([#7215](https://github.com/PyTorchLightning/pytorch-lightning/pull/7215)) -- Added `trainer.predict(return_predictions=None|False|True)` ([#7215](https://github.com/PyTorchLightning/pytorch-lightning/pull/7215)) -- Added `BasePredictionWriter` callback to implement prediction saving ([#7127](https://github.com/PyTorchLightning/pytorch-lightning/pull/7127)) -- Added `trainer.tune(scale_batch_size_kwargs, lr_find_kwargs)` arguments to configure the tuning algorithms ([#7258](https://github.com/PyTorchLightning/pytorch-lightning/pull/7258)) -- Added `tpu_distributed` check for TPU Spawn barrier ([#7241](https://github.com/PyTorchLightning/pytorch-lightning/pull/7241)) -- Added device updates to TPU Spawn for Pod training ([#7243](https://github.com/PyTorchLightning/pytorch-lightning/pull/7243)) -- Added warning when missing `Callback` and using `resume_from_checkpoint` ([#7254](https://github.com/PyTorchLightning/pytorch-lightning/pull/7254)) -- DeepSpeed single file saving ([#6900](https://github.com/PyTorchLightning/pytorch-lightning/pull/6900)) + [#6764](https://github.com/Lightning-AI/lightning/pull/6764), + [#7231](https://github.com/Lightning-AI/lightning/pull/7231)) +- Added `is_last_batch` attribute to `Trainer` ([#6825](https://github.com/Lightning-AI/lightning/pull/6825)) +- Added `LightningModule.lr_schedulers()` for manual optimization ([#6567](https://github.com/Lightning-AI/lightning/pull/6567)) +- Added `MpModelWrapper` in TPU Spawn ([#7045](https://github.com/Lightning-AI/lightning/pull/7045)) +- Added `max_time` Trainer argument to limit training time ([#6823](https://github.com/Lightning-AI/lightning/pull/6823)) +- Added `on_predict_{batch,epoch}_{start,end}` hooks ([#7141](https://github.com/Lightning-AI/lightning/pull/7141)) +- Added new `EarlyStopping` parameters `stopping_threshold` and `divergence_threshold` ([#6868](https://github.com/Lightning-AI/lightning/pull/6868)) +- Added `debug` flag to TPU Training Plugins (PT_XLA_DEBUG) ([#7219](https://github.com/Lightning-AI/lightning/pull/7219)) +- Added new `UnrepeatedDistributedSampler` and `IndexBatchSamplerWrapper` for tracking distributed predictions ([#7215](https://github.com/Lightning-AI/lightning/pull/7215)) +- Added `trainer.predict(return_predictions=None|False|True)` ([#7215](https://github.com/Lightning-AI/lightning/pull/7215)) +- Added `BasePredictionWriter` callback to implement prediction saving ([#7127](https://github.com/Lightning-AI/lightning/pull/7127)) +- Added `trainer.tune(scale_batch_size_kwargs, lr_find_kwargs)` arguments to configure the tuning algorithms ([#7258](https://github.com/Lightning-AI/lightning/pull/7258)) +- Added `tpu_distributed` check for TPU Spawn barrier ([#7241](https://github.com/Lightning-AI/lightning/pull/7241)) +- Added device updates to TPU Spawn for Pod training ([#7243](https://github.com/Lightning-AI/lightning/pull/7243)) +- Added warning when missing `Callback` and using `resume_from_checkpoint` ([#7254](https://github.com/Lightning-AI/lightning/pull/7254)) +- DeepSpeed single file saving 
([#6900](https://github.com/Lightning-AI/lightning/pull/6900)) - Added Training type Plugins Registry ( - [#6982](https://github.com/PyTorchLightning/pytorch-lightning/pull/6982), - [#7063](https://github.com/PyTorchLightning/pytorch-lightning/pull/7063), - [#7214](https://github.com/PyTorchLightning/pytorch-lightning/pull/7214), - [#7224](https://github.com/PyTorchLightning/pytorch-lightning/pull/7224) + [#6982](https://github.com/Lightning-AI/lightning/pull/6982), + [#7063](https://github.com/Lightning-AI/lightning/pull/7063), + [#7214](https://github.com/Lightning-AI/lightning/pull/7214), + [#7224](https://github.com/Lightning-AI/lightning/pull/7224) ) -- Add `ignore` param to `save_hyperparameters` ([#6056](https://github.com/PyTorchLightning/pytorch-lightning/pull/6056)) +- Add `ignore` param to `save_hyperparameters` ([#6056](https://github.com/Lightning-AI/lightning/pull/6056)) ### Changed -- Changed `LightningModule.truncated_bptt_steps` to be property ([#7323](https://github.com/PyTorchLightning/pytorch-lightning/pull/7323)) -- Changed `EarlyStopping` callback from by default running `EarlyStopping.on_validation_end` if only training is run. Set `check_on_train_epoch_end` to run the callback at the end of the train epoch instead of at the end of the validation epoch ([#7069](https://github.com/PyTorchLightning/pytorch-lightning/pull/7069)) -- Renamed `pytorch_lightning.callbacks.swa` to `pytorch_lightning.callbacks.stochastic_weight_avg` ([#6259](https://github.com/PyTorchLightning/pytorch-lightning/pull/6259)) +- Changed `LightningModule.truncated_bptt_steps` to be property ([#7323](https://github.com/Lightning-AI/lightning/pull/7323)) +- Changed `EarlyStopping` callback from by default running `EarlyStopping.on_validation_end` if only training is run. 
Set `check_on_train_epoch_end` to run the callback at the end of the train epoch instead of at the end of the validation epoch ([#7069](https://github.com/Lightning-AI/lightning/pull/7069)) +- Renamed `pytorch_lightning.callbacks.swa` to `pytorch_lightning.callbacks.stochastic_weight_avg` ([#6259](https://github.com/Lightning-AI/lightning/pull/6259)) - Refactor `RunningStage` and `TrainerState` usage ( - [#4945](https://github.com/PyTorchLightning/pytorch-lightning/pull/4945), - [#7173](https://github.com/PyTorchLightning/pytorch-lightning/pull/7173)) + [#4945](https://github.com/Lightning-AI/lightning/pull/4945), + [#7173](https://github.com/Lightning-AI/lightning/pull/7173)) * Added `RunningStage.SANITY_CHECKING` * Added `TrainerFn.{FITTING,VALIDATING,TESTING,PREDICTING,TUNING}` * Changed `trainer.evaluating` to return `True` if validating or testing -- Changed `setup()` and `teardown()` stage argument to take any of `{fit,validate,test,predict}` ([#6386](https://github.com/PyTorchLightning/pytorch-lightning/pull/6386)) -- Changed profilers to save separate report files per state and rank ([#6621](https://github.com/PyTorchLightning/pytorch-lightning/pull/6621)) -- The trainer no longer tries to save a checkpoint on exception or run callback's `on_train_end` functions ([#6864](https://github.com/PyTorchLightning/pytorch-lightning/pull/6864)) -- Changed `PyTorchProfiler` to use `torch.autograd.profiler.record_function` to record functions ([#6349](https://github.com/PyTorchLightning/pytorch-lightning/pull/6349)) -- Disabled `lr_scheduler.step()` in manual optimization ([#6825](https://github.com/PyTorchLightning/pytorch-lightning/pull/6825)) -- Changed warnings and recommendations for dataloaders in `ddp_spawn` ([#6762](https://github.com/PyTorchLightning/pytorch-lightning/pull/6762)) -- `pl.seed_everything` will now also set the seed on the `DistributedSampler` ([#7024](https://github.com/PyTorchLightning/pytorch-lightning/pull/7024)) -- Changed default setting for communication of multi-node training using `DDPShardedPlugin` ([#6937](https://github.com/PyTorchLightning/pytorch-lightning/pull/6937)) -- `trainer.tune()` now returns the tuning result ([#7258](https://github.com/PyTorchLightning/pytorch-lightning/pull/7258)) -- `LightningModule.from_datasets()` now accepts `IterableDataset` instances as training datasets. 
([#7503](https://github.com/PyTorchLightning/pytorch-lightning/pull/7503)) -- Changed `resume_from_checkpoint` warning to an error when the checkpoint file does not exist ([#7075](https://github.com/PyTorchLightning/pytorch-lightning/pull/7075)) -- Automatically set `sync_batchnorm` for `training_type_plugin` ([#6536](https://github.com/PyTorchLightning/pytorch-lightning/pull/6536)) -- Allowed training type plugin to delay optimizer creation ([#6331](https://github.com/PyTorchLightning/pytorch-lightning/pull/6331)) -- Removed ModelSummary validation from train loop on_trainer_init ([#6610](https://github.com/PyTorchLightning/pytorch-lightning/pull/6610)) -- Moved `save_function` to accelerator ([#6689](https://github.com/PyTorchLightning/pytorch-lightning/pull/6689)) -- Updated DeepSpeed ZeRO ([#6546](https://github.com/PyTorchLightning/pytorch-lightning/pull/6546), - [#6752](https://github.com/PyTorchLightning/pytorch-lightning/pull/6752), - [#6142](https://github.com/PyTorchLightning/pytorch-lightning/pull/6142), - [#6321](https://github.com/PyTorchLightning/pytorch-lightning/pull/6321)) -- Improved verbose logging for `EarlyStopping` callback ([#6811](https://github.com/PyTorchLightning/pytorch-lightning/pull/6811)) -- Run ddp_spawn dataloader checks on Windows ([#6930](https://github.com/PyTorchLightning/pytorch-lightning/pull/6930)) -- Updated mlflow with using `resolve_tags` ([#6746](https://github.com/PyTorchLightning/pytorch-lightning/pull/6746)) -- Moved `save_hyperparameters` to its own function ([#7119](https://github.com/PyTorchLightning/pytorch-lightning/pull/7119)) -- Replaced `_DataModuleWrapper` with `__new__` ([#7289](https://github.com/PyTorchLightning/pytorch-lightning/pull/7289)) -- Reset `current_fx` properties on lightning module in teardown ([#7247](https://github.com/PyTorchLightning/pytorch-lightning/pull/7247)) -- Auto-set `DataLoader.worker_init_fn` with `seed_everything` ([#6960](https://github.com/PyTorchLightning/pytorch-lightning/pull/6960)) -- Remove `model.trainer` call inside of dataloading mixin ([#7317](https://github.com/PyTorchLightning/pytorch-lightning/pull/7317)) -- Split profilers module ([#6261](https://github.com/PyTorchLightning/pytorch-lightning/pull/6261)) -- Ensure accelerator is valid if running interactively ([#5970](https://github.com/PyTorchLightning/pytorch-lightning/pull/5970)) -- Disabled batch transfer in DP mode ([#6098](https://github.com/PyTorchLightning/pytorch-lightning/pull/6098)) +- Changed `setup()` and `teardown()` stage argument to take any of `{fit,validate,test,predict}` ([#6386](https://github.com/Lightning-AI/lightning/pull/6386)) +- Changed profilers to save separate report files per state and rank ([#6621](https://github.com/Lightning-AI/lightning/pull/6621)) +- The trainer no longer tries to save a checkpoint on exception or run callback's `on_train_end` functions ([#6864](https://github.com/Lightning-AI/lightning/pull/6864)) +- Changed `PyTorchProfiler` to use `torch.autograd.profiler.record_function` to record functions ([#6349](https://github.com/Lightning-AI/lightning/pull/6349)) +- Disabled `lr_scheduler.step()` in manual optimization ([#6825](https://github.com/Lightning-AI/lightning/pull/6825)) +- Changed warnings and recommendations for dataloaders in `ddp_spawn` ([#6762](https://github.com/Lightning-AI/lightning/pull/6762)) +- `pl.seed_everything` will now also set the seed on the `DistributedSampler` ([#7024](https://github.com/Lightning-AI/lightning/pull/7024)) +- Changed default setting for communication 
of multi-node training using `DDPShardedPlugin` ([#6937](https://github.com/Lightning-AI/lightning/pull/6937)) +- `trainer.tune()` now returns the tuning result ([#7258](https://github.com/Lightning-AI/lightning/pull/7258)) +- `LightningModule.from_datasets()` now accepts `IterableDataset` instances as training datasets. ([#7503](https://github.com/Lightning-AI/lightning/pull/7503)) +- Changed `resume_from_checkpoint` warning to an error when the checkpoint file does not exist ([#7075](https://github.com/Lightning-AI/lightning/pull/7075)) +- Automatically set `sync_batchnorm` for `training_type_plugin` ([#6536](https://github.com/Lightning-AI/lightning/pull/6536)) +- Allowed training type plugin to delay optimizer creation ([#6331](https://github.com/Lightning-AI/lightning/pull/6331)) +- Removed ModelSummary validation from train loop on_trainer_init ([#6610](https://github.com/Lightning-AI/lightning/pull/6610)) +- Moved `save_function` to accelerator ([#6689](https://github.com/Lightning-AI/lightning/pull/6689)) +- Updated DeepSpeed ZeRO ([#6546](https://github.com/Lightning-AI/lightning/pull/6546), + [#6752](https://github.com/Lightning-AI/lightning/pull/6752), + [#6142](https://github.com/Lightning-AI/lightning/pull/6142), + [#6321](https://github.com/Lightning-AI/lightning/pull/6321)) +- Improved verbose logging for `EarlyStopping` callback ([#6811](https://github.com/Lightning-AI/lightning/pull/6811)) +- Run ddp_spawn dataloader checks on Windows ([#6930](https://github.com/Lightning-AI/lightning/pull/6930)) +- Updated mlflow with using `resolve_tags` ([#6746](https://github.com/Lightning-AI/lightning/pull/6746)) +- Moved `save_hyperparameters` to its own function ([#7119](https://github.com/Lightning-AI/lightning/pull/7119)) +- Replaced `_DataModuleWrapper` with `__new__` ([#7289](https://github.com/Lightning-AI/lightning/pull/7289)) +- Reset `current_fx` properties on lightning module in teardown ([#7247](https://github.com/Lightning-AI/lightning/pull/7247)) +- Auto-set `DataLoader.worker_init_fn` with `seed_everything` ([#6960](https://github.com/Lightning-AI/lightning/pull/6960)) +- Remove `model.trainer` call inside of dataloading mixin ([#7317](https://github.com/Lightning-AI/lightning/pull/7317)) +- Split profilers module ([#6261](https://github.com/Lightning-AI/lightning/pull/6261)) +- Ensure accelerator is valid if running interactively ([#5970](https://github.com/Lightning-AI/lightning/pull/5970)) +- Disabled batch transfer in DP mode ([#6098](https://github.com/Lightning-AI/lightning/pull/6098)) ### Deprecated -- Deprecated `outputs` in both `LightningModule.on_train_epoch_end` and `Callback.on_train_epoch_end` hooks ([#7339](https://github.com/PyTorchLightning/pytorch-lightning/pull/7339)) -- Deprecated `Trainer.truncated_bptt_steps` in favor of `LightningModule.truncated_bptt_steps` ([#7323](https://github.com/PyTorchLightning/pytorch-lightning/pull/7323)) -- Deprecated `outputs` in both `LightningModule.on_train_epoch_end` and `Callback.on_train_epoch_end` hooks ([#7339](https://github.com/PyTorchLightning/pytorch-lightning/pull/7339)) -- Deprecated `LightningModule.grad_norm` in favor of `pytorch_lightning.utilities.grads.grad_norm` ([#7292](https://github.com/PyTorchLightning/pytorch-lightning/pull/7292)) -- Deprecated the `save_function` property from the `ModelCheckpoint` callback ([#7201](https://github.com/PyTorchLightning/pytorch-lightning/pull/7201)) -- Deprecated `LightningModule.write_predictions` and `LightningModule.write_predictions_dict` 
([#7066](https://github.com/PyTorchLightning/pytorch-lightning/pull/7066)) -- Deprecated `TrainerLoggingMixin` in favor of a separate utilities module for metric handling ([#7180](https://github.com/PyTorchLightning/pytorch-lightning/pull/7180)) -- Deprecated `TrainerTrainingTricksMixin` in favor of a separate utilities module for NaN/Inf detection for gradients and parameters ([#6834](https://github.com/PyTorchLightning/pytorch-lightning/pull/6834)) -- `period` has been deprecated in favor of `every_n_val_epochs` in the `ModelCheckpoint` callback ([#6146](https://github.com/PyTorchLightning/pytorch-lightning/pull/6146)) -- Deprecated `trainer.running_sanity_check` in favor of `trainer.sanity_checking` ([#4945](https://github.com/PyTorchLightning/pytorch-lightning/pull/4945)) -- Deprecated `Profiler(output_filename)` in favor of `dirpath` and `filename` ([#6621](https://github.com/PyTorchLightning/pytorch-lightning/pull/6621)) -- Deprecated `PyTorchProfiler(profiled_functions)` in favor of `record_functions` ([#6349](https://github.com/PyTorchLightning/pytorch-lightning/pull/6349)) -- Deprecated `@auto_move_data` in favor of `trainer.predict` ([#6993](https://github.com/PyTorchLightning/pytorch-lightning/pull/6993)) -- Deprecated `Callback.on_load_checkpoint(checkpoint)` in favor of `Callback.on_load_checkpoint(trainer, pl_module, checkpoint)` ([#7253](https://github.com/PyTorchLightning/pytorch-lightning/pull/7253)) +- Deprecated `outputs` in both `LightningModule.on_train_epoch_end` and `Callback.on_train_epoch_end` hooks ([#7339](https://github.com/Lightning-AI/lightning/pull/7339)) +- Deprecated `Trainer.truncated_bptt_steps` in favor of `LightningModule.truncated_bptt_steps` ([#7323](https://github.com/Lightning-AI/lightning/pull/7323)) +- Deprecated `outputs` in both `LightningModule.on_train_epoch_end` and `Callback.on_train_epoch_end` hooks ([#7339](https://github.com/Lightning-AI/lightning/pull/7339)) +- Deprecated `LightningModule.grad_norm` in favor of `pytorch_lightning.utilities.grads.grad_norm` ([#7292](https://github.com/Lightning-AI/lightning/pull/7292)) +- Deprecated the `save_function` property from the `ModelCheckpoint` callback ([#7201](https://github.com/Lightning-AI/lightning/pull/7201)) +- Deprecated `LightningModule.write_predictions` and `LightningModule.write_predictions_dict` ([#7066](https://github.com/Lightning-AI/lightning/pull/7066)) +- Deprecated `TrainerLoggingMixin` in favor of a separate utilities module for metric handling ([#7180](https://github.com/Lightning-AI/lightning/pull/7180)) +- Deprecated `TrainerTrainingTricksMixin` in favor of a separate utilities module for NaN/Inf detection for gradients and parameters ([#6834](https://github.com/Lightning-AI/lightning/pull/6834)) +- `period` has been deprecated in favor of `every_n_val_epochs` in the `ModelCheckpoint` callback ([#6146](https://github.com/Lightning-AI/lightning/pull/6146)) +- Deprecated `trainer.running_sanity_check` in favor of `trainer.sanity_checking` ([#4945](https://github.com/Lightning-AI/lightning/pull/4945)) +- Deprecated `Profiler(output_filename)` in favor of `dirpath` and `filename` ([#6621](https://github.com/Lightning-AI/lightning/pull/6621)) +- Deprecated `PyTorchProfiler(profiled_functions)` in favor of `record_functions` ([#6349](https://github.com/Lightning-AI/lightning/pull/6349)) +- Deprecated `@auto_move_data` in favor of `trainer.predict` ([#6993](https://github.com/Lightning-AI/lightning/pull/6993)) +- Deprecated `Callback.on_load_checkpoint(checkpoint)` in favor of 
`Callback.on_load_checkpoint(trainer, pl_module, checkpoint)` ([#7253](https://github.com/Lightning-AI/lightning/pull/7253)) - Deprecated metrics in favor of `torchmetrics` ( - [#6505](https://github.com/PyTorchLightning/pytorch-lightning/pull/6505), - [#6530](https://github.com/PyTorchLightning/pytorch-lightning/pull/6530), - [#6540](https://github.com/PyTorchLightning/pytorch-lightning/pull/6540), - [#6547](https://github.com/PyTorchLightning/pytorch-lightning/pull/6547), - [#6515](https://github.com/PyTorchLightning/pytorch-lightning/pull/6515), - [#6572](https://github.com/PyTorchLightning/pytorch-lightning/pull/6572), - [#6573](https://github.com/PyTorchLightning/pytorch-lightning/pull/6573), - [#6584](https://github.com/PyTorchLightning/pytorch-lightning/pull/6584), - [#6636](https://github.com/PyTorchLightning/pytorch-lightning/pull/6636), - [#6637](https://github.com/PyTorchLightning/pytorch-lightning/pull/6637), - [#6649](https://github.com/PyTorchLightning/pytorch-lightning/pull/6649), - [#6659](https://github.com/PyTorchLightning/pytorch-lightning/pull/6659), - [#7131](https://github.com/PyTorchLightning/pytorch-lightning/pull/7131), + [#6505](https://github.com/Lightning-AI/lightning/pull/6505), + [#6530](https://github.com/Lightning-AI/lightning/pull/6530), + [#6540](https://github.com/Lightning-AI/lightning/pull/6540), + [#6547](https://github.com/Lightning-AI/lightning/pull/6547), + [#6515](https://github.com/Lightning-AI/lightning/pull/6515), + [#6572](https://github.com/Lightning-AI/lightning/pull/6572), + [#6573](https://github.com/Lightning-AI/lightning/pull/6573), + [#6584](https://github.com/Lightning-AI/lightning/pull/6584), + [#6636](https://github.com/Lightning-AI/lightning/pull/6636), + [#6637](https://github.com/Lightning-AI/lightning/pull/6637), + [#6649](https://github.com/Lightning-AI/lightning/pull/6649), + [#6659](https://github.com/Lightning-AI/lightning/pull/6659), + [#7131](https://github.com/Lightning-AI/lightning/pull/7131), ) -- Deprecated the `LightningModule.datamodule` getter and setter methods; access them through `Trainer.datamodule` instead ([#7168](https://github.com/PyTorchLightning/pytorch-lightning/pull/7168)) -- Deprecated the use of `Trainer(gpus="i")` (string) for selecting the i-th GPU; from v1.5 this will set the number of GPUs instead of the index ([#6388](https://github.com/PyTorchLightning/pytorch-lightning/pull/6388)) +- Deprecated the `LightningModule.datamodule` getter and setter methods; access them through `Trainer.datamodule` instead ([#7168](https://github.com/Lightning-AI/lightning/pull/7168)) +- Deprecated the use of `Trainer(gpus="i")` (string) for selecting the i-th GPU; from v1.5 this will set the number of GPUs instead of the index ([#6388](https://github.com/Lightning-AI/lightning/pull/6388)) ### Removed -- Removed the `exp_save_path` property from the `LightningModule` ([#7266](https://github.com/PyTorchLightning/pytorch-lightning/pull/7266)) -- Removed training loop explicitly calling `EarlyStopping.on_validation_end` if no validation is run ([#7069](https://github.com/PyTorchLightning/pytorch-lightning/pull/7069)) -- Removed `automatic_optimization` as a property from the training loop in favor of `LightningModule.automatic_optimization` ([#7130](https://github.com/PyTorchLightning/pytorch-lightning/pull/7130)) -- Removed evaluation loop legacy returns for `*_epoch_end` hooks ([#6973](https://github.com/PyTorchLightning/pytorch-lightning/pull/6973)) -- Removed support for passing a bool value to `profiler` argument of 
Trainer ([#6164](https://github.com/PyTorchLightning/pytorch-lightning/pull/6164)) -- Removed no return warning from val/test step ([#6139](https://github.com/PyTorchLightning/pytorch-lightning/pull/6139)) -- Removed passing a `ModelCheckpoint` instance to `Trainer(checkpoint_callback)` ([#6166](https://github.com/PyTorchLightning/pytorch-lightning/pull/6166)) -- Removed deprecated Trainer argument `enable_pl_optimizer` and `automatic_optimization` ([#6163](https://github.com/PyTorchLightning/pytorch-lightning/pull/6163)) -- Removed deprecated metrics ([#6161](https://github.com/PyTorchLightning/pytorch-lightning/pull/6161)) +- Removed the `exp_save_path` property from the `LightningModule` ([#7266](https://github.com/Lightning-AI/lightning/pull/7266)) +- Removed training loop explicitly calling `EarlyStopping.on_validation_end` if no validation is run ([#7069](https://github.com/Lightning-AI/lightning/pull/7069)) +- Removed `automatic_optimization` as a property from the training loop in favor of `LightningModule.automatic_optimization` ([#7130](https://github.com/Lightning-AI/lightning/pull/7130)) +- Removed evaluation loop legacy returns for `*_epoch_end` hooks ([#6973](https://github.com/Lightning-AI/lightning/pull/6973)) +- Removed support for passing a bool value to `profiler` argument of Trainer ([#6164](https://github.com/Lightning-AI/lightning/pull/6164)) +- Removed no return warning from val/test step ([#6139](https://github.com/Lightning-AI/lightning/pull/6139)) +- Removed passing a `ModelCheckpoint` instance to `Trainer(checkpoint_callback)` ([#6166](https://github.com/Lightning-AI/lightning/pull/6166)) +- Removed deprecated Trainer argument `enable_pl_optimizer` and `automatic_optimization` ([#6163](https://github.com/Lightning-AI/lightning/pull/6163)) +- Removed deprecated metrics ([#6161](https://github.com/Lightning-AI/lightning/pull/6161)) * from `pytorch_lightning.metrics.functional.classification` removed `to_onehot`, `to_categorical`, `get_num_classes`, `roc`, `multiclass_roc`, `average_precision`, `precision_recall_curve`, `multiclass_precision_recall_curve` * from `pytorch_lightning.metrics.functional.reduction` removed `reduce`, `class_reduce` -- Removed deprecated `ModelCheckpoint` arguments `prefix`, `mode="auto"` ([#6162](https://github.com/PyTorchLightning/pytorch-lightning/pull/6162)) -- Removed `mode='auto'` from `EarlyStopping` ([#6167](https://github.com/PyTorchLightning/pytorch-lightning/pull/6167)) -- Removed `epoch` and `step` arguments from `ModelCheckpoint.format_checkpoint_name()`, these are now included in the `metrics` argument ([#7344](https://github.com/PyTorchLightning/pytorch-lightning/pull/7344)) -- Removed legacy references for magic keys in the `Result` object ([#6016](https://github.com/PyTorchLightning/pytorch-lightning/pull/6016)) -- Removed deprecated `LightningModule` `hparams` setter ([#6207](https://github.com/PyTorchLightning/pytorch-lightning/pull/6207)) -- Removed legacy code to log or include metrics in the progress bar by returning them in a dict with the `"log"/"progress_bar"` magic keys. Use `self.log` instead ([#6734](https://github.com/PyTorchLightning/pytorch-lightning/pull/6734)) -- Removed `trainer.fit()` return value of `1`. 
It has no return now ([#7237](https://github.com/PyTorchLightning/pytorch-lightning/pull/7237)) -- Removed `logger_connector` legacy code ([#6733](https://github.com/PyTorchLightning/pytorch-lightning/pull/6733)) -- Removed unused mixin attributes ([#6487](https://github.com/PyTorchLightning/pytorch-lightning/pull/6487)) - -### Fixed - -- Fixed NaN errors in progress bars when training with iterable datasets with no length defined ([#7306](https://github.com/PyTorchLightning/pytorch-lightning/pull/7306)) -- Fixed attaching train and validation dataloaders when `reload_dataloaders_every_epoch=True` and `num_sanity_val_steps=0` ([#7207](https://github.com/PyTorchLightning/pytorch-lightning/pull/7207)) -- Added a barrier in the accelerator `teardown` to synchronize processes before execution finishes ([#6814](https://github.com/PyTorchLightning/pytorch-lightning/pull/6814)) -- Fixed multi-node DDP sub-process launch by using `local_rank` instead of `global_rank` for main process assertion ([#7061](https://github.com/PyTorchLightning/pytorch-lightning/pull/7061)) -- Fixed incorrect removal of `WORLD_SIZE` environment variable in DDP training when launching with torch distributed/torchelastic ([#6942](https://github.com/PyTorchLightning/pytorch-lightning/pull/6942)) -- Made the `Plugin.reduce` method more consistent across all Plugins to reflect a mean-reduction by default ([#6011](https://github.com/PyTorchLightning/pytorch-lightning/pull/6011)) -- Move lightning module to correct device type when using LightningDistributedWrapper ([#6070](https://github.com/PyTorchLightning/pytorch-lightning/pull/6070)) -- Do not print top-k verbose log with `ModelCheckpoint(monitor=None)` ([#6109](https://github.com/PyTorchLightning/pytorch-lightning/pull/6109)) -- Fixed `ModelCheckpoint(save_top_k=0, save_last=True)` not saving the `last` checkpoint ([#6136](https://github.com/PyTorchLightning/pytorch-lightning/pull/6136)) -- Fixed `.teardown(stage='fit')` and `.on_fit_{start,end}()` getting called during `trainer.test` ([#6386](https://github.com/PyTorchLightning/pytorch-lightning/pull/6386)) -- Fixed LightningModule `all_gather` on cpu tensors ([#6416](https://github.com/PyTorchLightning/pytorch-lightning/pull/6416)) -- Fixed torch distributed not available in setup hook for DDP ([#6506](https://github.com/PyTorchLightning/pytorch-lightning/pull/6506)) -- Fixed `trainer.tuner.{lr_find,scale_batch_size}` not setting the `Trainer` state properly ([#7258](https://github.com/PyTorchLightning/pytorch-lightning/pull/7258)) -- Fixed bug where the learning rate schedulers did not follow the optimizer frequencies ([#4868](https://github.com/PyTorchLightning/pytorch-lightning/pull/4868)) -- Fixed pickle error checker to now check for `pickle.PickleError` to catch all pickle errors ([#6917](https://github.com/PyTorchLightning/pytorch-lightning/pull/6917)) -- Fixed a bug where the outputs object passed to `LightningModule.training_epoch_end` was different from the object passed to the `on_train_end_epoch` hook ([#6969](https://github.com/PyTorchLightning/pytorch-lightning/pull/6969)) -- Fixed a bug where the outputs passed to `train_batch_end` would be lists even when using a single optimizer and no truncated backprop through time steps ([#6969](https://github.com/PyTorchLightning/pytorch-lightning/pull/6969)) -- Fixed bug for trainer error handling which would cause hang for distributed training ([#6864](https://github.com/PyTorchLightning/pytorch-lightning/pull/6864)) -- Fixed `self.device` not returning the 
correct device in replicas of data-parallel ([#6414](https://github.com/PyTorchLightning/pytorch-lightning/pull/6414)) -- Fixed `lr_find` trying beyond `num_training` steps and suggesting a too high learning rate ([#7076](https://github.com/PyTorchLightning/pytorch-lightning/pull/7076)) -- Fixed logger creating incorrect version folder in DDP with repeated `Trainer.fit` calls ([#7077](https://github.com/PyTorchLightning/pytorch-lightning/pull/7077)) -- Fixed metric objects passed directly to `self.log` not being reset correctly ([#7055](https://github.com/PyTorchLightning/pytorch-lightning/pull/7055)) -- Fixed `CombinedLoader` in distributed settings for validation / testing ([#7102](https://github.com/PyTorchLightning/pytorch-lightning/pull/7102)) -- Fixed the save_dir in `WandbLogger` when the run was initiated externally ([#7106](https://github.com/PyTorchLightning/pytorch-lightning/pull/7106)) -- Fixed `num_sanity_val_steps` affecting reproducibility of training data shuffling ([#7014](https://github.com/PyTorchLightning/pytorch-lightning/pull/7014)) -- Fixed resetting device after `fitting/evaluating/predicting` ([#7188](https://github.com/PyTorchLightning/pytorch-lightning/pull/7188)) -- Fixed bug where `trainer.tuner.scale_batch_size(max_trials=0)` would not return the correct batch size result ([#7262](https://github.com/PyTorchLightning/pytorch-lightning/pull/7262)) -- Fixed metrics not being properly logged with `precision=16` and `manual_optimization` ([#7228](https://github.com/PyTorchLightning/pytorch-lightning/pull/7228)) -- Fixed `BaseFinetuning` properly reloading `optimizer_states` when using `resume_from_checkpoint` ([#6891](https://github.com/PyTorchLightning/pytorch-lightning/pull/6891)) -- Fixed `parameters_to_ignore` not properly set to DDPWrapper ([#7239](https://github.com/PyTorchLightning/pytorch-lightning/pull/7239)) -- Fixed parsing of `fast_dev_run=True` with the built-in `ArgumentParser` ([#7240](https://github.com/PyTorchLightning/pytorch-lightning/pull/7240)) -- Fixed handling an `IterableDataset` that fails to produce a batch at the beginning of an epoch ([#7294](https://github.com/PyTorchLightning/pytorch-lightning/pull/7294)) -- Fixed `LightningModule.save_hyperparameters()` when attempting to save an empty container ([#7268](https://github.com/PyTorchLightning/pytorch-lightning/pull/7268)) -- Fixed `apex` not properly instantiated when running with `ddp` ([#7274](https://github.com/PyTorchLightning/pytorch-lightning/pull/7274)) -- Fixed optimizer `state` not moved to `GPU` ([#7277](https://github.com/PyTorchLightning/pytorch-lightning/pull/7277)) -- Fixed custom init args for `WandbLogger` ([#6989](https://github.com/PyTorchLightning/pytorch-lightning/pull/6989)) -- Fixed a bug where an error would be raised if the train dataloader sometimes produced None for a batch ([#7342](https://github.com/PyTorchLightning/pytorch-lightning/pull/7342)) +- Removed deprecated `ModelCheckpoint` arguments `prefix`, `mode="auto"` ([#6162](https://github.com/Lightning-AI/lightning/pull/6162)) +- Removed `mode='auto'` from `EarlyStopping` ([#6167](https://github.com/Lightning-AI/lightning/pull/6167)) +- Removed `epoch` and `step` arguments from `ModelCheckpoint.format_checkpoint_name()`, these are now included in the `metrics` argument ([#7344](https://github.com/Lightning-AI/lightning/pull/7344)) +- Removed legacy references for magic keys in the `Result` object ([#6016](https://github.com/Lightning-AI/lightning/pull/6016)) +- Removed deprecated `LightningModule` `hparams` 
setter ([#6207](https://github.com/Lightning-AI/lightning/pull/6207)) +- Removed legacy code to log or include metrics in the progress bar by returning them in a dict with the `"log"/"progress_bar"` magic keys. Use `self.log` instead ([#6734](https://github.com/Lightning-AI/lightning/pull/6734)) +- Removed `trainer.fit()` return value of `1`. It has no return now ([#7237](https://github.com/Lightning-AI/lightning/pull/7237)) +- Removed `logger_connector` legacy code ([#6733](https://github.com/Lightning-AI/lightning/pull/6733)) +- Removed unused mixin attributes ([#6487](https://github.com/Lightning-AI/lightning/pull/6487)) + +### Fixed + +- Fixed NaN errors in progress bars when training with iterable datasets with no length defined ([#7306](https://github.com/Lightning-AI/lightning/pull/7306)) +- Fixed attaching train and validation dataloaders when `reload_dataloaders_every_epoch=True` and `num_sanity_val_steps=0` ([#7207](https://github.com/Lightning-AI/lightning/pull/7207)) +- Added a barrier in the accelerator `teardown` to synchronize processes before execution finishes ([#6814](https://github.com/Lightning-AI/lightning/pull/6814)) +- Fixed multi-node DDP sub-process launch by using `local_rank` instead of `global_rank` for main process assertion ([#7061](https://github.com/Lightning-AI/lightning/pull/7061)) +- Fixed incorrect removal of `WORLD_SIZE` environment variable in DDP training when launching with torch distributed/torchelastic ([#6942](https://github.com/Lightning-AI/lightning/pull/6942)) +- Made the `Plugin.reduce` method more consistent across all Plugins to reflect a mean-reduction by default ([#6011](https://github.com/Lightning-AI/lightning/pull/6011)) +- Move lightning module to correct device type when using LightningDistributedWrapper ([#6070](https://github.com/Lightning-AI/lightning/pull/6070)) +- Do not print top-k verbose log with `ModelCheckpoint(monitor=None)` ([#6109](https://github.com/Lightning-AI/lightning/pull/6109)) +- Fixed `ModelCheckpoint(save_top_k=0, save_last=True)` not saving the `last` checkpoint ([#6136](https://github.com/Lightning-AI/lightning/pull/6136)) +- Fixed `.teardown(stage='fit')` and `.on_fit_{start,end}()` getting called during `trainer.test` ([#6386](https://github.com/Lightning-AI/lightning/pull/6386)) +- Fixed LightningModule `all_gather` on cpu tensors ([#6416](https://github.com/Lightning-AI/lightning/pull/6416)) +- Fixed torch distributed not available in setup hook for DDP ([#6506](https://github.com/Lightning-AI/lightning/pull/6506)) +- Fixed `trainer.tuner.{lr_find,scale_batch_size}` not setting the `Trainer` state properly ([#7258](https://github.com/Lightning-AI/lightning/pull/7258)) +- Fixed bug where the learning rate schedulers did not follow the optimizer frequencies ([#4868](https://github.com/Lightning-AI/lightning/pull/4868)) +- Fixed pickle error checker to now check for `pickle.PickleError` to catch all pickle errors ([#6917](https://github.com/Lightning-AI/lightning/pull/6917)) +- Fixed a bug where the outputs object passed to `LightningModule.training_epoch_end` was different from the object passed to the `on_train_end_epoch` hook ([#6969](https://github.com/Lightning-AI/lightning/pull/6969)) +- Fixed a bug where the outputs passed to `train_batch_end` would be lists even when using a single optimizer and no truncated backprop through time steps ([#6969](https://github.com/Lightning-AI/lightning/pull/6969)) +- Fixed bug for trainer error handling which would cause hang for distributed training 
([#6864](https://github.com/Lightning-AI/lightning/pull/6864)) +- Fixed `self.device` not returning the correct device in replicas of data-parallel ([#6414](https://github.com/Lightning-AI/lightning/pull/6414)) +- Fixed `lr_find` trying beyond `num_training` steps and suggesting a too high learning rate ([#7076](https://github.com/Lightning-AI/lightning/pull/7076)) +- Fixed logger creating incorrect version folder in DDP with repeated `Trainer.fit` calls ([#7077](https://github.com/Lightning-AI/lightning/pull/7077)) +- Fixed metric objects passed directly to `self.log` not being reset correctly ([#7055](https://github.com/Lightning-AI/lightning/pull/7055)) +- Fixed `CombinedLoader` in distributed settings for validation / testing ([#7102](https://github.com/Lightning-AI/lightning/pull/7102)) +- Fixed the save_dir in `WandbLogger` when the run was initiated externally ([#7106](https://github.com/Lightning-AI/lightning/pull/7106)) +- Fixed `num_sanity_val_steps` affecting reproducibility of training data shuffling ([#7014](https://github.com/Lightning-AI/lightning/pull/7014)) +- Fixed resetting device after `fitting/evaluating/predicting` ([#7188](https://github.com/Lightning-AI/lightning/pull/7188)) +- Fixed bug where `trainer.tuner.scale_batch_size(max_trials=0)` would not return the correct batch size result ([#7262](https://github.com/Lightning-AI/lightning/pull/7262)) +- Fixed metrics not being properly logged with `precision=16` and `manual_optimization` ([#7228](https://github.com/Lightning-AI/lightning/pull/7228)) +- Fixed `BaseFinetuning` properly reloading `optimizer_states` when using `resume_from_checkpoint` ([#6891](https://github.com/Lightning-AI/lightning/pull/6891)) +- Fixed `parameters_to_ignore` not properly set to DDPWrapper ([#7239](https://github.com/Lightning-AI/lightning/pull/7239)) +- Fixed parsing of `fast_dev_run=True` with the built-in `ArgumentParser` ([#7240](https://github.com/Lightning-AI/lightning/pull/7240)) +- Fixed handling an `IterableDataset` that fails to produce a batch at the beginning of an epoch ([#7294](https://github.com/Lightning-AI/lightning/pull/7294)) +- Fixed `LightningModule.save_hyperparameters()` when attempting to save an empty container ([#7268](https://github.com/Lightning-AI/lightning/pull/7268)) +- Fixed `apex` not properly instantiated when running with `ddp` ([#7274](https://github.com/Lightning-AI/lightning/pull/7274)) +- Fixed optimizer `state` not moved to `GPU` ([#7277](https://github.com/Lightning-AI/lightning/pull/7277)) +- Fixed custom init args for `WandbLogger` ([#6989](https://github.com/Lightning-AI/lightning/pull/6989)) +- Fixed a bug where an error would be raised if the train dataloader sometimes produced None for a batch ([#7342](https://github.com/Lightning-AI/lightning/pull/7342)) - Fixed examples ( - [#6600](https://github.com/PyTorchLightning/pytorch-lightning/pull/6600), - [#6638](https://github.com/PyTorchLightning/pytorch-lightning/pull/6638), - [#7096](https://github.com/PyTorchLightning/pytorch-lightning/pull/7096), - [#7246](https://github.com/PyTorchLightning/pytorch-lightning/pull/7246), - [#6357](https://github.com/PyTorchLightning/pytorch-lightning/pull/6357), - [#6476](https://github.com/PyTorchLightning/pytorch-lightning/pull/6476), - [#6294](https://github.com/PyTorchLightning/pytorch-lightning/pull/6294), - [#6373](https://github.com/PyTorchLightning/pytorch-lightning/pull/6373), - [#6088](https://github.com/PyTorchLightning/pytorch-lightning/pull/6088), - 
[#7398](https://github.com/PyTorchLightning/pytorch-lightning/pull/7398) + [#6600](https://github.com/Lightning-AI/lightning/pull/6600), + [#6638](https://github.com/Lightning-AI/lightning/pull/6638), + [#7096](https://github.com/Lightning-AI/lightning/pull/7096), + [#7246](https://github.com/Lightning-AI/lightning/pull/7246), + [#6357](https://github.com/Lightning-AI/lightning/pull/6357), + [#6476](https://github.com/Lightning-AI/lightning/pull/6476), + [#6294](https://github.com/Lightning-AI/lightning/pull/6294), + [#6373](https://github.com/Lightning-AI/lightning/pull/6373), + [#6088](https://github.com/Lightning-AI/lightning/pull/6088), + [#7398](https://github.com/Lightning-AI/lightning/pull/7398) ) -- Resolved schedule step bug for PyTorch Profiler ([#6674](https://github.com/PyTorchLightning/pytorch-lightning/pull/6674), - [#6681](https://github.com/PyTorchLightning/pytorch-lightning/pull/6681)) -- Updated logic for checking TPUs availability ([#6767](https://github.com/PyTorchLightning/pytorch-lightning/pull/6767)) -- Resolve TPU miss rendezvous ([#6781](https://github.com/PyTorchLightning/pytorch-lightning/pull/6781)) -- Fixed auto-scaling mode when calling tune method on trainer ([#7321](https://github.com/PyTorchLightning/pytorch-lightning/pull/7321)) -- Fixed finetuning complex models correctly unfreezes ([#6880](https://github.com/PyTorchLightning/pytorch-lightning/pull/6880)) -- Ensure we set the eval/train flag correctly on accelerator model ([#6877](https://github.com/PyTorchLightning/pytorch-lightning/pull/6877)) -- Set better defaults for `rank_zero_only.rank` when training is launched with SLURM and torchelastic ([#6802](https://github.com/PyTorchLightning/pytorch-lightning/pull/6802)) -- Fixed matching the number of outputs of backward with forward for AllGatherGrad ([#6625](https://github.com/PyTorchLightning/pytorch-lightning/pull/6625)) -- Fixed the `gradient_clip_algorithm` has no effect ([#6928](https://github.com/PyTorchLightning/pytorch-lightning/pull/6928)) -- Fixed CUDA OOM detection and handling ([#6934](https://github.com/PyTorchLightning/pytorch-lightning/pull/6934)) -- Fixed `unfreeze_and_add_param_group` expects `modules` rather than `module` ([#6822](https://github.com/PyTorchLightning/pytorch-lightning/pull/6822)) -- Fixed DPP + SyncBN when move on device ([#6838](https://github.com/PyTorchLightning/pytorch-lightning/pull/6838)) -- Fixed missing arguments in `lr_find` call ([#6784](https://github.com/PyTorchLightning/pytorch-lightning/pull/6784)) -- Fixed `set_default_tensor_type` to `torch.DoubleTensor` with precision=64 ([#7108](https://github.com/PyTorchLightning/pytorch-lightning/pull/7108)) -- Fixed `NeptuneLogger.log_text(step=None)` ([#7194](https://github.com/PyTorchLightning/pytorch-lightning/pull/7194)) -- Fixed importing torchtext batch ([#6365](https://github.com/PyTorchLightning/pytorch-lightning/pull/6365), - [#6323](https://github.com/PyTorchLightning/pytorch-lightning/pull/6323), - [#6211](https://github.com/PyTorchLightning/pytorch-lightning/pull/6211)) +- Resolved schedule step bug for PyTorch Profiler ([#6674](https://github.com/Lightning-AI/lightning/pull/6674), + [#6681](https://github.com/Lightning-AI/lightning/pull/6681)) +- Updated logic for checking TPUs availability ([#6767](https://github.com/Lightning-AI/lightning/pull/6767)) +- Resolve TPU miss rendezvous ([#6781](https://github.com/Lightning-AI/lightning/pull/6781)) +- Fixed auto-scaling mode when calling tune method on trainer 
([#7321](https://github.com/Lightning-AI/lightning/pull/7321)) +- Fixed finetuning complex models correctly unfreezes ([#6880](https://github.com/Lightning-AI/lightning/pull/6880)) +- Ensure we set the eval/train flag correctly on accelerator model ([#6877](https://github.com/Lightning-AI/lightning/pull/6877)) +- Set better defaults for `rank_zero_only.rank` when training is launched with SLURM and torchelastic ([#6802](https://github.com/Lightning-AI/lightning/pull/6802)) +- Fixed matching the number of outputs of backward with forward for AllGatherGrad ([#6625](https://github.com/Lightning-AI/lightning/pull/6625)) +- Fixed the `gradient_clip_algorithm` has no effect ([#6928](https://github.com/Lightning-AI/lightning/pull/6928)) +- Fixed CUDA OOM detection and handling ([#6934](https://github.com/Lightning-AI/lightning/pull/6934)) +- Fixed `unfreeze_and_add_param_group` expects `modules` rather than `module` ([#6822](https://github.com/Lightning-AI/lightning/pull/6822)) +- Fixed DPP + SyncBN when move on device ([#6838](https://github.com/Lightning-AI/lightning/pull/6838)) +- Fixed missing arguments in `lr_find` call ([#6784](https://github.com/Lightning-AI/lightning/pull/6784)) +- Fixed `set_default_tensor_type` to `torch.DoubleTensor` with precision=64 ([#7108](https://github.com/Lightning-AI/lightning/pull/7108)) +- Fixed `NeptuneLogger.log_text(step=None)` ([#7194](https://github.com/Lightning-AI/lightning/pull/7194)) +- Fixed importing torchtext batch ([#6365](https://github.com/Lightning-AI/lightning/pull/6365), + [#6323](https://github.com/Lightning-AI/lightning/pull/6323), + [#6211](https://github.com/Lightning-AI/lightning/pull/6211)) ## [1.2.9] - 2021-04-20 ### Fixed -- Fixed the order to call for world ranks & the `root_device` property in `TPUSpawnPlugin` ([#7074](https://github.com/PyTorchLightning/pytorch-lightning/pull/7074)) -- Fixed multi-gpu join for Horovod ([#6954](https://github.com/PyTorchLightning/pytorch-lightning/pull/6954)) -- Fixed parsing for pre-release package versions ([#6999](https://github.com/PyTorchLightning/pytorch-lightning/pull/6999)) +- Fixed the order to call for world ranks & the `root_device` property in `TPUSpawnPlugin` ([#7074](https://github.com/Lightning-AI/lightning/pull/7074)) +- Fixed multi-gpu join for Horovod ([#6954](https://github.com/Lightning-AI/lightning/pull/6954)) +- Fixed parsing for pre-release package versions ([#6999](https://github.com/Lightning-AI/lightning/pull/6999)) ## [1.2.8] - 2021-04-14 ### Added -- Added TPUSpawn + IterableDataset error message ([#6875](https://github.com/PyTorchLightning/pytorch-lightning/pull/6875)) +- Added TPUSpawn + IterableDataset error message ([#6875](https://github.com/Lightning-AI/lightning/pull/6875)) ### Fixed -- Fixed process rank not being available right away after `Trainer` instantiation ([#6941](https://github.com/PyTorchLightning/pytorch-lightning/pull/6941)) -- Fixed `sync_dist` for tpus ([#6950](https://github.com/PyTorchLightning/pytorch-lightning/pull/6950)) -- Fixed `AttributeError` for `require_backward_grad_sync` when running manual optimization with sharded plugin ([#6915](https://github.com/PyTorchLightning/pytorch-lightning/pull/6915)) -- Fixed `--gpus` default for parser returned by `Trainer.add_argparse_args` ([#6898](https://github.com/PyTorchLightning/pytorch-lightning/pull/6898)) -- Fixed TPU Spawn all gather ([#6896](https://github.com/PyTorchLightning/pytorch-lightning/pull/6896)) -- Fixed `EarlyStopping` logic when `min_epochs` or `min_steps` requirement is not met 
([#6705](https://github.com/PyTorchLightning/pytorch-lightning/pull/6705)) -- Fixed csv extension check ([#6436](https://github.com/PyTorchLightning/pytorch-lightning/pull/6436)) -- Fixed checkpoint issue when using Horovod distributed backend ([#6958](https://github.com/PyTorchLightning/pytorch-lightning/pull/6958)) -- Fixed tensorboard exception raising ([#6901](https://github.com/PyTorchLightning/pytorch-lightning/pull/6901)) -- Fixed setting the eval/train flag correctly on accelerator model ([#6983](https://github.com/PyTorchLightning/pytorch-lightning/pull/6983)) -- Fixed DDP_SPAWN compatibility with bug_report_model.py ([#6892](https://github.com/PyTorchLightning/pytorch-lightning/pull/6892)) -- Fixed bug where `BaseFinetuning.flatten_modules()` was duplicating leaf node parameters ([#6879](https://github.com/PyTorchLightning/pytorch-lightning/pull/6879)) +- Fixed process rank not being available right away after `Trainer` instantiation ([#6941](https://github.com/Lightning-AI/lightning/pull/6941)) +- Fixed `sync_dist` for tpus ([#6950](https://github.com/Lightning-AI/lightning/pull/6950)) +- Fixed `AttributeError` for `require_backward_grad_sync` when running manual optimization with sharded plugin ([#6915](https://github.com/Lightning-AI/lightning/pull/6915)) +- Fixed `--gpus` default for parser returned by `Trainer.add_argparse_args` ([#6898](https://github.com/Lightning-AI/lightning/pull/6898)) +- Fixed TPU Spawn all gather ([#6896](https://github.com/Lightning-AI/lightning/pull/6896)) +- Fixed `EarlyStopping` logic when `min_epochs` or `min_steps` requirement is not met ([#6705](https://github.com/Lightning-AI/lightning/pull/6705)) +- Fixed csv extension check ([#6436](https://github.com/Lightning-AI/lightning/pull/6436)) +- Fixed checkpoint issue when using Horovod distributed backend ([#6958](https://github.com/Lightning-AI/lightning/pull/6958)) +- Fixed tensorboard exception raising ([#6901](https://github.com/Lightning-AI/lightning/pull/6901)) +- Fixed setting the eval/train flag correctly on accelerator model ([#6983](https://github.com/Lightning-AI/lightning/pull/6983)) +- Fixed DDP_SPAWN compatibility with bug_report_model.py ([#6892](https://github.com/Lightning-AI/lightning/pull/6892)) +- Fixed bug where `BaseFinetuning.flatten_modules()` was duplicating leaf node parameters ([#6879](https://github.com/Lightning-AI/lightning/pull/6879)) - Set better defaults for `rank_zero_only.rank` when training is launched with SLURM and torchelastic: - * Support SLURM and torchelastic global rank environment variables ([#5715](https://github.com/PyTorchLightning/pytorch-lightning/pull/5715)) - * Remove hardcoding of local rank in accelerator connector ([#6878](https://github.com/PyTorchLightning/pytorch-lightning/pull/6878)) + * Support SLURM and torchelastic global rank environment variables ([#5715](https://github.com/Lightning-AI/lightning/pull/5715)) + * Remove hardcoding of local rank in accelerator connector ([#6878](https://github.com/Lightning-AI/lightning/pull/6878)) ## [1.2.7] - 2021-04-06 ### Fixed -- Fixed resolve a bug with omegaconf and xm.save ([#6741](https://github.com/PyTorchLightning/pytorch-lightning/pull/6741)) -- Fixed an issue with IterableDataset when __len__ is not defined ([#6828](https://github.com/PyTorchLightning/pytorch-lightning/pull/6828)) -- Sanitize None params during pruning ([#6836](https://github.com/PyTorchLightning/pytorch-lightning/pull/6836)) -- Enforce an epoch scheduler interval when using SWA 
([#6588](https://github.com/PyTorchLightning/pytorch-lightning/pull/6588)) -- Fixed TPU Colab hang issue, post training ([#6816](https://github.com/PyTorchLightning/pytorch-lightning/pull/6816)) -- Fixed a bug where `TensorBoardLogger` would give a warning and not log correctly to a symbolic link `save_dir` ([#6730](https://github.com/PyTorchLightning/pytorch-lightning/pull/6730)) -- Fixed bug where `predict` could not be used when `progress_bar_refresh_rate=0` ([#6884](https://github.com/PyTorchLightning/pytorch-lightning/pull/6884)) +- Fixed resolve a bug with omegaconf and xm.save ([#6741](https://github.com/Lightning-AI/lightning/pull/6741)) +- Fixed an issue with IterableDataset when __len__ is not defined ([#6828](https://github.com/Lightning-AI/lightning/pull/6828)) +- Sanitize None params during pruning ([#6836](https://github.com/Lightning-AI/lightning/pull/6836)) +- Enforce an epoch scheduler interval when using SWA ([#6588](https://github.com/Lightning-AI/lightning/pull/6588)) +- Fixed TPU Colab hang issue, post training ([#6816](https://github.com/Lightning-AI/lightning/pull/6816)) +- Fixed a bug where `TensorBoardLogger` would give a warning and not log correctly to a symbolic link `save_dir` ([#6730](https://github.com/Lightning-AI/lightning/pull/6730)) +- Fixed bug where `predict` could not be used when `progress_bar_refresh_rate=0` ([#6884](https://github.com/Lightning-AI/lightning/pull/6884)) ## [1.2.6] - 2021-03-30 ### Changed -- Changed the behavior of `on_epoch_start` to run at the beginning of validation & test epoch ([#6498](https://github.com/PyTorchLightning/pytorch-lightning/pull/6498)) +- Changed the behavior of `on_epoch_start` to run at the beginning of validation & test epoch ([#6498](https://github.com/Lightning-AI/lightning/pull/6498)) ### Removed -- Removed legacy code to include `step` dictionary returns in `callback_metrics`. Use `self.log_dict` instead. ([#6682](https://github.com/PyTorchLightning/pytorch-lightning/pull/6682)) +- Removed legacy code to include `step` dictionary returns in `callback_metrics`. Use `self.log_dict` instead. 
([#6682](https://github.com/Lightning-AI/lightning/pull/6682)) ### Fixed -- Fixed `DummyLogger.log_hyperparams` raising a `TypeError` when running with `fast_dev_run=True` ([#6398](https://github.com/PyTorchLightning/pytorch-lightning/pull/6398)) -- Fixed error on TPUs when there was no `ModelCheckpoint` ([#6654](https://github.com/PyTorchLightning/pytorch-lightning/pull/6654)) -- Fixed `trainer.test` freeze on TPUs ([#6654](https://github.com/PyTorchLightning/pytorch-lightning/pull/6654)) -- Fixed a bug where gradients were disabled after calling `Trainer.predict` ([#6657](https://github.com/PyTorchLightning/pytorch-lightning/pull/6657)) -- Fixed bug where no TPUs were detected in a TPU pod env ([#6719](https://github.com/PyTorchLightning/pytorch-lightning/pull/6719)) +- Fixed `DummyLogger.log_hyperparams` raising a `TypeError` when running with `fast_dev_run=True` ([#6398](https://github.com/Lightning-AI/lightning/pull/6398)) +- Fixed error on TPUs when there was no `ModelCheckpoint` ([#6654](https://github.com/Lightning-AI/lightning/pull/6654)) +- Fixed `trainer.test` freeze on TPUs ([#6654](https://github.com/Lightning-AI/lightning/pull/6654)) +- Fixed a bug where gradients were disabled after calling `Trainer.predict` ([#6657](https://github.com/Lightning-AI/lightning/pull/6657)) +- Fixed bug where no TPUs were detected in a TPU pod env ([#6719](https://github.com/Lightning-AI/lightning/pull/6719)) ## [1.2.5] - 2021-03-23 ### Changed -- Update Gradient Clipping for the TPU Accelerator ([#6576](https://github.com/PyTorchLightning/pytorch-lightning/pull/6576)) -- Refactored setup for typing friendly ([#6590](https://github.com/PyTorchLightning/pytorch-lightning/pull/6590)) +- Update Gradient Clipping for the TPU Accelerator ([#6576](https://github.com/Lightning-AI/lightning/pull/6576)) +- Refactored setup for typing friendly ([#6590](https://github.com/Lightning-AI/lightning/pull/6590)) ### Fixed -- Fixed a bug where `all_gather` would not work correctly with `tpu_cores=8` ([#6587](https://github.com/PyTorchLightning/pytorch-lightning/pull/6587)) -- Fixed comparing required versions ([#6434](https://github.com/PyTorchLightning/pytorch-lightning/pull/6434)) -- Fixed duplicate logs appearing in console when using the python logging module ([#6275](https://github.com/PyTorchLightning/pytorch-lightning/pull/6275)) -- Added Autocast in validation, test and predict modes for Native AMP ([#6565](https://github.com/PyTorchLightning/pytorch-lightning/pull/6565)) +- Fixed a bug where `all_gather` would not work correctly with `tpu_cores=8` ([#6587](https://github.com/Lightning-AI/lightning/pull/6587)) +- Fixed comparing required versions ([#6434](https://github.com/Lightning-AI/lightning/pull/6434)) +- Fixed duplicate logs appearing in console when using the python logging module ([#6275](https://github.com/Lightning-AI/lightning/pull/6275)) +- Added Autocast in validation, test and predict modes for Native AMP ([#6565](https://github.com/Lightning-AI/lightning/pull/6565)) ## [1.2.4] - 2021-03-16 ### Changed -- Changed the default of `find_unused_parameters` back to `True` in DDP and DDP Spawn ([#6438](https://github.com/PyTorchLightning/pytorch-lightning/pull/6438)) +- Changed the default of `find_unused_parameters` back to `True` in DDP and DDP Spawn ([#6438](https://github.com/Lightning-AI/lightning/pull/6438)) ### Fixed -- Expose DeepSpeed loss parameters to allow users to fix loss instability ([#6115](https://github.com/PyTorchLightning/pytorch-lightning/pull/6115)) -- Fixed DP reduction 
with collection ([#6324](https://github.com/PyTorchLightning/pytorch-lightning/pull/6324)) -- Fixed an issue where the tuner would not tune the learning rate if also tuning the batch size ([#4688](https://github.com/PyTorchLightning/pytorch-lightning/pull/4688)) -- Fixed broadcast to use PyTorch `broadcast_object_list` and add `reduce_decision` ([#6410](https://github.com/PyTorchLightning/pytorch-lightning/pull/6410)) -- Fixed logger creating directory structure too early in DDP ([#6380](https://github.com/PyTorchLightning/pytorch-lightning/pull/6380)) -- Fixed DeepSpeed additional memory use on rank 0 when default device not set early enough ([#6460](https://github.com/PyTorchLightning/pytorch-lightning/pull/6460)) -- Fixed an issue with `Tuner.scale_batch_size` not finding the batch size attribute in the datamodule ([#5968](https://github.com/PyTorchLightning/pytorch-lightning/pull/5968)) -- Fixed an exception in the layer summary when the model contains torch.jit scripted submodules ([#6511](https://github.com/PyTorchLightning/pytorch-lightning/pull/6511)) -- Fixed when Train loop config was run during `Trainer.predict` ([#6541](https://github.com/PyTorchLightning/pytorch-lightning/pull/6541)) +- Expose DeepSpeed loss parameters to allow users to fix loss instability ([#6115](https://github.com/Lightning-AI/lightning/pull/6115)) +- Fixed DP reduction with collection ([#6324](https://github.com/Lightning-AI/lightning/pull/6324)) +- Fixed an issue where the tuner would not tune the learning rate if also tuning the batch size ([#4688](https://github.com/Lightning-AI/lightning/pull/4688)) +- Fixed broadcast to use PyTorch `broadcast_object_list` and add `reduce_decision` ([#6410](https://github.com/Lightning-AI/lightning/pull/6410)) +- Fixed logger creating directory structure too early in DDP ([#6380](https://github.com/Lightning-AI/lightning/pull/6380)) +- Fixed DeepSpeed additional memory use on rank 0 when default device not set early enough ([#6460](https://github.com/Lightning-AI/lightning/pull/6460)) +- Fixed an issue with `Tuner.scale_batch_size` not finding the batch size attribute in the datamodule ([#5968](https://github.com/Lightning-AI/lightning/pull/5968)) +- Fixed an exception in the layer summary when the model contains torch.jit scripted submodules ([#6511](https://github.com/Lightning-AI/lightning/pull/6511)) +- Fixed when Train loop config was run during `Trainer.predict` ([#6541](https://github.com/Lightning-AI/lightning/pull/6541)) ## [1.2.3] - 2021-03-09 ### Fixed -- Fixed `ModelPruning(make_pruning_permanent=True)` pruning buffers getting removed when saved during training ([#6073](https://github.com/PyTorchLightning/pytorch-lightning/pull/6073)) -- Fixed when `_stable_1d_sort` to work when `n >= N` ([#6177](https://github.com/PyTorchLightning/pytorch-lightning/pull/6177)) -- Fixed `AttributeError` when `logger=None` on TPU ([#6221](https://github.com/PyTorchLightning/pytorch-lightning/pull/6221)) -- Fixed PyTorch Profiler with `emit_nvtx` ([#6260](https://github.com/PyTorchLightning/pytorch-lightning/pull/6260)) -- Fixed `trainer.test` from `best_path` hangs after calling `trainer.fit` ([#6272](https://github.com/PyTorchLightning/pytorch-lightning/pull/6272)) -- Fixed `SingleTPU` calling `all_gather` ([#6296](https://github.com/PyTorchLightning/pytorch-lightning/pull/6296)) -- Ensure we check DeepSpeed/Sharded in multi-node DDP ([#6297](https://github.com/PyTorchLightning/pytorch-lightning/pull/6297) -- Check `LightningOptimizer` doesn't delete optimizer hooks 
([#6305](https://github.com/PyTorchLightning/pytorch-lightning/pull/6305) -- Resolve memory leak for evaluation ([#6326](https://github.com/PyTorchLightning/pytorch-lightning/pull/6326) -- Ensure that clip gradients is only called if the value is greater than 0 ([#6330](https://github.com/PyTorchLightning/pytorch-lightning/pull/6330) -- Fixed `Trainer` not resetting `lightning_optimizers` when calling `Trainer.fit()` multiple times ([#6372](https://github.com/PyTorchLightning/pytorch-lightning/pull/6372)) +- Fixed `ModelPruning(make_pruning_permanent=True)` pruning buffers getting removed when saved during training ([#6073](https://github.com/Lightning-AI/lightning/pull/6073)) +- Fixed when `_stable_1d_sort` to work when `n >= N` ([#6177](https://github.com/Lightning-AI/lightning/pull/6177)) +- Fixed `AttributeError` when `logger=None` on TPU ([#6221](https://github.com/Lightning-AI/lightning/pull/6221)) +- Fixed PyTorch Profiler with `emit_nvtx` ([#6260](https://github.com/Lightning-AI/lightning/pull/6260)) +- Fixed `trainer.test` from `best_path` hangs after calling `trainer.fit` ([#6272](https://github.com/Lightning-AI/lightning/pull/6272)) +- Fixed `SingleTPU` calling `all_gather` ([#6296](https://github.com/Lightning-AI/lightning/pull/6296)) +- Ensure we check DeepSpeed/Sharded in multi-node DDP ([#6297](https://github.com/Lightning-AI/lightning/pull/6297) +- Check `LightningOptimizer` doesn't delete optimizer hooks ([#6305](https://github.com/Lightning-AI/lightning/pull/6305) +- Resolve memory leak for evaluation ([#6326](https://github.com/Lightning-AI/lightning/pull/6326) +- Ensure that clip gradients is only called if the value is greater than 0 ([#6330](https://github.com/Lightning-AI/lightning/pull/6330) +- Fixed `Trainer` not resetting `lightning_optimizers` when calling `Trainer.fit()` multiple times ([#6372](https://github.com/Lightning-AI/lightning/pull/6372)) ## [1.2.2] - 2021-03-02 ### Added -- Added `checkpoint` parameter to callback's `on_save_checkpoint` hook ([#6072](https://github.com/PyTorchLightning/pytorch-lightning/pull/6072)) +- Added `checkpoint` parameter to callback's `on_save_checkpoint` hook ([#6072](https://github.com/Lightning-AI/lightning/pull/6072)) ### Changed -- Changed the order of `backward`, `step`, `zero_grad` to `zero_grad`, `backward`, `step` ([#6147](https://github.com/PyTorchLightning/pytorch-lightning/pull/6147)) -- Changed default for DeepSpeed CPU Offload to False, due to prohibitively slow speeds at smaller scale ([#6262](https://github.com/PyTorchLightning/pytorch-lightning/pull/6262)) +- Changed the order of `backward`, `step`, `zero_grad` to `zero_grad`, `backward`, `step` ([#6147](https://github.com/Lightning-AI/lightning/pull/6147)) +- Changed default for DeepSpeed CPU Offload to False, due to prohibitively slow speeds at smaller scale ([#6262](https://github.com/Lightning-AI/lightning/pull/6262)) ### Fixed -- Fixed epoch level schedulers not being called when `val_check_interval < 1.0` ([#6075](https://github.com/PyTorchLightning/pytorch-lightning/pull/6075)) -- Fixed multiple early stopping callbacks ([#6197](https://github.com/PyTorchLightning/pytorch-lightning/pull/6197)) -- Fixed incorrect usage of `detach()`, `cpu()`, `to()` ([#6216](https://github.com/PyTorchLightning/pytorch-lightning/pull/6216)) -- Fixed LBFGS optimizer support which didn't converge in automatic optimization ([#6147](https://github.com/PyTorchLightning/pytorch-lightning/pull/6147)) -- Prevent `WandbLogger` from dropping values 
([#5931](https://github.com/PyTorchLightning/pytorch-lightning/pull/5931)) -- Fixed error thrown when using valid distributed mode in multi node ([#6297](https://github.com/PyTorchLightning/pytorch-lightning/pull/6297) +- Fixed epoch level schedulers not being called when `val_check_interval < 1.0` ([#6075](https://github.com/Lightning-AI/lightning/pull/6075)) +- Fixed multiple early stopping callbacks ([#6197](https://github.com/Lightning-AI/lightning/pull/6197)) +- Fixed incorrect usage of `detach()`, `cpu()`, `to()` ([#6216](https://github.com/Lightning-AI/lightning/pull/6216)) +- Fixed LBFGS optimizer support which didn't converge in automatic optimization ([#6147](https://github.com/Lightning-AI/lightning/pull/6147)) +- Prevent `WandbLogger` from dropping values ([#5931](https://github.com/Lightning-AI/lightning/pull/5931)) +- Fixed error thrown when using valid distributed mode in multi node ([#6297](https://github.com/Lightning-AI/lightning/pull/6297) ## [1.2.1] - 2021-02-23 ### Fixed -- Fixed incorrect yield logic for the amp autocast context manager ([#6080](https://github.com/PyTorchLightning/pytorch-lightning/pull/6080)) -- Fixed priority of plugin/accelerator when setting distributed mode ([#6089](https://github.com/PyTorchLightning/pytorch-lightning/pull/6089)) -- Fixed error message for AMP + CPU incompatibility ([#6107](https://github.com/PyTorchLightning/pytorch-lightning/pull/6107)) -- Disabled batch transfer in DP mode ([#6093](https://github.com/PyTorchLightning/pytorch-lightning/pull/6093)) +- Fixed incorrect yield logic for the amp autocast context manager ([#6080](https://github.com/Lightning-AI/lightning/pull/6080)) +- Fixed priority of plugin/accelerator when setting distributed mode ([#6089](https://github.com/Lightning-AI/lightning/pull/6089)) +- Fixed error message for AMP + CPU incompatibility ([#6107](https://github.com/Lightning-AI/lightning/pull/6107)) +- Disabled batch transfer in DP mode ([#6093](https://github.com/Lightning-AI/lightning/pull/6093)) ## [1.2.0] - 2021-02-18 ### Added -- Added `DataType`, `AverageMethod` and `MDMCAverageMethod` enum in metrics ([#5657](https://github.com/PyTorchLightning/pytorch-lightning/pull/5689)) -- Added support for summarized model total params size in megabytes ([#5590](https://github.com/PyTorchLightning/pytorch-lightning/pull/5590)) -- Added support for multiple train loaders ([#1959](https://github.com/PyTorchLightning/pytorch-lightning/pull/1959)) -- Added `Accuracy` metric now generalizes to Top-k accuracy for (multi-dimensional) multi-class inputs using the `top_k` parameter ([#4838](https://github.com/PyTorchLightning/pytorch-lightning/pull/4838)) -- Added `Accuracy` metric now enables the computation of subset accuracy for multi-label or multi-dimensional multi-class inputs with the `subset_accuracy` parameter ([#4838](https://github.com/PyTorchLightning/pytorch-lightning/pull/4838)) -- Added `HammingDistance` metric to compute the hamming distance (loss) ([#4838](https://github.com/PyTorchLightning/pytorch-lightning/pull/4838)) -- Added `max_fpr` parameter to `auroc` metric for computing partial auroc metric ([#3790](https://github.com/PyTorchLightning/pytorch-lightning/pull/3790)) -- Added `StatScores` metric to compute the number of true positives, false positives, true negatives and false negatives ([#4839](https://github.com/PyTorchLightning/pytorch-lightning/pull/4839)) -- Added `R2Score` metric ([#5241](https://github.com/PyTorchLightning/pytorch-lightning/pull/5241)) -- Added `LambdaCallback` 
([#5347](https://github.com/PyTorchLightning/pytorch-lightning/pull/5347)) -- Added `BackboneLambdaFinetuningCallback` ([#5377](https://github.com/PyTorchLightning/pytorch-lightning/pull/5377)) -- Accelerator `all_gather` supports collection ([#5221](https://github.com/PyTorchLightning/pytorch-lightning/pull/5221)) -- Added `image_gradients` functional metric to compute the image gradients of a given input image. ([#5056](https://github.com/PyTorchLightning/pytorch-lightning/pull/5056)) -- Added `MetricCollection` ([#4318](https://github.com/PyTorchLightning/pytorch-lightning/pull/4318)) -- Added `.clone()` method to metrics ([#4318](https://github.com/PyTorchLightning/pytorch-lightning/pull/4318)) -- Added `IoU` class interface ([#4704](https://github.com/PyTorchLightning/pytorch-lightning/pull/4704)) +- Added `DataType`, `AverageMethod` and `MDMCAverageMethod` enum in metrics ([#5657](https://github.com/Lightning-AI/lightning/pull/5689)) +- Added support for summarized model total params size in megabytes ([#5590](https://github.com/Lightning-AI/lightning/pull/5590)) +- Added support for multiple train loaders ([#1959](https://github.com/Lightning-AI/lightning/pull/1959)) +- Added `Accuracy` metric now generalizes to Top-k accuracy for (multi-dimensional) multi-class inputs using the `top_k` parameter ([#4838](https://github.com/Lightning-AI/lightning/pull/4838)) +- Added `Accuracy` metric now enables the computation of subset accuracy for multi-label or multi-dimensional multi-class inputs with the `subset_accuracy` parameter ([#4838](https://github.com/Lightning-AI/lightning/pull/4838)) +- Added `HammingDistance` metric to compute the hamming distance (loss) ([#4838](https://github.com/Lightning-AI/lightning/pull/4838)) +- Added `max_fpr` parameter to `auroc` metric for computing partial auroc metric ([#3790](https://github.com/Lightning-AI/lightning/pull/3790)) +- Added `StatScores` metric to compute the number of true positives, false positives, true negatives and false negatives ([#4839](https://github.com/Lightning-AI/lightning/pull/4839)) +- Added `R2Score` metric ([#5241](https://github.com/Lightning-AI/lightning/pull/5241)) +- Added `LambdaCallback` ([#5347](https://github.com/Lightning-AI/lightning/pull/5347)) +- Added `BackboneLambdaFinetuningCallback` ([#5377](https://github.com/Lightning-AI/lightning/pull/5377)) +- Accelerator `all_gather` supports collection ([#5221](https://github.com/Lightning-AI/lightning/pull/5221)) +- Added `image_gradients` functional metric to compute the image gradients of a given input image. 
([#5056](https://github.com/Lightning-AI/lightning/pull/5056)) +- Added `MetricCollection` ([#4318](https://github.com/Lightning-AI/lightning/pull/4318)) +- Added `.clone()` method to metrics ([#4318](https://github.com/Lightning-AI/lightning/pull/4318)) +- Added `IoU` class interface ([#4704](https://github.com/Lightning-AI/lightning/pull/4704)) - Support to tie weights after moving model to TPU via `on_post_move_to_device` hook -- Added missing val/test hooks in `LightningModule` ([#5467](https://github.com/PyTorchLightning/pytorch-lightning/pull/5467)) -- The `Recall` and `Precision` metrics (and their functional counterparts `recall` and `precision`) can now be generalized to Recall@K and Precision@K with the use of `top_k` parameter ([#4842](https://github.com/PyTorchLightning/pytorch-lightning/pull/4842)) -- Added `ModelPruning` Callback ([#5618](https://github.com/PyTorchLightning/pytorch-lightning/pull/5618), - [#5825](https://github.com/PyTorchLightning/pytorch-lightning/pull/5825), - [#6045](https://github.com/PyTorchLightning/pytorch-lightning/pull/6045)) -- Added `PyTorchProfiler` ([#5560](https://github.com/PyTorchLightning/pytorch-lightning/pull/5560)) -- Added compositional metrics ([#5464](https://github.com/PyTorchLightning/pytorch-lightning/pull/5464)) -- Added Trainer method `predict(...)` for high performance predictions ([#5579](https://github.com/PyTorchLightning/pytorch-lightning/pull/5579)) -- Added `on_before_batch_transfer` and `on_after_batch_transfer` data hooks ([#3671](https://github.com/PyTorchLightning/pytorch-lightning/pull/3671)) -- Added AUC/AUROC class interface ([#5479](https://github.com/PyTorchLightning/pytorch-lightning/pull/5479)) -- Added `PredictLoop` object ([#5752](https://github.com/PyTorchLightning/pytorch-lightning/pull/5752)) -- Added `QuantizationAwareTraining` callback ([#5706](https://github.com/PyTorchLightning/pytorch-lightning/pull/5706), - [#6040](https://github.com/PyTorchLightning/pytorch-lightning/pull/6040)) -- Added `LightningModule.configure_callbacks` to enable the definition of model-specific callbacks ([#5621](https://github.com/PyTorchLightning/pytorch-lightning/pull/5621)) -- Added `dim` to `PSNR` metric for mean-squared-error reduction ([#5957](https://github.com/PyTorchLightning/pytorch-lightning/pull/5957)) -- Added promxial policy optimization template to pl_examples ([#5394](https://github.com/PyTorchLightning/pytorch-lightning/pull/5394)) -- Added `log_graph` to `CometLogger` ([#5295](https://github.com/PyTorchLightning/pytorch-lightning/pull/5295)) -- Added possibility for nested loaders ([#5404](https://github.com/PyTorchLightning/pytorch-lightning/pull/5404)) -- Added `sync_step` to Wandb logger ([#5351](https://github.com/PyTorchLightning/pytorch-lightning/pull/5351)) -- Added `StochasticWeightAveraging` callback ([#5640](https://github.com/PyTorchLightning/pytorch-lightning/pull/5640)) -- Added `LightningDataModule.from_datasets(...)` ([#5133](https://github.com/PyTorchLightning/pytorch-lightning/pull/5133)) -- Added `PL_TORCH_DISTRIBUTED_BACKEND` env variable to select backend ([#5981](https://github.com/PyTorchLightning/pytorch-lightning/pull/5981)) -- Added `Trainer` flag to activate Stochastic Weight Averaging (SWA) `Trainer(stochastic_weight_avg=True)` ([#6038](https://github.com/PyTorchLightning/pytorch-lightning/pull/6038)) -- Added DeepSpeed integration ([#5954](https://github.com/PyTorchLightning/pytorch-lightning/pull/5954), - [#6042](https://github.com/PyTorchLightning/pytorch-lightning/pull/6042)) +- 
Added missing val/test hooks in `LightningModule` ([#5467](https://github.com/Lightning-AI/lightning/pull/5467)) +- The `Recall` and `Precision` metrics (and their functional counterparts `recall` and `precision`) can now be generalized to Recall@K and Precision@K with the use of `top_k` parameter ([#4842](https://github.com/Lightning-AI/lightning/pull/4842)) +- Added `ModelPruning` Callback ([#5618](https://github.com/Lightning-AI/lightning/pull/5618), + [#5825](https://github.com/Lightning-AI/lightning/pull/5825), + [#6045](https://github.com/Lightning-AI/lightning/pull/6045)) +- Added `PyTorchProfiler` ([#5560](https://github.com/Lightning-AI/lightning/pull/5560)) +- Added compositional metrics ([#5464](https://github.com/Lightning-AI/lightning/pull/5464)) +- Added Trainer method `predict(...)` for high performance predictions ([#5579](https://github.com/Lightning-AI/lightning/pull/5579)) +- Added `on_before_batch_transfer` and `on_after_batch_transfer` data hooks ([#3671](https://github.com/Lightning-AI/lightning/pull/3671)) +- Added AUC/AUROC class interface ([#5479](https://github.com/Lightning-AI/lightning/pull/5479)) +- Added `PredictLoop` object ([#5752](https://github.com/Lightning-AI/lightning/pull/5752)) +- Added `QuantizationAwareTraining` callback ([#5706](https://github.com/Lightning-AI/lightning/pull/5706), + [#6040](https://github.com/Lightning-AI/lightning/pull/6040)) +- Added `LightningModule.configure_callbacks` to enable the definition of model-specific callbacks ([#5621](https://github.com/Lightning-AI/lightning/pull/5621)) +- Added `dim` to `PSNR` metric for mean-squared-error reduction ([#5957](https://github.com/Lightning-AI/lightning/pull/5957)) +- Added promxial policy optimization template to pl_examples ([#5394](https://github.com/Lightning-AI/lightning/pull/5394)) +- Added `log_graph` to `CometLogger` ([#5295](https://github.com/Lightning-AI/lightning/pull/5295)) +- Added possibility for nested loaders ([#5404](https://github.com/Lightning-AI/lightning/pull/5404)) +- Added `sync_step` to Wandb logger ([#5351](https://github.com/Lightning-AI/lightning/pull/5351)) +- Added `StochasticWeightAveraging` callback ([#5640](https://github.com/Lightning-AI/lightning/pull/5640)) +- Added `LightningDataModule.from_datasets(...)` ([#5133](https://github.com/Lightning-AI/lightning/pull/5133)) +- Added `PL_TORCH_DISTRIBUTED_BACKEND` env variable to select backend ([#5981](https://github.com/Lightning-AI/lightning/pull/5981)) +- Added `Trainer` flag to activate Stochastic Weight Averaging (SWA) `Trainer(stochastic_weight_avg=True)` ([#6038](https://github.com/Lightning-AI/lightning/pull/6038)) +- Added DeepSpeed integration ([#5954](https://github.com/Lightning-AI/lightning/pull/5954), + [#6042](https://github.com/Lightning-AI/lightning/pull/6042)) ### Changed -- Changed `stat_scores` metric now calculates stat scores over all classes and gains new parameters, in line with the new `StatScores` metric ([#4839](https://github.com/PyTorchLightning/pytorch-lightning/pull/4839)) -- Changed `computer_vision_fine_tunning` example to use `BackboneLambdaFinetuningCallback` ([#5377](https://github.com/PyTorchLightning/pytorch-lightning/pull/5377)) -- Changed `automatic casting` for LoggerConnector `metrics` ([#5218](https://github.com/PyTorchLightning/pytorch-lightning/pull/5218)) -- Changed `iou` [func] to allow float input ([#4704](https://github.com/PyTorchLightning/pytorch-lightning/pull/4704)) -- Metric `compute()` method will no longer automatically call `reset()` 
([#5409](https://github.com/PyTorchLightning/pytorch-lightning/pull/5409)) -- Set PyTorch 1.4 as min requirements, also for testing and examples `torchvision>=0.5` and `torchtext>=0.5` ([#5418](https://github.com/PyTorchLightning/pytorch-lightning/pull/5418)) -- Changed `callbacks` argument in `Trainer` to allow `Callback` input ([#5446](https://github.com/PyTorchLightning/pytorch-lightning/pull/5446)) -- Changed the default of `find_unused_parameters` to `False` in DDP ([#5185](https://github.com/PyTorchLightning/pytorch-lightning/pull/5185)) -- Changed `ModelCheckpoint` version suffixes to start at 1 ([#5008](https://github.com/PyTorchLightning/pytorch-lightning/pull/5008)) -- Progress bar metrics tensors are now converted to float ([#5692](https://github.com/PyTorchLightning/pytorch-lightning/pull/5692)) -- Changed the default value for the `progress_bar_refresh_rate` Trainer argument in Google COLAB notebooks to 20 ([#5516](https://github.com/PyTorchLightning/pytorch-lightning/pull/5516)) -- Extended support for purely iteration-based training ([#5726](https://github.com/PyTorchLightning/pytorch-lightning/pull/5726)) -- Made `LightningModule.global_rank`, `LightningModule.local_rank` and `LightningModule.logger` read-only properties ([#5730](https://github.com/PyTorchLightning/pytorch-lightning/pull/5730)) -- Forced `ModelCheckpoint` callbacks to run after all others to guarantee all states are saved to the checkpoint ([#5731](https://github.com/PyTorchLightning/pytorch-lightning/pull/5731)) +- Changed `stat_scores` metric now calculates stat scores over all classes and gains new parameters, in line with the new `StatScores` metric ([#4839](https://github.com/Lightning-AI/lightning/pull/4839)) +- Changed `computer_vision_fine_tunning` example to use `BackboneLambdaFinetuningCallback` ([#5377](https://github.com/Lightning-AI/lightning/pull/5377)) +- Changed `automatic casting` for LoggerConnector `metrics` ([#5218](https://github.com/Lightning-AI/lightning/pull/5218)) +- Changed `iou` [func] to allow float input ([#4704](https://github.com/Lightning-AI/lightning/pull/4704)) +- Metric `compute()` method will no longer automatically call `reset()` ([#5409](https://github.com/Lightning-AI/lightning/pull/5409)) +- Set PyTorch 1.4 as min requirements, also for testing and examples `torchvision>=0.5` and `torchtext>=0.5` ([#5418](https://github.com/Lightning-AI/lightning/pull/5418)) +- Changed `callbacks` argument in `Trainer` to allow `Callback` input ([#5446](https://github.com/Lightning-AI/lightning/pull/5446)) +- Changed the default of `find_unused_parameters` to `False` in DDP ([#5185](https://github.com/Lightning-AI/lightning/pull/5185)) +- Changed `ModelCheckpoint` version suffixes to start at 1 ([#5008](https://github.com/Lightning-AI/lightning/pull/5008)) +- Progress bar metrics tensors are now converted to float ([#5692](https://github.com/Lightning-AI/lightning/pull/5692)) +- Changed the default value for the `progress_bar_refresh_rate` Trainer argument in Google COLAB notebooks to 20 ([#5516](https://github.com/Lightning-AI/lightning/pull/5516)) +- Extended support for purely iteration-based training ([#5726](https://github.com/Lightning-AI/lightning/pull/5726)) +- Made `LightningModule.global_rank`, `LightningModule.local_rank` and `LightningModule.logger` read-only properties ([#5730](https://github.com/Lightning-AI/lightning/pull/5730)) +- Forced `ModelCheckpoint` callbacks to run after all others to guarantee all states are saved to the checkpoint 
([#5731](https://github.com/Lightning-AI/lightning/pull/5731)) - Refactored Accelerators and Plugins: - * Added base classes for plugins ([#5715](https://github.com/PyTorchLightning/pytorch-lightning/pull/5715)) - * Added parallel plugins for DP, DDP, DDPSpawn, DDP2 and Horovod ([#5714](https://github.com/PyTorchLightning/pytorch-lightning/pull/5714)) - * Precision Plugins ([#5718](https://github.com/PyTorchLightning/pytorch-lightning/pull/5718)) - * Added new Accelerators for CPU, GPU and TPU ([#5719](https://github.com/PyTorchLightning/pytorch-lightning/pull/5719)) - * Added RPC and Sharded plugins ([#5732](https://github.com/PyTorchLightning/pytorch-lightning/pull/5732)) - * Added missing `LightningModule`-wrapper logic to new plugins and accelerator ([#5734](https://github.com/PyTorchLightning/pytorch-lightning/pull/5734)) - * Moved device-specific teardown logic from training loop to accelerator ([#5973](https://github.com/PyTorchLightning/pytorch-lightning/pull/5973)) - * Moved accelerator_connector.py to the connectors subfolder ([#6033](https://github.com/PyTorchLightning/pytorch-lightning/pull/6033)) - * Trainer only references accelerator ([#6039](https://github.com/PyTorchLightning/pytorch-lightning/pull/6039)) - * Made parallel devices optional across all plugins ([#6051](https://github.com/PyTorchLightning/pytorch-lightning/pull/6051)) - * Cleaning ([#5948](https://github.com/PyTorchLightning/pytorch-lightning/pull/5948), - [#5949](https://github.com/PyTorchLightning/pytorch-lightning/pull/5949), - [#5950](https://github.com/PyTorchLightning/pytorch-lightning/pull/5950)) -- Enabled `self.log` in callbacks ([#5094](https://github.com/PyTorchLightning/pytorch-lightning/pull/5094)) -- Renamed xxx_AVAILABLE as protected ([#5082](https://github.com/PyTorchLightning/pytorch-lightning/pull/5082)) -- Unified module names in Utils ([#5199](https://github.com/PyTorchLightning/pytorch-lightning/pull/5199)) -- Separated utils: imports & enums ([#5256](https://github.com/PyTorchLightning/pytorch-lightning/pull/5256) - [#5874](https://github.com/PyTorchLightning/pytorch-lightning/pull/5874)) -- Refactor: clean trainer device & distributed getters ([#5300](https://github.com/PyTorchLightning/pytorch-lightning/pull/5300)) -- Simplified training phase as LightningEnum ([#5419](https://github.com/PyTorchLightning/pytorch-lightning/pull/5419)) -- Updated metrics to use LightningEnum ([#5689](https://github.com/PyTorchLightning/pytorch-lightning/pull/5689)) -- Changed the seq of `on_train_batch_end`, `on_batch_end` & `on_train_epoch_end`, `on_epoch_end hooks` ([#5688](https://github.com/PyTorchLightning/pytorch-lightning/pull/5688)) -- Refactored `setup_training` and remove `test_mode` ([#5388](https://github.com/PyTorchLightning/pytorch-lightning/pull/5388)) -- Disabled training with zero `num_training_batches` when insufficient `limit_train_batches` ([#5703](https://github.com/PyTorchLightning/pytorch-lightning/pull/5703)) -- Refactored `EpochResultStore` ([#5522](https://github.com/PyTorchLightning/pytorch-lightning/pull/5522)) -- Update `lr_finder` to check for attribute if not running `fast_dev_run` ([#5990](https://github.com/PyTorchLightning/pytorch-lightning/pull/5990)) -- LightningOptimizer manual optimizer is more flexible and expose `toggle_model` ([#5771](https://github.com/PyTorchLightning/pytorch-lightning/pull/5771)) -- `MlflowLogger` limit parameter value length to 250 char ([#5893](https://github.com/PyTorchLightning/pytorch-lightning/pull/5893)) -- Re-introduced fix for Hydra 
directory sync with multiple process ([#5993](https://github.com/PyTorchLightning/pytorch-lightning/pull/5993)) + * Added base classes for plugins ([#5715](https://github.com/Lightning-AI/lightning/pull/5715)) + * Added parallel plugins for DP, DDP, DDPSpawn, DDP2 and Horovod ([#5714](https://github.com/Lightning-AI/lightning/pull/5714)) + * Precision Plugins ([#5718](https://github.com/Lightning-AI/lightning/pull/5718)) + * Added new Accelerators for CPU, GPU and TPU ([#5719](https://github.com/Lightning-AI/lightning/pull/5719)) + * Added RPC and Sharded plugins ([#5732](https://github.com/Lightning-AI/lightning/pull/5732)) + * Added missing `LightningModule`-wrapper logic to new plugins and accelerator ([#5734](https://github.com/Lightning-AI/lightning/pull/5734)) + * Moved device-specific teardown logic from training loop to accelerator ([#5973](https://github.com/Lightning-AI/lightning/pull/5973)) + * Moved accelerator_connector.py to the connectors subfolder ([#6033](https://github.com/Lightning-AI/lightning/pull/6033)) + * Trainer only references accelerator ([#6039](https://github.com/Lightning-AI/lightning/pull/6039)) + * Made parallel devices optional across all plugins ([#6051](https://github.com/Lightning-AI/lightning/pull/6051)) + * Cleaning ([#5948](https://github.com/Lightning-AI/lightning/pull/5948), + [#5949](https://github.com/Lightning-AI/lightning/pull/5949), + [#5950](https://github.com/Lightning-AI/lightning/pull/5950)) +- Enabled `self.log` in callbacks ([#5094](https://github.com/Lightning-AI/lightning/pull/5094)) +- Renamed xxx_AVAILABLE as protected ([#5082](https://github.com/Lightning-AI/lightning/pull/5082)) +- Unified module names in Utils ([#5199](https://github.com/Lightning-AI/lightning/pull/5199)) +- Separated utils: imports & enums ([#5256](https://github.com/Lightning-AI/lightning/pull/5256) + [#5874](https://github.com/Lightning-AI/lightning/pull/5874)) +- Refactor: clean trainer device & distributed getters ([#5300](https://github.com/Lightning-AI/lightning/pull/5300)) +- Simplified training phase as LightningEnum ([#5419](https://github.com/Lightning-AI/lightning/pull/5419)) +- Updated metrics to use LightningEnum ([#5689](https://github.com/Lightning-AI/lightning/pull/5689)) +- Changed the seq of `on_train_batch_end`, `on_batch_end` & `on_train_epoch_end`, `on_epoch_end hooks` ([#5688](https://github.com/Lightning-AI/lightning/pull/5688)) +- Refactored `setup_training` and remove `test_mode` ([#5388](https://github.com/Lightning-AI/lightning/pull/5388)) +- Disabled training with zero `num_training_batches` when insufficient `limit_train_batches` ([#5703](https://github.com/Lightning-AI/lightning/pull/5703)) +- Refactored `EpochResultStore` ([#5522](https://github.com/Lightning-AI/lightning/pull/5522)) +- Update `lr_finder` to check for attribute if not running `fast_dev_run` ([#5990](https://github.com/Lightning-AI/lightning/pull/5990)) +- LightningOptimizer manual optimizer is more flexible and expose `toggle_model` ([#5771](https://github.com/Lightning-AI/lightning/pull/5771)) +- `MlflowLogger` limit parameter value length to 250 char ([#5893](https://github.com/Lightning-AI/lightning/pull/5893)) +- Re-introduced fix for Hydra directory sync with multiple process ([#5993](https://github.com/Lightning-AI/lightning/pull/5993)) ### Deprecated -- Function `stat_scores_multiple_classes` is deprecated in favor of `stat_scores` ([#4839](https://github.com/PyTorchLightning/pytorch-lightning/pull/4839)) -- Moved accelerators and plugins to its `legacy` pkg 
-- Deprecated `LightningDistributedDataParallel` in favor of new wrapper module `LightningDistributedModule` ([#5185](https://github.com/PyTorchLightning/pytorch-lightning/pull/5185))
-- Deprecated `LightningDataParallel` in favor of new wrapper module `LightningParallelModule` ([#5670](https://github.com/PyTorchLightning/pytorch-lightning/pull/5670))
-- Renamed utils modules ([#5199](https://github.com/PyTorchLightning/pytorch-lightning/pull/5199))
+- Function `stat_scores_multiple_classes` is deprecated in favor of `stat_scores` ([#4839](https://github.com/Lightning-AI/lightning/pull/4839))
+- Moved accelerators and plugins to its `legacy` pkg ([#5645](https://github.com/Lightning-AI/lightning/pull/5645))
+- Deprecated `LightningDistributedDataParallel` in favor of new wrapper module `LightningDistributedModule` ([#5185](https://github.com/Lightning-AI/lightning/pull/5185))
+- Deprecated `LightningDataParallel` in favor of new wrapper module `LightningParallelModule` ([#5670](https://github.com/Lightning-AI/lightning/pull/5670))
+- Renamed utils modules ([#5199](https://github.com/Lightning-AI/lightning/pull/5199))
 * `argparse_utils` >> `argparse`
 * `model_utils` >> `model_helpers`
 * `warning_utils` >> `warnings`
 * `xla_device_utils` >> `xla_device`
-- Deprecated using `'val_loss'` to set the `ModelCheckpoint` monitor ([#6012](https://github.com/PyTorchLightning/pytorch-lightning/pull/6012))
-- Deprecated `.get_model()` with explicit `.lightning_module` property ([#6035](https://github.com/PyTorchLightning/pytorch-lightning/pull/6035))
-- Deprecated Trainer attribute `accelerator_backend` in favor of `accelerator` ([#6034](https://github.com/PyTorchLightning/pytorch-lightning/pull/6034))
+- Deprecated using `'val_loss'` to set the `ModelCheckpoint` monitor ([#6012](https://github.com/Lightning-AI/lightning/pull/6012))
+- Deprecated `.get_model()` with explicit `.lightning_module` property ([#6035](https://github.com/Lightning-AI/lightning/pull/6035))
+- Deprecated Trainer attribute `accelerator_backend` in favor of `accelerator` ([#6034](https://github.com/Lightning-AI/lightning/pull/6034))
 
 ### Removed
 
-- Removed deprecated checkpoint argument `filepath` ([#5321](https://github.com/PyTorchLightning/pytorch-lightning/pull/5321))
-- Removed deprecated `Fbeta`, `f1_score` and `fbeta_score` metrics ([#5322](https://github.com/PyTorchLightning/pytorch-lightning/pull/5322))
-- Removed deprecated `TrainResult` ([#5323](https://github.com/PyTorchLightning/pytorch-lightning/pull/5323))
-- Removed deprecated `EvalResult` ([#5633](https://github.com/PyTorchLightning/pytorch-lightning/pull/5633))
-- Removed `LoggerStages` ([#5673](https://github.com/PyTorchLightning/pytorch-lightning/pull/5673))
-
-### Fixed
-
-- Fixed distributed setting and `ddp_cpu` only with `num_processes>1` ([#5297](https://github.com/PyTorchLightning/pytorch-lightning/pull/5297))
-- Fixed `num_workers` for Windows example ([#5375](https://github.com/PyTorchLightning/pytorch-lightning/pull/5375))
-- Fixed loading yaml ([#5619](https://github.com/PyTorchLightning/pytorch-lightning/pull/5619))
-- Fixed support custom DataLoader with DDP if they can be re-instantiated ([#5745](https://github.com/PyTorchLightning/pytorch-lightning/pull/5745))
-- Fixed repeated `.fit()` calls ignore max_steps iteration bound ([#5936](https://github.com/PyTorchLightning/pytorch-lightning/pull/5936))
-- Fixed throwing `MisconfigurationError` on unknown mode
([#5255](https://github.com/PyTorchLightning/pytorch-lightning/pull/5255)) -- Resolve bug with Finetuning ([#5744](https://github.com/PyTorchLightning/pytorch-lightning/pull/5744)) -- Fixed `ModelCheckpoint` race condition in file existence check ([#5155](https://github.com/PyTorchLightning/pytorch-lightning/pull/5155)) -- Fixed some compatibility with PyTorch 1.8 ([#5864](https://github.com/PyTorchLightning/pytorch-lightning/pull/5864)) -- Fixed forward cache ([#5895](https://github.com/PyTorchLightning/pytorch-lightning/pull/5895)) -- Fixed recursive detach of tensors to CPU ([#6007](https://github.com/PyTorchLightning/pytorch-lightning/pull/6007)) -- Fixed passing wrong strings for scheduler interval doesn't throw an error ([#5923](https://github.com/PyTorchLightning/pytorch-lightning/pull/5923)) -- Fixed wrong `requires_grad` state after `return None` with multiple optimizers ([#5738](https://github.com/PyTorchLightning/pytorch-lightning/pull/5638)) -- Fixed add `on_epoch_end` hook at the end of `validation`, `test` epoch ([#5986](https://github.com/PyTorchLightning/pytorch-lightning/pull/5986)) -- Fixed missing `process_dataloader` call for `TPUSpawn` when in distributed mode ([#6015](https://github.com/PyTorchLightning/pytorch-lightning/pull/6015)) -- Fixed progress bar flickering by appending 0 to floats/strings ([#6009](https://github.com/PyTorchLightning/pytorch-lightning/pull/6009)) -- Fixed synchronization issues with TPU training ([#6027](https://github.com/PyTorchLightning/pytorch-lightning/pull/6027)) -- Fixed `hparams.yaml` saved twice when using `TensorBoardLogger` ([#5953](https://github.com/PyTorchLightning/pytorch-lightning/pull/5953)) -- Fixed basic examples ([#5912](https://github.com/PyTorchLightning/pytorch-lightning/pull/5912), - [#5985](https://github.com/PyTorchLightning/pytorch-lightning/pull/5985)) -- Fixed `fairscale` compatible with PT 1.8 ([#5996](https://github.com/PyTorchLightning/pytorch-lightning/pull/5996)) -- Ensured `process_dataloader` is called when `tpu_cores > 1` to use Parallel DataLoader ([#6015](https://github.com/PyTorchLightning/pytorch-lightning/pull/6015)) -- Attempted SLURM auto resume call when non-shell call fails ([#6002](https://github.com/PyTorchLightning/pytorch-lightning/pull/6002)) -- Fixed wrapping optimizers upon assignment ([#6006](https://github.com/PyTorchLightning/pytorch-lightning/pull/6006)) -- Fixed allowing hashing of metrics with lists in their state ([#5939](https://github.com/PyTorchLightning/pytorch-lightning/pull/5939)) +- Removed deprecated checkpoint argument `filepath` ([#5321](https://github.com/Lightning-AI/lightning/pull/5321)) +- Removed deprecated `Fbeta`, `f1_score` and `fbeta_score` metrics ([#5322](https://github.com/Lightning-AI/lightning/pull/5322)) +- Removed deprecated `TrainResult` ([#5323](https://github.com/Lightning-AI/lightning/pull/5323)) +- Removed deprecated `EvalResult` ([#5633](https://github.com/Lightning-AI/lightning/pull/5633)) +- Removed `LoggerStages` ([#5673](https://github.com/Lightning-AI/lightning/pull/5673)) + +### Fixed + +- Fixed distributed setting and `ddp_cpu` only with `num_processes>1` ([#5297](https://github.com/Lightning-AI/lightning/pull/5297)) +- Fixed `num_workers` for Windows example ([#5375](https://github.com/Lightning-AI/lightning/pull/5375)) +- Fixed loading yaml ([#5619](https://github.com/Lightning-AI/lightning/pull/5619)) +- Fixed support custom DataLoader with DDP if they can be re-instantiated ([#5745](https://github.com/Lightning-AI/lightning/pull/5745)) +- 
Fixed repeated `.fit()` calls ignore max_steps iteration bound ([#5936](https://github.com/Lightning-AI/lightning/pull/5936)) +- Fixed throwing `MisconfigurationError` on unknown mode ([#5255](https://github.com/Lightning-AI/lightning/pull/5255)) +- Resolve bug with Finetuning ([#5744](https://github.com/Lightning-AI/lightning/pull/5744)) +- Fixed `ModelCheckpoint` race condition in file existence check ([#5155](https://github.com/Lightning-AI/lightning/pull/5155)) +- Fixed some compatibility with PyTorch 1.8 ([#5864](https://github.com/Lightning-AI/lightning/pull/5864)) +- Fixed forward cache ([#5895](https://github.com/Lightning-AI/lightning/pull/5895)) +- Fixed recursive detach of tensors to CPU ([#6007](https://github.com/Lightning-AI/lightning/pull/6007)) +- Fixed passing wrong strings for scheduler interval doesn't throw an error ([#5923](https://github.com/Lightning-AI/lightning/pull/5923)) +- Fixed wrong `requires_grad` state after `return None` with multiple optimizers ([#5738](https://github.com/Lightning-AI/lightning/pull/5638)) +- Fixed add `on_epoch_end` hook at the end of `validation`, `test` epoch ([#5986](https://github.com/Lightning-AI/lightning/pull/5986)) +- Fixed missing `process_dataloader` call for `TPUSpawn` when in distributed mode ([#6015](https://github.com/Lightning-AI/lightning/pull/6015)) +- Fixed progress bar flickering by appending 0 to floats/strings ([#6009](https://github.com/Lightning-AI/lightning/pull/6009)) +- Fixed synchronization issues with TPU training ([#6027](https://github.com/Lightning-AI/lightning/pull/6027)) +- Fixed `hparams.yaml` saved twice when using `TensorBoardLogger` ([#5953](https://github.com/Lightning-AI/lightning/pull/5953)) +- Fixed basic examples ([#5912](https://github.com/Lightning-AI/lightning/pull/5912), + [#5985](https://github.com/Lightning-AI/lightning/pull/5985)) +- Fixed `fairscale` compatible with PT 1.8 ([#5996](https://github.com/Lightning-AI/lightning/pull/5996)) +- Ensured `process_dataloader` is called when `tpu_cores > 1` to use Parallel DataLoader ([#6015](https://github.com/Lightning-AI/lightning/pull/6015)) +- Attempted SLURM auto resume call when non-shell call fails ([#6002](https://github.com/Lightning-AI/lightning/pull/6002)) +- Fixed wrapping optimizers upon assignment ([#6006](https://github.com/Lightning-AI/lightning/pull/6006)) +- Fixed allowing hashing of metrics with lists in their state ([#5939](https://github.com/Lightning-AI/lightning/pull/5939)) ## [1.1.8] - 2021-02-08 ### Fixed -- Separate epoch validation from step validation ([#5208](https://github.com/PyTorchLightning/pytorch-lightning/pull/5208)) -- Fixed `toggle_optimizers` not handling all optimizer parameters ([#5775](https://github.com/PyTorchLightning/pytorch-lightning/pull/5775)) +- Separate epoch validation from step validation ([#5208](https://github.com/Lightning-AI/lightning/pull/5208)) +- Fixed `toggle_optimizers` not handling all optimizer parameters ([#5775](https://github.com/Lightning-AI/lightning/pull/5775)) ## [1.1.7] - 2021-02-03 ### Fixed -- Fixed `TensorBoardLogger` not closing `SummaryWriter` on `finalize` ([#5696](https://github.com/PyTorchLightning/pytorch-lightning/pull/5696)) -- Fixed filtering of pytorch "unsqueeze" warning when using DP ([#5622](https://github.com/PyTorchLightning/pytorch-lightning/pull/5622)) -- Fixed `num_classes` argument in F1 metric ([#5663](https://github.com/PyTorchLightning/pytorch-lightning/pull/5663)) -- Fixed `log_dir` property 
([#5537](https://github.com/PyTorchLightning/pytorch-lightning/pull/5537)) -- Fixed a race condition in `ModelCheckpoint` when checking if a checkpoint file exists ([#5144](https://github.com/PyTorchLightning/pytorch-lightning/pull/5144)) -- Remove unnecessary intermediate layers in Dockerfiles ([#5697](https://github.com/PyTorchLightning/pytorch-lightning/pull/5697)) -- Fixed auto learning rate ordering ([#5638](https://github.com/PyTorchLightning/pytorch-lightning/pull/5638)) +- Fixed `TensorBoardLogger` not closing `SummaryWriter` on `finalize` ([#5696](https://github.com/Lightning-AI/lightning/pull/5696)) +- Fixed filtering of pytorch "unsqueeze" warning when using DP ([#5622](https://github.com/Lightning-AI/lightning/pull/5622)) +- Fixed `num_classes` argument in F1 metric ([#5663](https://github.com/Lightning-AI/lightning/pull/5663)) +- Fixed `log_dir` property ([#5537](https://github.com/Lightning-AI/lightning/pull/5537)) +- Fixed a race condition in `ModelCheckpoint` when checking if a checkpoint file exists ([#5144](https://github.com/Lightning-AI/lightning/pull/5144)) +- Remove unnecessary intermediate layers in Dockerfiles ([#5697](https://github.com/Lightning-AI/lightning/pull/5697)) +- Fixed auto learning rate ordering ([#5638](https://github.com/Lightning-AI/lightning/pull/5638)) ## [1.1.6] - 2021-01-26 ### Changed -- Increased TPU check timeout from 20s to 100s ([#5598](https://github.com/PyTorchLightning/pytorch-lightning/pull/5598)) -- Ignored `step` param in Neptune logger's log_metric method ([#5510](https://github.com/PyTorchLightning/pytorch-lightning/pull/5510)) -- Pass batch outputs to `on_train_batch_end` instead of `epoch_end` outputs ([#4369](https://github.com/PyTorchLightning/pytorch-lightning/pull/4369)) +- Increased TPU check timeout from 20s to 100s ([#5598](https://github.com/Lightning-AI/lightning/pull/5598)) +- Ignored `step` param in Neptune logger's log_metric method ([#5510](https://github.com/Lightning-AI/lightning/pull/5510)) +- Pass batch outputs to `on_train_batch_end` instead of `epoch_end` outputs ([#4369](https://github.com/Lightning-AI/lightning/pull/4369)) ### Fixed -- Fixed `toggle_optimizer` to reset `requires_grad` state ([#5574](https://github.com/PyTorchLightning/pytorch-lightning/pull/5574)) -- Fixed FileNotFoundError for best checkpoint when using DDP with Hydra ([#5629](https://github.com/PyTorchLightning/pytorch-lightning/pull/5629)) -- Fixed an error when logging a progress bar metric with a reserved name ([#5620](https://github.com/PyTorchLightning/pytorch-lightning/pull/5620)) -- Fixed `Metric`'s `state_dict` not included when child modules ([#5614](https://github.com/PyTorchLightning/pytorch-lightning/pull/5614)) -- Fixed Neptune logger creating multiple experiments when GPUs > 1 ([#3256](https://github.com/PyTorchLightning/pytorch-lightning/pull/3256)) -- Fixed duplicate logs appearing in console when using the python logging module ([#5509](https://github.com/PyTorchLightning/pytorch-lightning/pull/5509)) -- Fixed tensor printing in `trainer.test()` ([#5138](https://github.com/PyTorchLightning/pytorch-lightning/pull/5138)) -- Fixed not using dataloader when `hparams` present ([#4559](https://github.com/PyTorchLightning/pytorch-lightning/pull/4559)) +- Fixed `toggle_optimizer` to reset `requires_grad` state ([#5574](https://github.com/Lightning-AI/lightning/pull/5574)) +- Fixed FileNotFoundError for best checkpoint when using DDP with Hydra ([#5629](https://github.com/Lightning-AI/lightning/pull/5629)) +- Fixed an error when 
logging a progress bar metric with a reserved name ([#5620](https://github.com/Lightning-AI/lightning/pull/5620)) +- Fixed `Metric`'s `state_dict` not included when child modules ([#5614](https://github.com/Lightning-AI/lightning/pull/5614)) +- Fixed Neptune logger creating multiple experiments when GPUs > 1 ([#3256](https://github.com/Lightning-AI/lightning/pull/3256)) +- Fixed duplicate logs appearing in console when using the python logging module ([#5509](https://github.com/Lightning-AI/lightning/pull/5509)) +- Fixed tensor printing in `trainer.test()` ([#5138](https://github.com/Lightning-AI/lightning/pull/5138)) +- Fixed not using dataloader when `hparams` present ([#4559](https://github.com/Lightning-AI/lightning/pull/4559)) ## [1.1.5] - 2021-01-19 ### Fixed -- Fixed a visual bug in the progress bar display initialization ([#4579](https://github.com/PyTorchLightning/pytorch-lightning/pull/4579)) -- Fixed logging `on_train_batch_end` in a callback with multiple optimizers ([#5521](https://github.com/PyTorchLightning/pytorch-lightning/pull/5521)) -- Fixed `reinit_scheduler_properties` with correct optimizer ([#5519](https://github.com/PyTorchLightning/pytorch-lightning/pull/5519)) -- Fixed `val_check_interval` with `fast_dev_run` ([#5540](https://github.com/PyTorchLightning/pytorch-lightning/pull/5540)) +- Fixed a visual bug in the progress bar display initialization ([#4579](https://github.com/Lightning-AI/lightning/pull/4579)) +- Fixed logging `on_train_batch_end` in a callback with multiple optimizers ([#5521](https://github.com/Lightning-AI/lightning/pull/5521)) +- Fixed `reinit_scheduler_properties` with correct optimizer ([#5519](https://github.com/Lightning-AI/lightning/pull/5519)) +- Fixed `val_check_interval` with `fast_dev_run` ([#5540](https://github.com/Lightning-AI/lightning/pull/5540)) ## [1.1.4] - 2021-01-12 ### Added -- Add automatic optimization property setter to lightning module ([#5169](https://github.com/PyTorchLightning/pytorch-lightning/pull/5169)) +- Add automatic optimization property setter to lightning module ([#5169](https://github.com/Lightning-AI/lightning/pull/5169)) ### Changed -- Changed deprecated `enable_pl_optimizer=True` ([#5244](https://github.com/PyTorchLightning/pytorch-lightning/pull/5244)) +- Changed deprecated `enable_pl_optimizer=True` ([#5244](https://github.com/Lightning-AI/lightning/pull/5244)) ### Fixed -- Fixed `transfer_batch_to_device` for DDP with `len(devices_ids) == 1` ([#5195](https://github.com/PyTorchLightning/pytorch-lightning/pull/5195)) -- Logging only on `not should_accumulate()` during training ([#5417](https://github.com/PyTorchLightning/pytorch-lightning/pull/5417)) -- Resolve interpolation bug with Hydra ([#5406](https://github.com/PyTorchLightning/pytorch-lightning/pull/5406)) -- Check environ before selecting a seed to prevent warning message ([#4743](https://github.com/PyTorchLightning/pytorch-lightning/pull/4743)) -- Fixed signature mismatch in `model_to_device` of `DDPCPUHPCAccelerator` ([#5505](https://github.com/PyTorchLightning/pytorch-lightning/pull/5505)) +- Fixed `transfer_batch_to_device` for DDP with `len(devices_ids) == 1` ([#5195](https://github.com/Lightning-AI/lightning/pull/5195)) +- Logging only on `not should_accumulate()` during training ([#5417](https://github.com/Lightning-AI/lightning/pull/5417)) +- Resolve interpolation bug with Hydra ([#5406](https://github.com/Lightning-AI/lightning/pull/5406)) +- Check environ before selecting a seed to prevent warning message 
([#4743](https://github.com/Lightning-AI/lightning/pull/4743)) +- Fixed signature mismatch in `model_to_device` of `DDPCPUHPCAccelerator` ([#5505](https://github.com/Lightning-AI/lightning/pull/5505)) ## [1.1.3] - 2021-01-05 ### Added -- Added a check for optimizer attached to `lr_scheduler` ([#5338](https://github.com/PyTorchLightning/pytorch-lightning/pull/5338)) -- Added support for passing non-existing filepaths to `resume_from_checkpoint` ([#4402](https://github.com/PyTorchLightning/pytorch-lightning/pull/4402)) +- Added a check for optimizer attached to `lr_scheduler` ([#5338](https://github.com/Lightning-AI/lightning/pull/5338)) +- Added support for passing non-existing filepaths to `resume_from_checkpoint` ([#4402](https://github.com/Lightning-AI/lightning/pull/4402)) ### Changed -- Skip restore from `resume_from_checkpoint` while `testing` ([#5161](https://github.com/PyTorchLightning/pytorch-lightning/pull/5161)) -- Allowed `log_momentum` for adaptive optimizers in `LearningRateMonitor` ([#5333](https://github.com/PyTorchLightning/pytorch-lightning/pull/5333)) -- Disabled checkpointing, earlystopping and logging with `fast_dev_run` ([#5277](https://github.com/PyTorchLightning/pytorch-lightning/pull/5277)) -- Distributed group defaults to `WORLD` if `None` ([#5125](https://github.com/PyTorchLightning/pytorch-lightning/pull/5125)) +- Skip restore from `resume_from_checkpoint` while `testing` ([#5161](https://github.com/Lightning-AI/lightning/pull/5161)) +- Allowed `log_momentum` for adaptive optimizers in `LearningRateMonitor` ([#5333](https://github.com/Lightning-AI/lightning/pull/5333)) +- Disabled checkpointing, earlystopping and logging with `fast_dev_run` ([#5277](https://github.com/Lightning-AI/lightning/pull/5277)) +- Distributed group defaults to `WORLD` if `None` ([#5125](https://github.com/Lightning-AI/lightning/pull/5125)) ### Fixed -- Fixed `trainer.test` returning non-test metrics ([#5214](https://github.com/PyTorchLightning/pytorch-lightning/pull/5214)) -- Fixed metric state reset ([#5273](https://github.com/PyTorchLightning/pytorch-lightning/pull/5273)) -- Fixed `--num-nodes` on `DDPSequentialPlugin` ([#5327](https://github.com/PyTorchLightning/pytorch-lightning/pull/5327)) -- Fixed invalid value for `weights_summary` ([#5296](https://github.com/PyTorchLightning/pytorch-lightning/pull/5296)) -- Fixed `Trainer.test` not using the latest `best_model_path` ([#5161](https://github.com/PyTorchLightning/pytorch-lightning/pull/5161)) -- Fixed existence check for hparams not using underlying filesystem ([#5250](https://github.com/PyTorchLightning/pytorch-lightning/pull/5250)) -- Fixed `LightningOptimizer` AMP bug ([#5191](https://github.com/PyTorchLightning/pytorch-lightning/pull/5191)) -- Fixed casted key to string in `_flatten_dict` ([#5354](https://github.com/PyTorchLightning/pytorch-lightning/pull/5354)) +- Fixed `trainer.test` returning non-test metrics ([#5214](https://github.com/Lightning-AI/lightning/pull/5214)) +- Fixed metric state reset ([#5273](https://github.com/Lightning-AI/lightning/pull/5273)) +- Fixed `--num-nodes` on `DDPSequentialPlugin` ([#5327](https://github.com/Lightning-AI/lightning/pull/5327)) +- Fixed invalid value for `weights_summary` ([#5296](https://github.com/Lightning-AI/lightning/pull/5296)) +- Fixed `Trainer.test` not using the latest `best_model_path` ([#5161](https://github.com/Lightning-AI/lightning/pull/5161)) +- Fixed existence check for hparams not using underlying filesystem 
([#5250](https://github.com/Lightning-AI/lightning/pull/5250)) +- Fixed `LightningOptimizer` AMP bug ([#5191](https://github.com/Lightning-AI/lightning/pull/5191)) +- Fixed casted key to string in `_flatten_dict` ([#5354](https://github.com/Lightning-AI/lightning/pull/5354)) ## [1.1.2] - 2020-12-23 ### Added -- Support number for logging with `sync_dist=True` ([#5080](https://github.com/PyTorchLightning/pytorch-lightning/pull/5080)) -- Added offset logging step when resuming for Wandb logger ([#5050](https://github.com/PyTorchLightning/pytorch-lightning/pull/5050)) +- Support number for logging with `sync_dist=True` ([#5080](https://github.com/Lightning-AI/lightning/pull/5080)) +- Added offset logging step when resuming for Wandb logger ([#5050](https://github.com/Lightning-AI/lightning/pull/5050)) ### Removed -- `enable_pl_optimizer=False` by default to temporarily fix AMP issues ([#5163](https://github.com/PyTorchLightning/pytorch-lightning/pull/5163)) +- `enable_pl_optimizer=False` by default to temporarily fix AMP issues ([#5163](https://github.com/Lightning-AI/lightning/pull/5163)) ### Fixed -- Metric reduction with Logging ([#5150](https://github.com/PyTorchLightning/pytorch-lightning/pull/5150)) -- Remove nan loss in manual optimization ([#5121](https://github.com/PyTorchLightning/pytorch-lightning/pull/5121)) -- Un-balanced logging properly supported ([#5119](https://github.com/PyTorchLightning/pytorch-lightning/pull/5119)) -- Fix hanging in DDP HPC accelerators ([#5157](https://github.com/PyTorchLightning/pytorch-lightning/pull/5157)) -- Fix reset `TensorRunningAccum` ([#5106](https://github.com/PyTorchLightning/pytorch-lightning/pull/5106)) -- Updated `DALIClassificationLoader` to not use deprecated arguments ([#4925](https://github.com/PyTorchLightning/pytorch-lightning/pull/4925)) -- Corrected call to `torch.no_grad` ([#5124](https://github.com/PyTorchLightning/pytorch-lightning/pull/5124)) +- Metric reduction with Logging ([#5150](https://github.com/Lightning-AI/lightning/pull/5150)) +- Remove nan loss in manual optimization ([#5121](https://github.com/Lightning-AI/lightning/pull/5121)) +- Un-balanced logging properly supported ([#5119](https://github.com/Lightning-AI/lightning/pull/5119)) +- Fix hanging in DDP HPC accelerators ([#5157](https://github.com/Lightning-AI/lightning/pull/5157)) +- Fix reset `TensorRunningAccum` ([#5106](https://github.com/Lightning-AI/lightning/pull/5106)) +- Updated `DALIClassificationLoader` to not use deprecated arguments ([#4925](https://github.com/Lightning-AI/lightning/pull/4925)) +- Corrected call to `torch.no_grad` ([#5124](https://github.com/Lightning-AI/lightning/pull/5124)) ## [1.1.1] - 2020-12-15 ### Added -- Add a notebook example to reach a quick baseline of ~94% accuracy on CIFAR10 using Resnet in Lightning ([#4818](https://github.com/PyTorchLightning/pytorch-lightning/pull/4818)) +- Add a notebook example to reach a quick baseline of ~94% accuracy on CIFAR10 using Resnet in Lightning ([#4818](https://github.com/Lightning-AI/lightning/pull/4818)) ### Changed -- Simplify accelerator steps ([#5015](https://github.com/PyTorchLightning/pytorch-lightning/pull/5015)) -- Refactor load in checkpoint connector ([#4593](https://github.com/PyTorchLightning/pytorch-lightning/pull/4593)) -- Fixed the saved filename in `ModelCheckpoint` when it already exists ([#4861](https://github.com/PyTorchLightning/pytorch-lightning/pull/4861)) +- Simplify accelerator steps ([#5015](https://github.com/Lightning-AI/lightning/pull/5015)) +- Refactor load in 
checkpoint connector ([#4593](https://github.com/Lightning-AI/lightning/pull/4593)) +- Fixed the saved filename in `ModelCheckpoint` when it already exists ([#4861](https://github.com/Lightning-AI/lightning/pull/4861)) ### Removed -- Drop duplicate metrics ([#5014](https://github.com/PyTorchLightning/pytorch-lightning/pull/5014)) -- Remove beta arg from F1 class and functional ([#5076](https://github.com/PyTorchLightning/pytorch-lightning/pull/5076)) +- Drop duplicate metrics ([#5014](https://github.com/Lightning-AI/lightning/pull/5014)) +- Remove beta arg from F1 class and functional ([#5076](https://github.com/Lightning-AI/lightning/pull/5076)) ### Fixed -- Fixed trainer by default `None` in `DDPAccelerator` ([#4915](https://github.com/PyTorchLightning/pytorch-lightning/pull/4915)) -- Fixed `LightningOptimizer` to expose optimizer attributes ([#5095](https://github.com/PyTorchLightning/pytorch-lightning/pull/5095)) -- Do not warn when the `name` key is used in the `lr_scheduler` dict ([#5057](https://github.com/PyTorchLightning/pytorch-lightning/pull/5057)) -- Check if optimizer supports closure ([#4981](https://github.com/PyTorchLightning/pytorch-lightning/pull/4981)) +- Fixed trainer by default `None` in `DDPAccelerator` ([#4915](https://github.com/Lightning-AI/lightning/pull/4915)) +- Fixed `LightningOptimizer` to expose optimizer attributes ([#5095](https://github.com/Lightning-AI/lightning/pull/5095)) +- Do not warn when the `name` key is used in the `lr_scheduler` dict ([#5057](https://github.com/Lightning-AI/lightning/pull/5057)) +- Check if optimizer supports closure ([#4981](https://github.com/Lightning-AI/lightning/pull/4981)) - Add deprecated metric utility functions back to functional ( - [#5067](https://github.com/PyTorchLightning/pytorch-lightning/pull/5067), - [#5068](https://github.com/PyTorchLightning/pytorch-lightning/pull/5068)) -- Allow any input in `to_onnx` and `to_torchscript` ([#4378](https://github.com/PyTorchLightning/pytorch-lightning/pull/4378)) -- Fixed `DDPHPCAccelerator` hangs in DDP construction by calling `init_device` ([#5157](https://github.com/PyTorchLightning/pytorch-lightning/pull/5157)) + [#5067](https://github.com/Lightning-AI/lightning/pull/5067), + [#5068](https://github.com/Lightning-AI/lightning/pull/5068)) +- Allow any input in `to_onnx` and `to_torchscript` ([#4378](https://github.com/Lightning-AI/lightning/pull/4378)) +- Fixed `DDPHPCAccelerator` hangs in DDP construction by calling `init_device` ([#5157](https://github.com/Lightning-AI/lightning/pull/5157)) ## [1.1.0] - 2020-12-09 ### Added -- Added "monitor" key to saved `ModelCheckpoints` ([#4383](https://github.com/PyTorchLightning/pytorch-lightning/pull/4383)) -- Added `ConfusionMatrix` class interface ([#4348](https://github.com/PyTorchLightning/pytorch-lightning/pull/4348)) -- Added multiclass AUROC metric ([#4236](https://github.com/PyTorchLightning/pytorch-lightning/pull/4236)) -- Added global step indexing to the checkpoint name for a better sub-epoch checkpointing experience ([#3807](https://github.com/PyTorchLightning/pytorch-lightning/pull/3807)) -- Added optimizer hooks in callbacks ([#4379](https://github.com/PyTorchLightning/pytorch-lightning/pull/4379)) -- Added option to log momentum ([#4384](https://github.com/PyTorchLightning/pytorch-lightning/pull/4384)) -- Added `current_score` to `ModelCheckpoint.on_save_checkpoint` ([#4721](https://github.com/PyTorchLightning/pytorch-lightning/pull/4721)) +- Added "monitor" key to saved `ModelCheckpoints` 
([#4383](https://github.com/Lightning-AI/lightning/pull/4383)) +- Added `ConfusionMatrix` class interface ([#4348](https://github.com/Lightning-AI/lightning/pull/4348)) +- Added multiclass AUROC metric ([#4236](https://github.com/Lightning-AI/lightning/pull/4236)) +- Added global step indexing to the checkpoint name for a better sub-epoch checkpointing experience ([#3807](https://github.com/Lightning-AI/lightning/pull/3807)) +- Added optimizer hooks in callbacks ([#4379](https://github.com/Lightning-AI/lightning/pull/4379)) +- Added option to log momentum ([#4384](https://github.com/Lightning-AI/lightning/pull/4384)) +- Added `current_score` to `ModelCheckpoint.on_save_checkpoint` ([#4721](https://github.com/Lightning-AI/lightning/pull/4721)) - Added logging using `self.log` in train and evaluation for epoch end hooks ( - [#4552](https://github.com/PyTorchLightning/pytorch-lightning/pull/4552), - [#4495](https://github.com/PyTorchLightning/pytorch-lightning/pull/4495), - [#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439), - [#4684](https://github.com/PyTorchLightning/pytorch-lightning/pull/4684), - [#4913](https://github.com/PyTorchLightning/pytorch-lightning/pull/4913)) -- Added ability for DDP plugin to modify optimizer state saving ([#4675](https://github.com/PyTorchLightning/pytorch-lightning/pull/4675)) -- Added `prefix` argument in loggers ([#4557](https://github.com/PyTorchLightning/pytorch-lightning/pull/4557)) -- Added printing of total num of params, trainable and non-trainable params in ModelSummary ([#4521](https://github.com/PyTorchLightning/pytorch-lightning/pull/4521)) -- Added `PrecisionRecallCurve, ROC, AveragePrecision` class metric ([#4549](https://github.com/PyTorchLightning/pytorch-lightning/pull/4549)) -- Added custom `Apex` and `NativeAMP` as `Precision plugins` ([#4355](https://github.com/PyTorchLightning/pytorch-lightning/pull/4355)) -- Added `DALI MNIST` example ([#3721](https://github.com/PyTorchLightning/pytorch-lightning/pull/3721)) + [#4552](https://github.com/Lightning-AI/lightning/pull/4552), + [#4495](https://github.com/Lightning-AI/lightning/pull/4495), + [#4439](https://github.com/Lightning-AI/lightning/pull/4439), + [#4684](https://github.com/Lightning-AI/lightning/pull/4684), + [#4913](https://github.com/Lightning-AI/lightning/pull/4913)) +- Added ability for DDP plugin to modify optimizer state saving ([#4675](https://github.com/Lightning-AI/lightning/pull/4675)) +- Added `prefix` argument in loggers ([#4557](https://github.com/Lightning-AI/lightning/pull/4557)) +- Added printing of total num of params, trainable and non-trainable params in ModelSummary ([#4521](https://github.com/Lightning-AI/lightning/pull/4521)) +- Added `PrecisionRecallCurve, ROC, AveragePrecision` class metric ([#4549](https://github.com/Lightning-AI/lightning/pull/4549)) +- Added custom `Apex` and `NativeAMP` as `Precision plugins` ([#4355](https://github.com/Lightning-AI/lightning/pull/4355)) +- Added `DALI MNIST` example ([#3721](https://github.com/Lightning-AI/lightning/pull/3721)) - Added `sharded plugin` for DDP for multi-gpu training memory optimizations ( - [#4639](https://github.com/PyTorchLightning/pytorch-lightning/pull/4639), - [#4686](https://github.com/PyTorchLightning/pytorch-lightning/pull/4686), - [#4737](https://github.com/PyTorchLightning/pytorch-lightning/pull/4737), - [#4773](https://github.com/PyTorchLightning/pytorch-lightning/pull/4773)) -- Added `experiment_id` to the NeptuneLogger 
([#3462](https://github.com/PyTorchLightning/pytorch-lightning/pull/3462)) -- Added `PyTorch Geometric` integration example with Lightning ([#4568](https://github.com/PyTorchLightning/pytorch-lightning/pull/4568)) -- Added `all_gather` method to `LightningModule` which allows gradient based tensor synchronizations for use-cases such as negative sampling. ([#5012](https://github.com/PyTorchLightning/pytorch-lightning/pull/5012)) -- Enabled `self.log` in most functions ([#4969](https://github.com/PyTorchLightning/pytorch-lightning/pull/4969)) -- Added changeable extension variable for `ModelCheckpoint` ([#4977](https://github.com/PyTorchLightning/pytorch-lightning/pull/4977)) + [#4639](https://github.com/Lightning-AI/lightning/pull/4639), + [#4686](https://github.com/Lightning-AI/lightning/pull/4686), + [#4737](https://github.com/Lightning-AI/lightning/pull/4737), + [#4773](https://github.com/Lightning-AI/lightning/pull/4773)) +- Added `experiment_id` to the NeptuneLogger ([#3462](https://github.com/Lightning-AI/lightning/pull/3462)) +- Added `PyTorch Geometric` integration example with Lightning ([#4568](https://github.com/Lightning-AI/lightning/pull/4568)) +- Added `all_gather` method to `LightningModule` which allows gradient based tensor synchronizations for use-cases such as negative sampling. ([#5012](https://github.com/Lightning-AI/lightning/pull/5012)) +- Enabled `self.log` in most functions ([#4969](https://github.com/Lightning-AI/lightning/pull/4969)) +- Added changeable extension variable for `ModelCheckpoint` ([#4977](https://github.com/Lightning-AI/lightning/pull/4977)) ### Changed -- Tuner algorithms will be skipped if `fast_dev_run=True` ([#3903](https://github.com/PyTorchLightning/pytorch-lightning/pull/3903)) -- `WandbLogger` does not force wandb `reinit` arg to True anymore and creates a run only when needed ([#4648](https://github.com/PyTorchLightning/pytorch-lightning/pull/4648)) -- Changed `automatic_optimization` to be a model attribute ([#4602](https://github.com/PyTorchLightning/pytorch-lightning/pull/4602)) -- Changed `Simple Profiler` report to order by percentage time spent + num calls ([#4880](https://github.com/PyTorchLightning/pytorch-lightning/pull/4880)) -- Simplify optimization Logic ([#4984](https://github.com/PyTorchLightning/pytorch-lightning/pull/4984)) -- Classification metrics overhaul ([#4837](https://github.com/PyTorchLightning/pytorch-lightning/pull/4837)) -- Updated `fast_dev_run` to accept integer representing num_batches ([#4629](https://github.com/PyTorchLightning/pytorch-lightning/pull/4629)) -- Refactored optimizer ([#4658](https://github.com/PyTorchLightning/pytorch-lightning/pull/4658)) +- Tuner algorithms will be skipped if `fast_dev_run=True` ([#3903](https://github.com/Lightning-AI/lightning/pull/3903)) +- `WandbLogger` does not force wandb `reinit` arg to True anymore and creates a run only when needed ([#4648](https://github.com/Lightning-AI/lightning/pull/4648)) +- Changed `automatic_optimization` to be a model attribute ([#4602](https://github.com/Lightning-AI/lightning/pull/4602)) +- Changed `Simple Profiler` report to order by percentage time spent + num calls ([#4880](https://github.com/Lightning-AI/lightning/pull/4880)) +- Simplify optimization Logic ([#4984](https://github.com/Lightning-AI/lightning/pull/4984)) +- Classification metrics overhaul ([#4837](https://github.com/Lightning-AI/lightning/pull/4837)) +- Updated `fast_dev_run` to accept integer representing num_batches 
([#4629](https://github.com/Lightning-AI/lightning/pull/4629)) +- Refactored optimizer ([#4658](https://github.com/Lightning-AI/lightning/pull/4658)) ### Deprecated -- Deprecated `prefix` argument in `ModelCheckpoint` ([#4765](https://github.com/PyTorchLightning/pytorch-lightning/pull/4765)) -- Deprecated the old way of assigning hyper-parameters through `self.hparams = ...` ([#4813](https://github.com/PyTorchLightning/pytorch-lightning/pull/4813)) -- Deprecated `mode='auto'` from `ModelCheckpoint` and `EarlyStopping` ([#4695](https://github.com/PyTorchLightning/pytorch-lightning/pull/4695)) +- Deprecated `prefix` argument in `ModelCheckpoint` ([#4765](https://github.com/Lightning-AI/lightning/pull/4765)) +- Deprecated the old way of assigning hyper-parameters through `self.hparams = ...` ([#4813](https://github.com/Lightning-AI/lightning/pull/4813)) +- Deprecated `mode='auto'` from `ModelCheckpoint` and `EarlyStopping` ([#4695](https://github.com/Lightning-AI/lightning/pull/4695)) ### Removed -- Removed `reorder` parameter of the `auc` metric ([#5004](https://github.com/PyTorchLightning/pytorch-lightning/pull/5004)) -- Removed `multiclass_roc` and `multiclass_precision_recall_curve`, use `roc` and `precision_recall_curve` instead ([#4549](https://github.com/PyTorchLightning/pytorch-lightning/pull/4549)) +- Removed `reorder` parameter of the `auc` metric ([#5004](https://github.com/Lightning-AI/lightning/pull/5004)) +- Removed `multiclass_roc` and `multiclass_precision_recall_curve`, use `roc` and `precision_recall_curve` instead ([#4549](https://github.com/Lightning-AI/lightning/pull/4549)) ### Fixed -- Added feature to move tensors to CPU before saving ([#4309](https://github.com/PyTorchLightning/pytorch-lightning/pull/4309)) -- Fixed `LoggerConnector` to have logged metrics on root device in DP ([#4138](https://github.com/PyTorchLightning/pytorch-lightning/pull/4138)) -- Auto convert tensors to contiguous format when `gather_all` ([#4907](https://github.com/PyTorchLightning/pytorch-lightning/pull/4907)) -- Fixed `PYTHONPATH` for ddp test model ([#4528](https://github.com/PyTorchLightning/pytorch-lightning/pull/4528)) -- Fixed allowing logger to support indexing ([#4595](https://github.com/PyTorchLightning/pytorch-lightning/pull/4595)) -- Fixed DDP and manual_optimization ([#4976](https://github.com/PyTorchLightning/pytorch-lightning/pull/4976)) +- Added feature to move tensors to CPU before saving ([#4309](https://github.com/Lightning-AI/lightning/pull/4309)) +- Fixed `LoggerConnector` to have logged metrics on root device in DP ([#4138](https://github.com/Lightning-AI/lightning/pull/4138)) +- Auto convert tensors to contiguous format when `gather_all` ([#4907](https://github.com/Lightning-AI/lightning/pull/4907)) +- Fixed `PYTHONPATH` for ddp test model ([#4528](https://github.com/Lightning-AI/lightning/pull/4528)) +- Fixed allowing logger to support indexing ([#4595](https://github.com/Lightning-AI/lightning/pull/4595)) +- Fixed DDP and manual_optimization ([#4976](https://github.com/Lightning-AI/lightning/pull/4976)) ## [1.0.8] - 2020-11-24 ### Added -- Added casting to python types for numpy scalars when logging `hparams` ([#4647](https://github.com/PyTorchLightning/pytorch-lightning/pull/4647)) -- Added warning when progress bar refresh rate is less than 20 on Google Colab to prevent crashing ([#4654](https://github.com/PyTorchLightning/pytorch-lightning/pull/4654)) -- Added `F1` class metric ([#4656](https://github.com/PyTorchLightning/pytorch-lightning/pull/4656)) +- Added 
casting to python types for numpy scalars when logging `hparams` ([#4647](https://github.com/Lightning-AI/lightning/pull/4647)) +- Added warning when progress bar refresh rate is less than 20 on Google Colab to prevent crashing ([#4654](https://github.com/Lightning-AI/lightning/pull/4654)) +- Added `F1` class metric ([#4656](https://github.com/Lightning-AI/lightning/pull/4656)) ### Changed -- Consistently use `step=trainer.global_step` in `LearningRateMonitor` independently of `logging_interval` ([#4376](https://github.com/PyTorchLightning/pytorch-lightning/pull/4376)) -- Metric states are no longer as default added to `state_dict` ([#4685](https://github.com/PyTorchLightning/pytorch-lightning/pull/4685)) -- Renamed class metric `Fbeta` >> `FBeta` ([#4656](https://github.com/PyTorchLightning/pytorch-lightning/pull/4656)) -- Model summary: add 1 decimal place ([#4745](https://github.com/PyTorchLightning/pytorch-lightning/pull/4745)) -- Do not override `PYTHONWARNINGS` ([#4700](https://github.com/PyTorchLightning/pytorch-lightning/pull/4700)) -- Changed `init_ddp_connection` moved from `DDP` to `DDPPlugin` ([#4407](https://github.com/PyTorchLightning/pytorch-lightning/pull/4407)) +- Consistently use `step=trainer.global_step` in `LearningRateMonitor` independently of `logging_interval` ([#4376](https://github.com/Lightning-AI/lightning/pull/4376)) +- Metric states are no longer as default added to `state_dict` ([#4685](https://github.com/Lightning-AI/lightning/pull/4685)) +- Renamed class metric `Fbeta` >> `FBeta` ([#4656](https://github.com/Lightning-AI/lightning/pull/4656)) +- Model summary: add 1 decimal place ([#4745](https://github.com/Lightning-AI/lightning/pull/4745)) +- Do not override `PYTHONWARNINGS` ([#4700](https://github.com/Lightning-AI/lightning/pull/4700)) +- Changed `init_ddp_connection` moved from `DDP` to `DDPPlugin` ([#4407](https://github.com/Lightning-AI/lightning/pull/4407)) ### Fixed -- Fixed checkpoint `hparams` dict casting when `omegaconf` is available ([#4770](https://github.com/PyTorchLightning/pytorch-lightning/pull/4770)) -- Fixed incomplete progress bars when total batches not divisible by refresh rate ([#4577](https://github.com/PyTorchLightning/pytorch-lightning/pull/4577)) -- Updated SSIM metric ([#4566](https://github.com/PyTorchLightning/pytorch-lightning/pull/4566)) -- Fixed batch_arg_name - add `batch_arg_name` to all calls to `_adjust_batch_size`bug ([#4812](https://github.com/PyTorchLightning/pytorch-lightning/pull/4812)) -- Fixed `torchtext` data to GPU ([#4785](https://github.com/PyTorchLightning/pytorch-lightning/pull/4785)) -- Fixed a crash bug in MLFlow logger ([#4716](https://github.com/PyTorchLightning/pytorch-lightning/pull/4716)) +- Fixed checkpoint `hparams` dict casting when `omegaconf` is available ([#4770](https://github.com/Lightning-AI/lightning/pull/4770)) +- Fixed incomplete progress bars when total batches not divisible by refresh rate ([#4577](https://github.com/Lightning-AI/lightning/pull/4577)) +- Updated SSIM metric ([#4566](https://github.com/Lightning-AI/lightning/pull/4566)) +- Fixed batch_arg_name - add `batch_arg_name` to all calls to `_adjust_batch_size`bug ([#4812](https://github.com/Lightning-AI/lightning/pull/4812)) +- Fixed `torchtext` data to GPU ([#4785](https://github.com/Lightning-AI/lightning/pull/4785)) +- Fixed a crash bug in MLFlow logger ([#4716](https://github.com/Lightning-AI/lightning/pull/4716)) ## [1.0.7] - 2020-11-17 ### Added -- Added lambda closure to `manual_optimizer_step` 
([#4618](https://github.com/PyTorchLightning/pytorch-lightning/pull/4618)) +- Added lambda closure to `manual_optimizer_step` ([#4618](https://github.com/Lightning-AI/lightning/pull/4618)) ### Changed -- Change Metrics `persistent` default mode to `False` ([#4685](https://github.com/PyTorchLightning/pytorch-lightning/pull/4685)) -- LoggerConnector log_metrics will use `total_batch_idx` instead of `global_step` when logging on `training step` ([#4738](https://github.com/PyTorchLightning/pytorch-lightning/pull/4738)) +- Change Metrics `persistent` default mode to `False` ([#4685](https://github.com/Lightning-AI/lightning/pull/4685)) +- LoggerConnector log_metrics will use `total_batch_idx` instead of `global_step` when logging on `training step` ([#4738](https://github.com/Lightning-AI/lightning/pull/4738)) ### Fixed -- Prevent crash if `sync_dist=True` on CPU ([#4626](https://github.com/PyTorchLightning/pytorch-lightning/pull/4626)) -- Fixed average pbar Metrics ([#4534](https://github.com/PyTorchLightning/pytorch-lightning/pull/4534)) -- Fixed `setup` callback hook to correctly pass the LightningModule through ([#4608](https://github.com/PyTorchLightning/pytorch-lightning/pull/4608)) -- Allowing decorate model init with saving `hparams` inside ([#4662](https://github.com/PyTorchLightning/pytorch-lightning/pull/4662)) -- Fixed `split_idx` set by `LoggerConnector` in `on_trainer_init` to `Trainer` ([#4697](https://github.com/PyTorchLightning/pytorch-lightning/pull/4697)) +- Prevent crash if `sync_dist=True` on CPU ([#4626](https://github.com/Lightning-AI/lightning/pull/4626)) +- Fixed average pbar Metrics ([#4534](https://github.com/Lightning-AI/lightning/pull/4534)) +- Fixed `setup` callback hook to correctly pass the LightningModule through ([#4608](https://github.com/Lightning-AI/lightning/pull/4608)) +- Allowing decorate model init with saving `hparams` inside ([#4662](https://github.com/Lightning-AI/lightning/pull/4662)) +- Fixed `split_idx` set by `LoggerConnector` in `on_trainer_init` to `Trainer` ([#4697](https://github.com/Lightning-AI/lightning/pull/4697)) ## [1.0.6] - 2020-11-11 ### Added -- Added metrics aggregation in Horovod and fixed early stopping ([#3775](https://github.com/PyTorchLightning/pytorch-lightning/pull/3775)) -- Added `manual_optimizer_step` which work with `AMP Native` and `accumulated_grad_batches` ([#4485](https://github.com/PyTorchLightning/pytorch-lightning/pull/4485)) -- Added `persistent(mode)` method to metrics, to enable and disable metric states being added to `state_dict` ([#4482](https://github.com/PyTorchLightning/pytorch-lightning/pull/4482)) -- Added congratulations at the end of our notebooks ([#4555](https://github.com/PyTorchLightning/pytorch-lightning/pull/4555)) -- Added parameters `move_metrics_to_cpu` in Trainer to disable gpu leak ([#4592](https://github.com/PyTorchLightning/pytorch-lightning/pull/4592)) +- Added metrics aggregation in Horovod and fixed early stopping ([#3775](https://github.com/Lightning-AI/lightning/pull/3775)) +- Added `manual_optimizer_step` which work with `AMP Native` and `accumulated_grad_batches` ([#4485](https://github.com/Lightning-AI/lightning/pull/4485)) +- Added `persistent(mode)` method to metrics, to enable and disable metric states being added to `state_dict` ([#4482](https://github.com/Lightning-AI/lightning/pull/4482)) +- Added congratulations at the end of our notebooks ([#4555](https://github.com/Lightning-AI/lightning/pull/4555)) +- Added parameters `move_metrics_to_cpu` in Trainer to disable gpu leak 
([#4592](https://github.com/Lightning-AI/lightning/pull/4592)) ### Changed -- Changed `fsspec` to tuner ([#4458](https://github.com/PyTorchLightning/pytorch-lightning/pull/4458)) -- Unify SLURM/TorchElastic under backend plugin ([#4578](https://github.com/PyTorchLightning/pytorch-lightning/pull/4578), - [#4580](https://github.com/PyTorchLightning/pytorch-lightning/pull/4580), - [#4581](https://github.com/PyTorchLightning/pytorch-lightning/pull/4581), - [#4582](https://github.com/PyTorchLightning/pytorch-lightning/pull/4582), - [#4583](https://github.com/PyTorchLightning/pytorch-lightning/pull/4583)) +- Changed `fsspec` to tuner ([#4458](https://github.com/Lightning-AI/lightning/pull/4458)) +- Unify SLURM/TorchElastic under backend plugin ([#4578](https://github.com/Lightning-AI/lightning/pull/4578), + [#4580](https://github.com/Lightning-AI/lightning/pull/4580), + [#4581](https://github.com/Lightning-AI/lightning/pull/4581), + [#4582](https://github.com/Lightning-AI/lightning/pull/4582), + [#4583](https://github.com/Lightning-AI/lightning/pull/4583)) ### Fixed -- Fixed feature-lack in `hpc_load` ([#4526](https://github.com/PyTorchLightning/pytorch-lightning/pull/4526)) -- Fixed metrics states being overridden in DDP mode ([#4482](https://github.com/PyTorchLightning/pytorch-lightning/pull/4482)) -- Fixed `lightning_getattr`, `lightning_hasattr` not finding the correct attributes in datamodule ([#4347](https://github.com/PyTorchLightning/pytorch-lightning/pull/4347)) -- Fixed automatic optimization AMP by `manual_optimization_step` ([#4485](https://github.com/PyTorchLightning/pytorch-lightning/pull/4485)) -- Replace `MisconfigurationException` with warning in `ModelCheckpoint` Callback ([#4560](https://github.com/PyTorchLightning/pytorch-lightning/pull/4560)) -- Fixed logged keys in mlflow logger ([#4412](https://github.com/PyTorchLightning/pytorch-lightning/pull/4412)) -- Fixed `is_picklable` by catching `AttributeError` ([#4508](https://github.com/PyTorchLightning/pytorch-lightning/pull/4508)) -- Fixed multi test dataloaders dict `AttributeError` error ([#4480](https://github.com/PyTorchLightning/pytorch-lightning/pull/4480)) -- Fixed show progress bar only for `progress_rank 0` on `DDP_SLURM` ([#4437](https://github.com/PyTorchLightning/pytorch-lightning/pull/4437)) +- Fixed feature-lack in `hpc_load` ([#4526](https://github.com/Lightning-AI/lightning/pull/4526)) +- Fixed metrics states being overridden in DDP mode ([#4482](https://github.com/Lightning-AI/lightning/pull/4482)) +- Fixed `lightning_getattr`, `lightning_hasattr` not finding the correct attributes in datamodule ([#4347](https://github.com/Lightning-AI/lightning/pull/4347)) +- Fixed automatic optimization AMP by `manual_optimization_step` ([#4485](https://github.com/Lightning-AI/lightning/pull/4485)) +- Replace `MisconfigurationException` with warning in `ModelCheckpoint` Callback ([#4560](https://github.com/Lightning-AI/lightning/pull/4560)) +- Fixed logged keys in mlflow logger ([#4412](https://github.com/Lightning-AI/lightning/pull/4412)) +- Fixed `is_picklable` by catching `AttributeError` ([#4508](https://github.com/Lightning-AI/lightning/pull/4508)) +- Fixed multi test dataloaders dict `AttributeError` error ([#4480](https://github.com/Lightning-AI/lightning/pull/4480)) +- Fixed show progress bar only for `progress_rank 0` on `DDP_SLURM` ([#4437](https://github.com/Lightning-AI/lightning/pull/4437)) ## [1.0.5] - 2020-11-03 ### Added -- Added PyTorch 1.7 Stable support 
([#3821](https://github.com/PyTorchLightning/pytorch-lightning/pull/3821)) -- Added timeout for `tpu_device_exists` to ensure process does not hang indefinitely ([#4340](https://github.com/PyTorchLightning/pytorch-lightning/pull/4340)) +- Added PyTorch 1.7 Stable support ([#3821](https://github.com/Lightning-AI/lightning/pull/3821)) +- Added timeout for `tpu_device_exists` to ensure process does not hang indefinitely ([#4340](https://github.com/Lightning-AI/lightning/pull/4340)) ### Changed -- W&B log in sync with `Trainer` step ([#4405](https://github.com/PyTorchLightning/pytorch-lightning/pull/4405)) -- Hook `on_after_backward` is called only when `optimizer_step` is being called ([#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439)) -- Moved `track_and_norm_grad` into `training loop` and called only when `optimizer_step` is being called ([#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439)) -- Changed type checker with explicit cast of `ref_model` object ([#4457](https://github.com/PyTorchLightning/pytorch-lightning/pull/4457)) -- Changed `distributed_backend` -> `accelerator` ([#4429](https://github.com/PyTorchLightning/pytorch-lightning/pull/4429)) +- W&B log in sync with `Trainer` step ([#4405](https://github.com/Lightning-AI/lightning/pull/4405)) +- Hook `on_after_backward` is called only when `optimizer_step` is being called ([#4439](https://github.com/Lightning-AI/lightning/pull/4439)) +- Moved `track_and_norm_grad` into `training loop` and called only when `optimizer_step` is being called ([#4439](https://github.com/Lightning-AI/lightning/pull/4439)) +- Changed type checker with explicit cast of `ref_model` object ([#4457](https://github.com/Lightning-AI/lightning/pull/4457)) +- Changed `distributed_backend` -> `accelerator` ([#4429](https://github.com/Lightning-AI/lightning/pull/4429)) ### Deprecated -- Deprecated passing `ModelCheckpoint` instance to `checkpoint_callback` Trainer argument ([#4336](https://github.com/PyTorchLightning/pytorch-lightning/pull/4336)) +- Deprecated passing `ModelCheckpoint` instance to `checkpoint_callback` Trainer argument ([#4336](https://github.com/Lightning-AI/lightning/pull/4336)) ### Fixed -- Disable saving checkpoints if not trained ([#4372](https://github.com/PyTorchLightning/pytorch-lightning/pull/4372)) -- Fixed error using `auto_select_gpus=True` with `gpus=-1` ([#4209](https://github.com/PyTorchLightning/pytorch-lightning/pull/4209)) -- Disabled training when `limit_train_batches=0` ([#4371](https://github.com/PyTorchLightning/pytorch-lightning/pull/4371)) -- Fixed that metrics do not store computational graph for all seen data ([#4313](https://github.com/PyTorchLightning/pytorch-lightning/pull/4313)) -- Fixed AMP unscale for `on_after_backward` ([#4439](https://github.com/PyTorchLightning/pytorch-lightning/pull/4439)) -- Fixed TorchScript export when module includes Metrics ([#4428](https://github.com/PyTorchLightning/pytorch-lightning/pull/4428)) -- Fixed TorchScript trace method's data to device and docstring ([#4360](https://github.com/PyTorchLightning/pytorch-lightning/pull/4360)) -- Fixed CSV logger warning ([#4419](https://github.com/PyTorchLightning/pytorch-lightning/pull/4419)) -- Fixed skip DDP parameter sync ([#4301](https://github.com/PyTorchLightning/pytorch-lightning/pull/4301)) -- Fixed `WandbLogger` _sanitize_callable function ([#4422](https://github.com/PyTorchLightning/pytorch-lightning/pull/4422)) -- Fixed `AMP Native` `_unscale` gradient 
([#4441](https://github.com/PyTorchLightning/pytorch-lightning/pull/4441)) +- Disable saving checkpoints if not trained ([#4372](https://github.com/Lightning-AI/lightning/pull/4372)) +- Fixed error using `auto_select_gpus=True` with `gpus=-1` ([#4209](https://github.com/Lightning-AI/lightning/pull/4209)) +- Disabled training when `limit_train_batches=0` ([#4371](https://github.com/Lightning-AI/lightning/pull/4371)) +- Fixed that metrics do not store computational graph for all seen data ([#4313](https://github.com/Lightning-AI/lightning/pull/4313)) +- Fixed AMP unscale for `on_after_backward` ([#4439](https://github.com/Lightning-AI/lightning/pull/4439)) +- Fixed TorchScript export when module includes Metrics ([#4428](https://github.com/Lightning-AI/lightning/pull/4428)) +- Fixed TorchScript trace method's data to device and docstring ([#4360](https://github.com/Lightning-AI/lightning/pull/4360)) +- Fixed CSV logger warning ([#4419](https://github.com/Lightning-AI/lightning/pull/4419)) +- Fixed skip DDP parameter sync ([#4301](https://github.com/Lightning-AI/lightning/pull/4301)) +- Fixed `WandbLogger` _sanitize_callable function ([#4422](https://github.com/Lightning-AI/lightning/pull/4422)) +- Fixed `AMP Native` `_unscale` gradient ([#4441](https://github.com/Lightning-AI/lightning/pull/4441)) ## [1.0.4] - 2020-10-27 ### Added -- Added `dirpath` and `filename` parameter in `ModelCheckpoint` ([#4213](https://github.com/PyTorchLightning/pytorch-lightning/pull/4213)) -- Added plugins docs and DDPPlugin to customize ddp across all accelerators ([#4258](https://github.com/PyTorchLightning/pytorch-lightning/pull/4285)) -- Added `strict` option to the scheduler dictionary ([#3586](https://github.com/PyTorchLightning/pytorch-lightning/pull/3586)) -- Added `fsspec` support for profilers ([#4162](https://github.com/PyTorchLightning/pytorch-lightning/pull/4162)) -- Added autogenerated helptext to `Trainer.add_argparse_args` ([#4344](https://github.com/PyTorchLightning/pytorch-lightning/pull/4344)) -- Added support for string values in `Trainer`'s `profiler` parameter ([#3656](https://github.com/PyTorchLightning/pytorch-lightning/pull/3656)) -- Added `optimizer_closure` to `optimizer.step` when supported ([#4190](https://github.com/PyTorchLightning/pytorch-lightning/pull/4190)) -- Added unification of regression metrics ([#4166](https://github.com/PyTorchLightning/pytorch-lightning/pull/4166)) -- Added checkpoint load from Bytes ([#4314](https://github.com/PyTorchLightning/pytorch-lightning/pull/4314)) +- Added `dirpath` and `filename` parameter in `ModelCheckpoint` ([#4213](https://github.com/Lightning-AI/lightning/pull/4213)) +- Added plugins docs and DDPPlugin to customize ddp across all accelerators ([#4258](https://github.com/Lightning-AI/lightning/pull/4285)) +- Added `strict` option to the scheduler dictionary ([#3586](https://github.com/Lightning-AI/lightning/pull/3586)) +- Added `fsspec` support for profilers ([#4162](https://github.com/Lightning-AI/lightning/pull/4162)) +- Added autogenerated helptext to `Trainer.add_argparse_args` ([#4344](https://github.com/Lightning-AI/lightning/pull/4344)) +- Added support for string values in `Trainer`'s `profiler` parameter ([#3656](https://github.com/Lightning-AI/lightning/pull/3656)) +- Added `optimizer_closure` to `optimizer.step` when supported ([#4190](https://github.com/Lightning-AI/lightning/pull/4190)) +- Added unification of regression metrics ([#4166](https://github.com/Lightning-AI/lightning/pull/4166)) +- Added checkpoint load from Bytes 
([#4314](https://github.com/Lightning-AI/lightning/pull/4314)) ### Changed -- Improved error messages for invalid `configure_optimizers` returns ([#3587](https://github.com/PyTorchLightning/pytorch-lightning/pull/3587)) -- Allow changing the logged step value in `validation_step` ([#4130](https://github.com/PyTorchLightning/pytorch-lightning/pull/4130)) -- Allow setting `replace_sampler_ddp=True` with a distributed sampler already added ([#4273](https://github.com/PyTorchLightning/pytorch-lightning/pull/4273)) -- Fixed sanitized parameters for `WandbLogger.log_hyperparams` ([#4320](https://github.com/PyTorchLightning/pytorch-lightning/pull/4320)) +- Improved error messages for invalid `configure_optimizers` returns ([#3587](https://github.com/Lightning-AI/lightning/pull/3587)) +- Allow changing the logged step value in `validation_step` ([#4130](https://github.com/Lightning-AI/lightning/pull/4130)) +- Allow setting `replace_sampler_ddp=True` with a distributed sampler already added ([#4273](https://github.com/Lightning-AI/lightning/pull/4273)) +- Fixed sanitized parameters for `WandbLogger.log_hyperparams` ([#4320](https://github.com/Lightning-AI/lightning/pull/4320)) ### Deprecated -- Deprecated `filepath` in `ModelCheckpoint` ([#4213](https://github.com/PyTorchLightning/pytorch-lightning/pull/4213)) -- Deprecated `reorder` parameter of the `auc` metric ([#4237](https://github.com/PyTorchLightning/pytorch-lightning/pull/4237)) -- Deprecated bool values in `Trainer`'s `profiler` parameter ([#3656](https://github.com/PyTorchLightning/pytorch-lightning/pull/3656)) +- Deprecated `filepath` in `ModelCheckpoint` ([#4213](https://github.com/Lightning-AI/lightning/pull/4213)) +- Deprecated `reorder` parameter of the `auc` metric ([#4237](https://github.com/Lightning-AI/lightning/pull/4237)) +- Deprecated bool values in `Trainer`'s `profiler` parameter ([#3656](https://github.com/Lightning-AI/lightning/pull/3656)) ### Fixed -- Fixed setting device ids in DDP ([#4297](https://github.com/PyTorchLightning/pytorch-lightning/pull/4297)) -- Fixed synchronization of best model path in `ddp_accelerator` ([#4323](https://github.com/PyTorchLightning/pytorch-lightning/pull/4323)) -- Fixed `WandbLogger` not uploading checkpoint artifacts at the end of training ([#4341](https://github.com/PyTorchLightning/pytorch-lightning/pull/4341)) -- Fixed `FBeta` computation ([#4183](https://github.com/PyTorchLightning/pytorch-lightning/pull/4183)) -- Fixed `accumulation across batches` has completed `before breaking training loop` ([#4278](https://github.com/PyTorchLightning/pytorch-lightning/pull/4278)) -- Fixed `ModelCheckpoint` don't increase current_epoch and global_step when not training ([#4291](https://github.com/PyTorchLightning/pytorch-lightning/pull/4291)) -- Fixed `COMET_EXPERIMENT_KEY` environment variable usage in comet logger ([#4230](https://github.com/PyTorchLightning/pytorch-lightning/pull/4230)) +- Fixed setting device ids in DDP ([#4297](https://github.com/Lightning-AI/lightning/pull/4297)) +- Fixed synchronization of best model path in `ddp_accelerator` ([#4323](https://github.com/Lightning-AI/lightning/pull/4323)) +- Fixed `WandbLogger` not uploading checkpoint artifacts at the end of training ([#4341](https://github.com/Lightning-AI/lightning/pull/4341)) +- Fixed `FBeta` computation ([#4183](https://github.com/Lightning-AI/lightning/pull/4183)) +- Fixed `accumulation across batches` has completed `before breaking training loop` ([#4278](https://github.com/Lightning-AI/lightning/pull/4278)) +- Fixed 
`ModelCheckpoint` don't increase current_epoch and global_step when not training ([#4291](https://github.com/Lightning-AI/lightning/pull/4291)) +- Fixed `COMET_EXPERIMENT_KEY` environment variable usage in comet logger ([#4230](https://github.com/Lightning-AI/lightning/pull/4230)) ## [1.0.3] - 2020-10-20 ### Added -- Added persistent flag to `Metric.add_state` ([#4195](https://github.com/PyTorchLightning/pytorch-lightning/pull/4195)) +- Added persistent flag to `Metric.add_state` ([#4195](https://github.com/Lightning-AI/lightning/pull/4195)) ### Changed -- Used `checkpoint_connector.hpc_save` in SLURM ([#4217](https://github.com/PyTorchLightning/pytorch-lightning/pull/4217)) -- Moved base req. to root ([#4219](https://github.com/PyTorchLightning/pytorch-lightning/pull/4219)) +- Used `checkpoint_connector.hpc_save` in SLURM ([#4217](https://github.com/Lightning-AI/lightning/pull/4217)) +- Moved base req. to root ([#4219](https://github.com/Lightning-AI/lightning/pull/4219)) ### Fixed -- Fixed `hparams` assign in init ([#4189](https://github.com/PyTorchLightning/pytorch-lightning/pull/4189)) -- Fixed overwrite check for model hooks ([#4010](https://github.com/PyTorchLightning/pytorch-lightning/pull/4010)) +- Fixed `hparams` assign in init ([#4189](https://github.com/Lightning-AI/lightning/pull/4189)) +- Fixed overwrite check for model hooks ([#4010](https://github.com/Lightning-AI/lightning/pull/4010)) ## [1.0.2] - 2020-10-15 ### Added -- Added trace functionality to the function `to_torchscript` ([#4142](https://github.com/PyTorchLightning/pytorch-lightning/pull/4142)) +- Added trace functionality to the function `to_torchscript` ([#4142](https://github.com/Lightning-AI/lightning/pull/4142)) ### Changed -- Called `on_load_checkpoint` before loading `state_dict` ([#4057](https://github.com/PyTorchLightning/pytorch-lightning/pull/4057)) +- Called `on_load_checkpoint` before loading `state_dict` ([#4057](https://github.com/Lightning-AI/lightning/pull/4057)) ### Removed -- Removed duplicate metric vs step log for train loop ([#4173](https://github.com/PyTorchLightning/pytorch-lightning/pull/4173)) +- Removed duplicate metric vs step log for train loop ([#4173](https://github.com/Lightning-AI/lightning/pull/4173)) ### Fixed -- Fixed the `self.log` problem in `validation_step()` ([#4169](https://github.com/PyTorchLightning/pytorch-lightning/pull/4169)) -- Fixed `hparams` saving - save the state when `save_hyperparameters()` is called [in `__init__`] ([#4163](https://github.com/PyTorchLightning/pytorch-lightning/pull/4163)) -- Fixed runtime failure while exporting `hparams` to yaml ([#4158](https://github.com/PyTorchLightning/pytorch-lightning/pull/4158)) +- Fixed the `self.log` problem in `validation_step()` ([#4169](https://github.com/Lightning-AI/lightning/pull/4169)) +- Fixed `hparams` saving - save the state when `save_hyperparameters()` is called [in `__init__`] ([#4163](https://github.com/Lightning-AI/lightning/pull/4163)) +- Fixed runtime failure while exporting `hparams` to yaml ([#4158](https://github.com/Lightning-AI/lightning/pull/4158)) ## [1.0.1] - 2020-10-14 ### Added -- Added getstate/setstate method for torch.save serialization ([#4127](https://github.com/PyTorchLightning/pytorch-lightning/pull/4127)) +- Added getstate/setstate method for torch.save serialization ([#4127](https://github.com/Lightning-AI/lightning/pull/4127)) ## [1.0.0] - 2020-10-13 ### Added -- Added Explained Variance Metric + metric fix ([#4013](https://github.com/PyTorchLightning/pytorch-lightning/pull/4013)) 
-- Added Metric <-> Lightning Module integration tests ([#4008](https://github.com/PyTorchLightning/pytorch-lightning/pull/4008)) -- Added parsing OS env vars in `Trainer` ([#4022](https://github.com/PyTorchLightning/pytorch-lightning/pull/4022)) -- Added classification metrics ([#4043](https://github.com/PyTorchLightning/pytorch-lightning/pull/4043)) -- Updated explained variance metric ([#4024](https://github.com/PyTorchLightning/pytorch-lightning/pull/4024)) -- Enabled plugins ([#4041](https://github.com/PyTorchLightning/pytorch-lightning/pull/4041)) -- Enabled custom clusters ([#4048](https://github.com/PyTorchLightning/pytorch-lightning/pull/4048)) -- Enabled passing in custom accelerators ([#4050](https://github.com/PyTorchLightning/pytorch-lightning/pull/4050)) -- Added `LightningModule.toggle_optimizer` ([#4058](https://github.com/PyTorchLightning/pytorch-lightning/pull/4058)) -- Added `LightningModule.manual_backward` ([#4063](https://github.com/PyTorchLightning/pytorch-lightning/pull/4063)) -- Added `output` argument to `*_batch_end` hooks ([#3965](https://github.com/PyTorchLightning/pytorch-lightning/pull/3965), - [#3966](https://github.com/PyTorchLightning/pytorch-lightning/pull/3966)) -- Added `output` argument to `*_epoch_end` hooks ([#3967](https://github.com/PyTorchLightning/pytorch-lightning/pull/3967)) +- Added Explained Variance Metric + metric fix ([#4013](https://github.com/Lightning-AI/lightning/pull/4013)) +- Added Metric <-> Lightning Module integration tests ([#4008](https://github.com/Lightning-AI/lightning/pull/4008)) +- Added parsing OS env vars in `Trainer` ([#4022](https://github.com/Lightning-AI/lightning/pull/4022)) +- Added classification metrics ([#4043](https://github.com/Lightning-AI/lightning/pull/4043)) +- Updated explained variance metric ([#4024](https://github.com/Lightning-AI/lightning/pull/4024)) +- Enabled plugins ([#4041](https://github.com/Lightning-AI/lightning/pull/4041)) +- Enabled custom clusters ([#4048](https://github.com/Lightning-AI/lightning/pull/4048)) +- Enabled passing in custom accelerators ([#4050](https://github.com/Lightning-AI/lightning/pull/4050)) +- Added `LightningModule.toggle_optimizer` ([#4058](https://github.com/Lightning-AI/lightning/pull/4058)) +- Added `LightningModule.manual_backward` ([#4063](https://github.com/Lightning-AI/lightning/pull/4063)) +- Added `output` argument to `*_batch_end` hooks ([#3965](https://github.com/Lightning-AI/lightning/pull/3965), + [#3966](https://github.com/Lightning-AI/lightning/pull/3966)) +- Added `output` argument to `*_epoch_end` hooks ([#3967](https://github.com/Lightning-AI/lightning/pull/3967)) ### Changed -- Integrated metrics API with self.log ([#3961](https://github.com/PyTorchLightning/pytorch-lightning/pull/3961)) -- Decoupled Apex ([#4052](https://github.com/PyTorchLightning/pytorch-lightning/pull/4052), - [#4054](https://github.com/PyTorchLightning/pytorch-lightning/pull/4054), - [#4055](https://github.com/PyTorchLightning/pytorch-lightning/pull/4055), - [#4056](https://github.com/PyTorchLightning/pytorch-lightning/pull/4056), - [#4058](https://github.com/PyTorchLightning/pytorch-lightning/pull/4058), - [#4060](https://github.com/PyTorchLightning/pytorch-lightning/pull/4060), - [#4061](https://github.com/PyTorchLightning/pytorch-lightning/pull/4061), - [#4062](https://github.com/PyTorchLightning/pytorch-lightning/pull/4062), - [#4063](https://github.com/PyTorchLightning/pytorch-lightning/pull/4063), - [#4064](https://github.com/PyTorchLightning/pytorch-lightning/pull/4064), 
- [#4065](https://github.com/PyTorchLightning/pytorch-lightning/pull/4065)) -- Renamed all backends to `Accelerator` ([#4066](https://github.com/PyTorchLightning/pytorch-lightning/pull/4066)) -- Enabled manual returns ([#4089](https://github.com/PyTorchLightning/pytorch-lightning/pull/4089)) +- Integrated metrics API with self.log ([#3961](https://github.com/Lightning-AI/lightning/pull/3961)) +- Decoupled Apex ([#4052](https://github.com/Lightning-AI/lightning/pull/4052), + [#4054](https://github.com/Lightning-AI/lightning/pull/4054), + [#4055](https://github.com/Lightning-AI/lightning/pull/4055), + [#4056](https://github.com/Lightning-AI/lightning/pull/4056), + [#4058](https://github.com/Lightning-AI/lightning/pull/4058), + [#4060](https://github.com/Lightning-AI/lightning/pull/4060), + [#4061](https://github.com/Lightning-AI/lightning/pull/4061), + [#4062](https://github.com/Lightning-AI/lightning/pull/4062), + [#4063](https://github.com/Lightning-AI/lightning/pull/4063), + [#4064](https://github.com/Lightning-AI/lightning/pull/4064), + [#4065](https://github.com/Lightning-AI/lightning/pull/4065)) +- Renamed all backends to `Accelerator` ([#4066](https://github.com/Lightning-AI/lightning/pull/4066)) +- Enabled manual returns ([#4089](https://github.com/Lightning-AI/lightning/pull/4089)) ### Removed -- Removed support for EvalResult and TrainResult ([#3968](https://github.com/PyTorchLightning/pytorch-lightning/pull/3968)) -- Removed deprecated trainer flags: `overfit_pct`, `log_save_interval`, `row_log_interval` ([#3969](https://github.com/PyTorchLightning/pytorch-lightning/pull/3969)) -- Removed deprecated early_stop_callback ([#3982](https://github.com/PyTorchLightning/pytorch-lightning/pull/3982)) -- Removed deprecated model hooks ([#3980](https://github.com/PyTorchLightning/pytorch-lightning/pull/3980)) -- Removed deprecated callbacks ([#3979](https://github.com/PyTorchLightning/pytorch-lightning/pull/3979)) -- Removed `trainer` argument in `LightningModule.backward` [#4056](https://github.com/PyTorchLightning/pytorch-lightning/pull/4056)) +- Removed support for EvalResult and TrainResult ([#3968](https://github.com/Lightning-AI/lightning/pull/3968)) +- Removed deprecated trainer flags: `overfit_pct`, `log_save_interval`, `row_log_interval` ([#3969](https://github.com/Lightning-AI/lightning/pull/3969)) +- Removed deprecated early_stop_callback ([#3982](https://github.com/Lightning-AI/lightning/pull/3982)) +- Removed deprecated model hooks ([#3980](https://github.com/Lightning-AI/lightning/pull/3980)) +- Removed deprecated callbacks ([#3979](https://github.com/Lightning-AI/lightning/pull/3979)) +- Removed `trainer` argument in `LightningModule.backward` [#4056](https://github.com/Lightning-AI/lightning/pull/4056)) ### Fixed -- Fixed `current_epoch` property update to reflect true epoch number inside `LightningDataModule`, when `reload_dataloaders_every_epoch=True`. ([#3974](https://github.com/PyTorchLightning/pytorch-lightning/pull/3974)) -- Fixed to print scaler value in progress bar ([#4053](https://github.com/PyTorchLightning/pytorch-lightning/pull/4053)) -- Fixed mismatch between docstring and code regarding when `on_load_checkpoint` hook is called ([#3996](https://github.com/PyTorchLightning/pytorch-lightning/pull/3996)) +- Fixed `current_epoch` property update to reflect true epoch number inside `LightningDataModule`, when `reload_dataloaders_every_epoch=True`. 
([#3974](https://github.com/Lightning-AI/lightning/pull/3974)) +- Fixed to print scaler value in progress bar ([#4053](https://github.com/Lightning-AI/lightning/pull/4053)) +- Fixed mismatch between docstring and code regarding when `on_load_checkpoint` hook is called ([#3996](https://github.com/Lightning-AI/lightning/pull/3996)) ## [0.10.0] - 2020-10-07 ### Added -- Added new Metrics API. ([#3868](https://github.com/PyTorchLightning/pytorch-lightning/pull/3868), [#3921](https://github.com/PyTorchLightning/pytorch-lightning/pull/3921)) -- Enable PyTorch 1.7 compatibility ([#3541](https://github.com/PyTorchLightning/pytorch-lightning/pull/3541)) -- Added `LightningModule.to_torchscript` to support exporting as `ScriptModule` ([#3258](https://github.com/PyTorchLightning/pytorch-lightning/pull/3258)) -- Added warning when dropping unpicklable `hparams` ([#2874](https://github.com/PyTorchLightning/pytorch-lightning/pull/2874)) -- Added EMB similarity ([#3349](https://github.com/PyTorchLightning/pytorch-lightning/pull/3349)) -- Added `ModelCheckpoint.to_yaml` method ([#3048](https://github.com/PyTorchLightning/pytorch-lightning/pull/3048)) -- Allow `ModelCheckpoint` monitor to be `None`, meaning it will always save ([#3630](https://github.com/PyTorchLightning/pytorch-lightning/pull/3630)) -- Disabled optimizers setup during testing ([#3059](https://github.com/PyTorchLightning/pytorch-lightning/pull/3059)) -- Added support for datamodules to save and load checkpoints when training ([#3563](https://github.com/PyTorchLightning/pytorch-lightning/pull/3563)) -- Added support for datamodule in learning rate finder ([#3425](https://github.com/PyTorchLightning/pytorch-lightning/pull/3425)) -- Added gradient clip test for native AMP ([#3754](https://github.com/PyTorchLightning/pytorch-lightning/pull/3754)) -- Added dist lib to enable syncing anything across devices ([#3762](https://github.com/PyTorchLightning/pytorch-lightning/pull/3762)) -- Added `broadcast` to `TPUBackend` ([#3814](https://github.com/PyTorchLightning/pytorch-lightning/pull/3814)) -- Added `XLADeviceUtils` class to check XLA device type ([#3274](https://github.com/PyTorchLightning/pytorch-lightning/pull/3274)) +- Added new Metrics API. 
([#3868](https://github.com/Lightning-AI/lightning/pull/3868), [#3921](https://github.com/Lightning-AI/lightning/pull/3921)) +- Enable PyTorch 1.7 compatibility ([#3541](https://github.com/Lightning-AI/lightning/pull/3541)) +- Added `LightningModule.to_torchscript` to support exporting as `ScriptModule` ([#3258](https://github.com/Lightning-AI/lightning/pull/3258)) +- Added warning when dropping unpicklable `hparams` ([#2874](https://github.com/Lightning-AI/lightning/pull/2874)) +- Added EMB similarity ([#3349](https://github.com/Lightning-AI/lightning/pull/3349)) +- Added `ModelCheckpoint.to_yaml` method ([#3048](https://github.com/Lightning-AI/lightning/pull/3048)) +- Allow `ModelCheckpoint` monitor to be `None`, meaning it will always save ([#3630](https://github.com/Lightning-AI/lightning/pull/3630)) +- Disabled optimizers setup during testing ([#3059](https://github.com/Lightning-AI/lightning/pull/3059)) +- Added support for datamodules to save and load checkpoints when training ([#3563](https://github.com/Lightning-AI/lightning/pull/3563)) +- Added support for datamodule in learning rate finder ([#3425](https://github.com/Lightning-AI/lightning/pull/3425)) +- Added gradient clip test for native AMP ([#3754](https://github.com/Lightning-AI/lightning/pull/3754)) +- Added dist lib to enable syncing anything across devices ([#3762](https://github.com/Lightning-AI/lightning/pull/3762)) +- Added `broadcast` to `TPUBackend` ([#3814](https://github.com/Lightning-AI/lightning/pull/3814)) +- Added `XLADeviceUtils` class to check XLA device type ([#3274](https://github.com/Lightning-AI/lightning/pull/3274)) ### Changed - Refactored accelerator backends: - * moved TPU `xxx_step` to backend ([#3118](https://github.com/PyTorchLightning/pytorch-lightning/pull/3118)) - * refactored DDP backend `forward` ([#3119](https://github.com/PyTorchLightning/pytorch-lightning/pull/3119)) - * refactored GPU backend `__step` ([#3120](https://github.com/PyTorchLightning/pytorch-lightning/pull/3120)) - * refactored Horovod backend ([#3121](https://github.com/PyTorchLightning/pytorch-lightning/pull/3121), - [#3122](https://github.com/PyTorchLightning/pytorch-lightning/pull/3122)) - * remove obscure forward call in eval + CPU backend `___step` ([#3123](https://github.com/PyTorchLightning/pytorch-lightning/pull/3123)) - * reduced all simplified forward ([#3126](https://github.com/PyTorchLightning/pytorch-lightning/pull/3126)) - * added hook base method ([#3127](https://github.com/PyTorchLightning/pytorch-lightning/pull/3127)) - * refactor eval loop to use hooks - use `test_mode` for if so we can split later ([#3129](https://github.com/PyTorchLightning/pytorch-lightning/pull/3129)) - * moved `___step_end` hooks ([#3130](https://github.com/PyTorchLightning/pytorch-lightning/pull/3130)) - * training forward refactor ([#3134](https://github.com/PyTorchLightning/pytorch-lightning/pull/3134)) - * training AMP scaling refactor ([#3135](https://github.com/PyTorchLightning/pytorch-lightning/pull/3135)) - * eval step scaling factor ([#3136](https://github.com/PyTorchLightning/pytorch-lightning/pull/3136)) - * add eval loop object to streamline eval loop ([#3138](https://github.com/PyTorchLightning/pytorch-lightning/pull/3138)) - * refactored dataloader process hook ([#3139](https://github.com/PyTorchLightning/pytorch-lightning/pull/3139)) - * refactored inner eval loop ([#3141](https://github.com/PyTorchLightning/pytorch-lightning/pull/3141)) - * final inner eval loop hooks 
([#3154](https://github.com/PyTorchLightning/pytorch-lightning/pull/3154)) - * clean up hooks in `run_evaluation` ([#3156](https://github.com/PyTorchLightning/pytorch-lightning/pull/3156)) - * clean up data reset ([#3161](https://github.com/PyTorchLightning/pytorch-lightning/pull/3161)) - * expand eval loop out ([#3165](https://github.com/PyTorchLightning/pytorch-lightning/pull/3165)) - * moved hooks around in eval loop ([#3195](https://github.com/PyTorchLightning/pytorch-lightning/pull/3195)) - * remove `_evaluate` fx ([#3197](https://github.com/PyTorchLightning/pytorch-lightning/pull/3197)) - * `Trainer.fit` hook clean up ([#3198](https://github.com/PyTorchLightning/pytorch-lightning/pull/3198)) - * DDPs train hooks ([#3203](https://github.com/PyTorchLightning/pytorch-lightning/pull/3203)) - * refactor DDP backend ([#3204](https://github.com/PyTorchLightning/pytorch-lightning/pull/3204), - [#3207](https://github.com/PyTorchLightning/pytorch-lightning/pull/3207), - [#3208](https://github.com/PyTorchLightning/pytorch-lightning/pull/3208), - [#3209](https://github.com/PyTorchLightning/pytorch-lightning/pull/3209), - [#3210](https://github.com/PyTorchLightning/pytorch-lightning/pull/3210)) - * reduced accelerator selection ([#3211](https://github.com/PyTorchLightning/pytorch-lightning/pull/3211)) - * group prepare data hook ([#3212](https://github.com/PyTorchLightning/pytorch-lightning/pull/3212)) - * added data connector ([#3285](https://github.com/PyTorchLightning/pytorch-lightning/pull/3285)) - * modular is_overridden ([#3290](https://github.com/PyTorchLightning/pytorch-lightning/pull/3290)) - * adding `Trainer.tune()` ([#3293](https://github.com/PyTorchLightning/pytorch-lightning/pull/3293)) - * move `run_pretrain_routine` -> `setup_training` ([#3294](https://github.com/PyTorchLightning/pytorch-lightning/pull/3294)) - * move train outside of setup training ([#3297](https://github.com/PyTorchLightning/pytorch-lightning/pull/3297)) - * move `prepare_data` to data connector ([#3307](https://github.com/PyTorchLightning/pytorch-lightning/pull/3307)) - * moved accelerator router ([#3309](https://github.com/PyTorchLightning/pytorch-lightning/pull/3309)) - * train loop refactor - moving train loop to own object ([#3310](https://github.com/PyTorchLightning/pytorch-lightning/pull/3310), - [#3312](https://github.com/PyTorchLightning/pytorch-lightning/pull/3312), - [#3313](https://github.com/PyTorchLightning/pytorch-lightning/pull/3313), - [#3314](https://github.com/PyTorchLightning/pytorch-lightning/pull/3314)) - * duplicate data interface definition up into DataHooks class ([#3344](https://github.com/PyTorchLightning/pytorch-lightning/pull/3344)) - * inner train loop ([#3359](https://github.com/PyTorchLightning/pytorch-lightning/pull/3359), - [#3361](https://github.com/PyTorchLightning/pytorch-lightning/pull/3361), - [#3362](https://github.com/PyTorchLightning/pytorch-lightning/pull/3362), - [#3363](https://github.com/PyTorchLightning/pytorch-lightning/pull/3363), - [#3365](https://github.com/PyTorchLightning/pytorch-lightning/pull/3365), - [#3366](https://github.com/PyTorchLightning/pytorch-lightning/pull/3366), - [#3367](https://github.com/PyTorchLightning/pytorch-lightning/pull/3367), - [#3368](https://github.com/PyTorchLightning/pytorch-lightning/pull/3368), - [#3369](https://github.com/PyTorchLightning/pytorch-lightning/pull/3369), - [#3370](https://github.com/PyTorchLightning/pytorch-lightning/pull/3370), - [#3371](https://github.com/PyTorchLightning/pytorch-lightning/pull/3371), - 
[#3372](https://github.com/PyTorchLightning/pytorch-lightning/pull/3372), - [#3373](https://github.com/PyTorchLightning/pytorch-lightning/pull/3373), - [#3374](https://github.com/PyTorchLightning/pytorch-lightning/pull/3374), - [#3375](https://github.com/PyTorchLightning/pytorch-lightning/pull/3375), - [#3376](https://github.com/PyTorchLightning/pytorch-lightning/pull/3376), - [#3385](https://github.com/PyTorchLightning/pytorch-lightning/pull/3385), - [#3388](https://github.com/PyTorchLightning/pytorch-lightning/pull/3388), - [#3397](https://github.com/PyTorchLightning/pytorch-lightning/pull/3397)) - * all logging related calls in a connector ([#3395](https://github.com/PyTorchLightning/pytorch-lightning/pull/3395)) - * device parser ([#3400](https://github.com/PyTorchLightning/pytorch-lightning/pull/3400), - [#3405](https://github.com/PyTorchLightning/pytorch-lightning/pull/3405)) - * added model connector ([#3407](https://github.com/PyTorchLightning/pytorch-lightning/pull/3407)) - * moved eval loop logging to loggers ([#3408](https://github.com/PyTorchLightning/pytorch-lightning/pull/3408)) - * moved eval loop (#3412[#3408](https://github.com/PyTorchLightning/pytorch-lightning/pull/3408)) - * trainer/separate argparse ([#3421](https://github.com/PyTorchLightning/pytorch-lightning/pull/3421), - [#3428](https://github.com/PyTorchLightning/pytorch-lightning/pull/3428), - [#3432](https://github.com/PyTorchLightning/pytorch-lightning/pull/3432)) - * move `lr_finder` ([#3434](https://github.com/PyTorchLightning/pytorch-lightning/pull/3434)) - * organize args (#[#3435](https://github.com/PyTorchLightning/pytorch-lightning/pull/3435), - [#3442](https://github.com/PyTorchLightning/pytorch-lightning/pull/3442), - [#3447](https://github.com/PyTorchLightning/pytorch-lightning/pull/3447), - [#3448](https://github.com/PyTorchLightning/pytorch-lightning/pull/3448), - [#3449](https://github.com/PyTorchLightning/pytorch-lightning/pull/3449), - [#3456](https://github.com/PyTorchLightning/pytorch-lightning/pull/3456)) - * move specific accelerator code ([#3457](https://github.com/PyTorchLightning/pytorch-lightning/pull/3457)) - * group connectors ([#3472](https://github.com/PyTorchLightning/pytorch-lightning/pull/3472)) - * accelerator connector methods x/n ([#3469](https://github.com/PyTorchLightning/pytorch-lightning/pull/3469), - [#3470](https://github.com/PyTorchLightning/pytorch-lightning/pull/3470), - [#3474](https://github.com/PyTorchLightning/pytorch-lightning/pull/3474)) - * merge backends x/n ([#3476](https://github.com/PyTorchLightning/pytorch-lightning/pull/3476), - [#3477](https://github.com/PyTorchLightning/pytorch-lightning/pull/3477), - [#3478](https://github.com/PyTorchLightning/pytorch-lightning/pull/3478), - [#3480](https://github.com/PyTorchLightning/pytorch-lightning/pull/3480), - [#3482](https://github.com/PyTorchLightning/pytorch-lightning/pull/3482)) - * apex plugin ([#3502](https://github.com/PyTorchLightning/pytorch-lightning/pull/3502)) - * precision plugins ([#3504](https://github.com/PyTorchLightning/pytorch-lightning/pull/3504)) - * Result - make monitor default to `checkpoint_on` to simplify ([#3571](https://github.com/PyTorchLightning/pytorch-lightning/pull/3571)) - * reference to the Trainer on the `LightningDataModule` ([#3684](https://github.com/PyTorchLightning/pytorch-lightning/pull/3684)) - * add `.log` to lightning module ([#3686](https://github.com/PyTorchLightning/pytorch-lightning/pull/3686), - 
[#3699](https://github.com/PyTorchLightning/pytorch-lightning/pull/3699), - [#3701](https://github.com/PyTorchLightning/pytorch-lightning/pull/3701), - [#3704](https://github.com/PyTorchLightning/pytorch-lightning/pull/3704), - [#3715](https://github.com/PyTorchLightning/pytorch-lightning/pull/3715)) - * enable tracking original metric when step and epoch are both true ([#3685](https://github.com/PyTorchLightning/pytorch-lightning/pull/3685)) - * deprecated results obj, added support for simpler comms ([#3681](https://github.com/PyTorchLightning/pytorch-lightning/pull/3681)) - * move backends back to individual files ([#3712](https://github.com/PyTorchLightning/pytorch-lightning/pull/3712)) - * fixes logging for eval steps ([#3763](https://github.com/PyTorchLightning/pytorch-lightning/pull/3763)) - * decoupled DDP, DDP spawn ([#3733](https://github.com/PyTorchLightning/pytorch-lightning/pull/3733), - [#3766](https://github.com/PyTorchLightning/pytorch-lightning/pull/3766), - [#3767](https://github.com/PyTorchLightning/pytorch-lightning/pull/3767), - [#3774](https://github.com/PyTorchLightning/pytorch-lightning/pull/3774), - [#3802](https://github.com/PyTorchLightning/pytorch-lightning/pull/3802), - [#3806](https://github.com/PyTorchLightning/pytorch-lightning/pull/3806), - [#3817](https://github.com/PyTorchLightning/pytorch-lightning/pull/3817), - [#3819](https://github.com/PyTorchLightning/pytorch-lightning/pull/3819), - [#3927](https://github.com/PyTorchLightning/pytorch-lightning/pull/3927)) - * remove weight loading hack for ddp_cpu ([#3808](https://github.com/PyTorchLightning/pytorch-lightning/pull/3808)) - * separate `torchelastic` from DDP ([#3810](https://github.com/PyTorchLightning/pytorch-lightning/pull/3810)) - * separate SLURM from DDP ([#3809](https://github.com/PyTorchLightning/pytorch-lightning/pull/3809)) - * decoupled DDP2 ([#3816](https://github.com/PyTorchLightning/pytorch-lightning/pull/3816)) - * bug fix with logging val epoch end + monitor ([#3812](https://github.com/PyTorchLightning/pytorch-lightning/pull/3812)) - * callback system and init DDP ([#3836](https://github.com/PyTorchLightning/pytorch-lightning/pull/3836)) - * adding compute environments ([#3837](https://github.com/PyTorchLightning/pytorch-lightning/pull/3837), [#3842](https://github.com/PyTorchLightning/pytorch-lightning/pull/3842)) - * epoch can now log independently ([#3843](https://github.com/PyTorchLightning/pytorch-lightning/pull/3843)) - * test selecting the correct backend. 
temp backends while slurm and TorchElastic are decoupled ([#3848](https://github.com/PyTorchLightning/pytorch-lightning/pull/3848)) - * fixed `init_slurm_connection` causing hostname errors ([#3856](https://github.com/PyTorchLightning/pytorch-lightning/pull/3856)) - * moves init apex from LM to apex connector ([#3923](https://github.com/PyTorchLightning/pytorch-lightning/pull/3923)) - * moves sync bn to each backend ([#3925](https://github.com/PyTorchLightning/pytorch-lightning/pull/3925)) - * moves configure ddp to each backend ([#3924](https://github.com/PyTorchLightning/pytorch-lightning/pull/3924)) -- Deprecation warning ([#3844](https://github.com/PyTorchLightning/pytorch-lightning/pull/3844)) -- Changed `LearningRateLogger` to `LearningRateMonitor` ([#3251](https://github.com/PyTorchLightning/pytorch-lightning/pull/3251)) -- Used `fsspec` instead of `gfile` for all IO ([#3320](https://github.com/PyTorchLightning/pytorch-lightning/pull/3320)) - * Swapped `torch.load` for `fsspec` load in DDP spawn backend ([#3787](https://github.com/PyTorchLightning/pytorch-lightning/pull/3787)) - * Swapped `torch.load` for `fsspec` load in cloud_io loading ([#3692](https://github.com/PyTorchLightning/pytorch-lightning/pull/3692)) - * Added support for `to_disk()` to use remote filepaths with `fsspec` ([#3930](https://github.com/PyTorchLightning/pytorch-lightning/pull/3930)) - * Updated model_checkpoint's to_yaml to use `fsspec` open ([#3801](https://github.com/PyTorchLightning/pytorch-lightning/pull/3801)) - * Fixed `fsspec` is inconsistent when doing `fs.ls` ([#3805](https://github.com/PyTorchLightning/pytorch-lightning/pull/3805)) -- Refactor `GPUStatsMonitor` to improve training speed ([#3257](https://github.com/PyTorchLightning/pytorch-lightning/pull/3257)) -- Changed IoU score behavior for classes absent in target and pred ([#3098](https://github.com/PyTorchLightning/pytorch-lightning/pull/3098)) -- Changed IoU `remove_bg` bool to `ignore_index` optional int ([#3098](https://github.com/PyTorchLightning/pytorch-lightning/pull/3098)) -- Changed defaults of `save_top_k` and `save_last` to `None` in ModelCheckpoint ([#3680](https://github.com/PyTorchLightning/pytorch-lightning/pull/3680)) -- `row_log_interval` and `log_save_interval` are now based on training loop's `global_step` instead of epoch-internal batch index ([#3667](https://github.com/PyTorchLightning/pytorch-lightning/pull/3667)) -- Silenced some warnings. verified ddp refactors ([#3483](https://github.com/PyTorchLightning/pytorch-lightning/pull/3483)) -- Cleaning up stale logger tests ([#3490](https://github.com/PyTorchLightning/pytorch-lightning/pull/3490)) -- Allow `ModelCheckpoint` monitor to be `None` ([#3633](https://github.com/PyTorchLightning/pytorch-lightning/pull/3633)) -- Enable `None` model checkpoint default ([#3669](https://github.com/PyTorchLightning/pytorch-lightning/pull/3669)) -- Skipped `best_model_path` if `checkpoint_callback` is `None` ([#2962](https://github.com/PyTorchLightning/pytorch-lightning/pull/2962)) -- Used `raise .. 
from ..` to explicitly chain exceptions ([#3750](https://github.com/PyTorchLightning/pytorch-lightning/pull/3750)) -- Mocking loggers ([#3596](https://github.com/PyTorchLightning/pytorch-lightning/pull/3596), - [#3617](https://github.com/PyTorchLightning/pytorch-lightning/pull/3617), - [#3851](https://github.com/PyTorchLightning/pytorch-lightning/pull/3851), - [#3859](https://github.com/PyTorchLightning/pytorch-lightning/pull/3859), - [#3884](https://github.com/PyTorchLightning/pytorch-lightning/pull/3884), - [#3853](https://github.com/PyTorchLightning/pytorch-lightning/pull/3853), - [#3910](https://github.com/PyTorchLightning/pytorch-lightning/pull/3910), - [#3889](https://github.com/PyTorchLightning/pytorch-lightning/pull/3889), - [#3926](https://github.com/PyTorchLightning/pytorch-lightning/pull/3926)) -- Write predictions in LightningModule instead of EvalResult [#3882](https://github.com/PyTorchLightning/pytorch-lightning/pull/3882) + * moved TPU `xxx_step` to backend ([#3118](https://github.com/Lightning-AI/lightning/pull/3118)) + * refactored DDP backend `forward` ([#3119](https://github.com/Lightning-AI/lightning/pull/3119)) + * refactored GPU backend `__step` ([#3120](https://github.com/Lightning-AI/lightning/pull/3120)) + * refactored Horovod backend ([#3121](https://github.com/Lightning-AI/lightning/pull/3121), + [#3122](https://github.com/Lightning-AI/lightning/pull/3122)) + * remove obscure forward call in eval + CPU backend `___step` ([#3123](https://github.com/Lightning-AI/lightning/pull/3123)) + * reduced all simplified forward ([#3126](https://github.com/Lightning-AI/lightning/pull/3126)) + * added hook base method ([#3127](https://github.com/Lightning-AI/lightning/pull/3127)) + * refactor eval loop to use hooks - use `test_mode` for if so we can split later ([#3129](https://github.com/Lightning-AI/lightning/pull/3129)) + * moved `___step_end` hooks ([#3130](https://github.com/Lightning-AI/lightning/pull/3130)) + * training forward refactor ([#3134](https://github.com/Lightning-AI/lightning/pull/3134)) + * training AMP scaling refactor ([#3135](https://github.com/Lightning-AI/lightning/pull/3135)) + * eval step scaling factor ([#3136](https://github.com/Lightning-AI/lightning/pull/3136)) + * add eval loop object to streamline eval loop ([#3138](https://github.com/Lightning-AI/lightning/pull/3138)) + * refactored dataloader process hook ([#3139](https://github.com/Lightning-AI/lightning/pull/3139)) + * refactored inner eval loop ([#3141](https://github.com/Lightning-AI/lightning/pull/3141)) + * final inner eval loop hooks ([#3154](https://github.com/Lightning-AI/lightning/pull/3154)) + * clean up hooks in `run_evaluation` ([#3156](https://github.com/Lightning-AI/lightning/pull/3156)) + * clean up data reset ([#3161](https://github.com/Lightning-AI/lightning/pull/3161)) + * expand eval loop out ([#3165](https://github.com/Lightning-AI/lightning/pull/3165)) + * moved hooks around in eval loop ([#3195](https://github.com/Lightning-AI/lightning/pull/3195)) + * remove `_evaluate` fx ([#3197](https://github.com/Lightning-AI/lightning/pull/3197)) + * `Trainer.fit` hook clean up ([#3198](https://github.com/Lightning-AI/lightning/pull/3198)) + * DDPs train hooks ([#3203](https://github.com/Lightning-AI/lightning/pull/3203)) + * refactor DDP backend ([#3204](https://github.com/Lightning-AI/lightning/pull/3204), + [#3207](https://github.com/Lightning-AI/lightning/pull/3207), + [#3208](https://github.com/Lightning-AI/lightning/pull/3208), + 
[#3209](https://github.com/Lightning-AI/lightning/pull/3209), + [#3210](https://github.com/Lightning-AI/lightning/pull/3210)) + * reduced accelerator selection ([#3211](https://github.com/Lightning-AI/lightning/pull/3211)) + * group prepare data hook ([#3212](https://github.com/Lightning-AI/lightning/pull/3212)) + * added data connector ([#3285](https://github.com/Lightning-AI/lightning/pull/3285)) + * modular is_overridden ([#3290](https://github.com/Lightning-AI/lightning/pull/3290)) + * adding `Trainer.tune()` ([#3293](https://github.com/Lightning-AI/lightning/pull/3293)) + * move `run_pretrain_routine` -> `setup_training` ([#3294](https://github.com/Lightning-AI/lightning/pull/3294)) + * move train outside of setup training ([#3297](https://github.com/Lightning-AI/lightning/pull/3297)) + * move `prepare_data` to data connector ([#3307](https://github.com/Lightning-AI/lightning/pull/3307)) + * moved accelerator router ([#3309](https://github.com/Lightning-AI/lightning/pull/3309)) + * train loop refactor - moving train loop to own object ([#3310](https://github.com/Lightning-AI/lightning/pull/3310), + [#3312](https://github.com/Lightning-AI/lightning/pull/3312), + [#3313](https://github.com/Lightning-AI/lightning/pull/3313), + [#3314](https://github.com/Lightning-AI/lightning/pull/3314)) + * duplicate data interface definition up into DataHooks class ([#3344](https://github.com/Lightning-AI/lightning/pull/3344)) + * inner train loop ([#3359](https://github.com/Lightning-AI/lightning/pull/3359), + [#3361](https://github.com/Lightning-AI/lightning/pull/3361), + [#3362](https://github.com/Lightning-AI/lightning/pull/3362), + [#3363](https://github.com/Lightning-AI/lightning/pull/3363), + [#3365](https://github.com/Lightning-AI/lightning/pull/3365), + [#3366](https://github.com/Lightning-AI/lightning/pull/3366), + [#3367](https://github.com/Lightning-AI/lightning/pull/3367), + [#3368](https://github.com/Lightning-AI/lightning/pull/3368), + [#3369](https://github.com/Lightning-AI/lightning/pull/3369), + [#3370](https://github.com/Lightning-AI/lightning/pull/3370), + [#3371](https://github.com/Lightning-AI/lightning/pull/3371), + [#3372](https://github.com/Lightning-AI/lightning/pull/3372), + [#3373](https://github.com/Lightning-AI/lightning/pull/3373), + [#3374](https://github.com/Lightning-AI/lightning/pull/3374), + [#3375](https://github.com/Lightning-AI/lightning/pull/3375), + [#3376](https://github.com/Lightning-AI/lightning/pull/3376), + [#3385](https://github.com/Lightning-AI/lightning/pull/3385), + [#3388](https://github.com/Lightning-AI/lightning/pull/3388), + [#3397](https://github.com/Lightning-AI/lightning/pull/3397)) + * all logging related calls in a connector ([#3395](https://github.com/Lightning-AI/lightning/pull/3395)) + * device parser ([#3400](https://github.com/Lightning-AI/lightning/pull/3400), + [#3405](https://github.com/Lightning-AI/lightning/pull/3405)) + * added model connector ([#3407](https://github.com/Lightning-AI/lightning/pull/3407)) + * moved eval loop logging to loggers ([#3408](https://github.com/Lightning-AI/lightning/pull/3408)) + * moved eval loop (#3412[#3408](https://github.com/Lightning-AI/lightning/pull/3408)) + * trainer/separate argparse ([#3421](https://github.com/Lightning-AI/lightning/pull/3421), + [#3428](https://github.com/Lightning-AI/lightning/pull/3428), + [#3432](https://github.com/Lightning-AI/lightning/pull/3432)) + * move `lr_finder` ([#3434](https://github.com/Lightning-AI/lightning/pull/3434)) + * organize args 
(#[#3435](https://github.com/Lightning-AI/lightning/pull/3435), + [#3442](https://github.com/Lightning-AI/lightning/pull/3442), + [#3447](https://github.com/Lightning-AI/lightning/pull/3447), + [#3448](https://github.com/Lightning-AI/lightning/pull/3448), + [#3449](https://github.com/Lightning-AI/lightning/pull/3449), + [#3456](https://github.com/Lightning-AI/lightning/pull/3456)) + * move specific accelerator code ([#3457](https://github.com/Lightning-AI/lightning/pull/3457)) + * group connectors ([#3472](https://github.com/Lightning-AI/lightning/pull/3472)) + * accelerator connector methods x/n ([#3469](https://github.com/Lightning-AI/lightning/pull/3469), + [#3470](https://github.com/Lightning-AI/lightning/pull/3470), + [#3474](https://github.com/Lightning-AI/lightning/pull/3474)) + * merge backends x/n ([#3476](https://github.com/Lightning-AI/lightning/pull/3476), + [#3477](https://github.com/Lightning-AI/lightning/pull/3477), + [#3478](https://github.com/Lightning-AI/lightning/pull/3478), + [#3480](https://github.com/Lightning-AI/lightning/pull/3480), + [#3482](https://github.com/Lightning-AI/lightning/pull/3482)) + * apex plugin ([#3502](https://github.com/Lightning-AI/lightning/pull/3502)) + * precision plugins ([#3504](https://github.com/Lightning-AI/lightning/pull/3504)) + * Result - make monitor default to `checkpoint_on` to simplify ([#3571](https://github.com/Lightning-AI/lightning/pull/3571)) + * reference to the Trainer on the `LightningDataModule` ([#3684](https://github.com/Lightning-AI/lightning/pull/3684)) + * add `.log` to lightning module ([#3686](https://github.com/Lightning-AI/lightning/pull/3686), + [#3699](https://github.com/Lightning-AI/lightning/pull/3699), + [#3701](https://github.com/Lightning-AI/lightning/pull/3701), + [#3704](https://github.com/Lightning-AI/lightning/pull/3704), + [#3715](https://github.com/Lightning-AI/lightning/pull/3715)) + * enable tracking original metric when step and epoch are both true ([#3685](https://github.com/Lightning-AI/lightning/pull/3685)) + * deprecated results obj, added support for simpler comms ([#3681](https://github.com/Lightning-AI/lightning/pull/3681)) + * move backends back to individual files ([#3712](https://github.com/Lightning-AI/lightning/pull/3712)) + * fixes logging for eval steps ([#3763](https://github.com/Lightning-AI/lightning/pull/3763)) + * decoupled DDP, DDP spawn ([#3733](https://github.com/Lightning-AI/lightning/pull/3733), + [#3766](https://github.com/Lightning-AI/lightning/pull/3766), + [#3767](https://github.com/Lightning-AI/lightning/pull/3767), + [#3774](https://github.com/Lightning-AI/lightning/pull/3774), + [#3802](https://github.com/Lightning-AI/lightning/pull/3802), + [#3806](https://github.com/Lightning-AI/lightning/pull/3806), + [#3817](https://github.com/Lightning-AI/lightning/pull/3817), + [#3819](https://github.com/Lightning-AI/lightning/pull/3819), + [#3927](https://github.com/Lightning-AI/lightning/pull/3927)) + * remove weight loading hack for ddp_cpu ([#3808](https://github.com/Lightning-AI/lightning/pull/3808)) + * separate `torchelastic` from DDP ([#3810](https://github.com/Lightning-AI/lightning/pull/3810)) + * separate SLURM from DDP ([#3809](https://github.com/Lightning-AI/lightning/pull/3809)) + * decoupled DDP2 ([#3816](https://github.com/Lightning-AI/lightning/pull/3816)) + * bug fix with logging val epoch end + monitor ([#3812](https://github.com/Lightning-AI/lightning/pull/3812)) + * callback system and init DDP ([#3836](https://github.com/Lightning-AI/lightning/pull/3836)) + 
* adding compute environments ([#3837](https://github.com/Lightning-AI/lightning/pull/3837), [#3842](https://github.com/Lightning-AI/lightning/pull/3842)) + * epoch can now log independently ([#3843](https://github.com/Lightning-AI/lightning/pull/3843)) + * test selecting the correct backend. temp backends while slurm and TorchElastic are decoupled ([#3848](https://github.com/Lightning-AI/lightning/pull/3848)) + * fixed `init_slurm_connection` causing hostname errors ([#3856](https://github.com/Lightning-AI/lightning/pull/3856)) + * moves init apex from LM to apex connector ([#3923](https://github.com/Lightning-AI/lightning/pull/3923)) + * moves sync bn to each backend ([#3925](https://github.com/Lightning-AI/lightning/pull/3925)) + * moves configure ddp to each backend ([#3924](https://github.com/Lightning-AI/lightning/pull/3924)) +- Deprecation warning ([#3844](https://github.com/Lightning-AI/lightning/pull/3844)) +- Changed `LearningRateLogger` to `LearningRateMonitor` ([#3251](https://github.com/Lightning-AI/lightning/pull/3251)) +- Used `fsspec` instead of `gfile` for all IO ([#3320](https://github.com/Lightning-AI/lightning/pull/3320)) + * Swapped `torch.load` for `fsspec` load in DDP spawn backend ([#3787](https://github.com/Lightning-AI/lightning/pull/3787)) + * Swapped `torch.load` for `fsspec` load in cloud_io loading ([#3692](https://github.com/Lightning-AI/lightning/pull/3692)) + * Added support for `to_disk()` to use remote filepaths with `fsspec` ([#3930](https://github.com/Lightning-AI/lightning/pull/3930)) + * Updated model_checkpoint's to_yaml to use `fsspec` open ([#3801](https://github.com/Lightning-AI/lightning/pull/3801)) + * Fixed `fsspec` is inconsistent when doing `fs.ls` ([#3805](https://github.com/Lightning-AI/lightning/pull/3805)) +- Refactor `GPUStatsMonitor` to improve training speed ([#3257](https://github.com/Lightning-AI/lightning/pull/3257)) +- Changed IoU score behavior for classes absent in target and pred ([#3098](https://github.com/Lightning-AI/lightning/pull/3098)) +- Changed IoU `remove_bg` bool to `ignore_index` optional int ([#3098](https://github.com/Lightning-AI/lightning/pull/3098)) +- Changed defaults of `save_top_k` and `save_last` to `None` in ModelCheckpoint ([#3680](https://github.com/Lightning-AI/lightning/pull/3680)) +- `row_log_interval` and `log_save_interval` are now based on training loop's `global_step` instead of epoch-internal batch index ([#3667](https://github.com/Lightning-AI/lightning/pull/3667)) +- Silenced some warnings. verified ddp refactors ([#3483](https://github.com/Lightning-AI/lightning/pull/3483)) +- Cleaning up stale logger tests ([#3490](https://github.com/Lightning-AI/lightning/pull/3490)) +- Allow `ModelCheckpoint` monitor to be `None` ([#3633](https://github.com/Lightning-AI/lightning/pull/3633)) +- Enable `None` model checkpoint default ([#3669](https://github.com/Lightning-AI/lightning/pull/3669)) +- Skipped `best_model_path` if `checkpoint_callback` is `None` ([#2962](https://github.com/Lightning-AI/lightning/pull/2962)) +- Used `raise .. 
from ..` to explicitly chain exceptions ([#3750](https://github.com/Lightning-AI/lightning/pull/3750)) +- Mocking loggers ([#3596](https://github.com/Lightning-AI/lightning/pull/3596), + [#3617](https://github.com/Lightning-AI/lightning/pull/3617), + [#3851](https://github.com/Lightning-AI/lightning/pull/3851), + [#3859](https://github.com/Lightning-AI/lightning/pull/3859), + [#3884](https://github.com/Lightning-AI/lightning/pull/3884), + [#3853](https://github.com/Lightning-AI/lightning/pull/3853), + [#3910](https://github.com/Lightning-AI/lightning/pull/3910), + [#3889](https://github.com/Lightning-AI/lightning/pull/3889), + [#3926](https://github.com/Lightning-AI/lightning/pull/3926)) +- Write predictions in LightningModule instead of EvalResult [#3882](https://github.com/Lightning-AI/lightning/pull/3882) ### Deprecated -- Deprecated `TrainResult` and `EvalResult`, use `self.log` and `self.write` from the `LightningModule` to log metrics and write predictions. `training_step` can now only return a scalar (for the loss) or a dictionary with anything you want. ([#3681](https://github.com/PyTorchLightning/pytorch-lightning/pull/3681)) -- Deprecate `early_stop_callback` Trainer argument ([#3845](https://github.com/PyTorchLightning/pytorch-lightning/pull/3845)) -- Rename Trainer arguments `row_log_interval` >> `log_every_n_steps` and `log_save_interval` >> `flush_logs_every_n_steps` ([#3748](https://github.com/PyTorchLightning/pytorch-lightning/pull/3748)) +- Deprecated `TrainResult` and `EvalResult`, use `self.log` and `self.write` from the `LightningModule` to log metrics and write predictions. `training_step` can now only return a scalar (for the loss) or a dictionary with anything you want. ([#3681](https://github.com/Lightning-AI/lightning/pull/3681)) +- Deprecate `early_stop_callback` Trainer argument ([#3845](https://github.com/Lightning-AI/lightning/pull/3845)) +- Rename Trainer arguments `row_log_interval` >> `log_every_n_steps` and `log_save_interval` >> `flush_logs_every_n_steps` ([#3748](https://github.com/Lightning-AI/lightning/pull/3748)) ### Removed -- Removed experimental Metric API ([#3943](https://github.com/PyTorchLightning/pytorch-lightning/pull/3943), - [#3949](https://github.com/PyTorchLightning/pytorch-lightning/pull/3949), - [#3946](https://github.com/PyTorchLightning/pytorch-lightning/pull/3946)), listed changes before final removal: - * Added `EmbeddingSimilarity` metric ([#3349](https://github.com/PyTorchLightning/pytorch-lightning/pull/3349), [#3358](https://github.com/PyTorchLightning/pytorch-lightning/pull/3358)) - * Added hooks to metric module interface ([#2528](https://github.com/PyTorchLightning/pytorch-lightning/pull/2528)) - * Added error when AUROC metric is used for multiclass problems ([#3350](https://github.com/PyTorchLightning/pytorch-lightning/pull/3350)) - * Fixed `ModelCheckpoint` with `save_top_k=-1` option not tracking the best models when a monitor metric is available ([#3735](https://github.com/PyTorchLightning/pytorch-lightning/pull/3735)) - * Fixed counter-intuitive error being thrown in `Accuracy` metric for zero target tensor ([#3764](https://github.com/PyTorchLightning/pytorch-lightning/pull/3764)) - * Fixed aggregation of metrics ([#3517](https://github.com/PyTorchLightning/pytorch-lightning/pull/3517)) - * Fixed Metric aggregation ([#3321](https://github.com/PyTorchLightning/pytorch-lightning/pull/3321)) - * Fixed RMSLE metric ([#3188](https://github.com/PyTorchLightning/pytorch-lightning/pull/3188)) - * Renamed `reduction` to 
`class_reduction` in classification metrics ([#3322](https://github.com/PyTorchLightning/pytorch-lightning/pull/3322)) - * Changed `class_reduction` similar to sklearn for classification metrics ([#3322](https://github.com/PyTorchLightning/pytorch-lightning/pull/3322)) - * Renaming of precision recall metric ([#3308](https://github.com/PyTorchLightning/pytorch-lightning/pull/3308)) - -### Fixed - -- Fixed `on_train_batch_start` hook to end epoch early ([#3700](https://github.com/PyTorchLightning/pytorch-lightning/pull/3700)) -- Fixed `num_sanity_val_steps` is clipped to `limit_val_batches` ([#2917](https://github.com/PyTorchLightning/pytorch-lightning/pull/2917)) -- Fixed ONNX model save on GPU ([#3145](https://github.com/PyTorchLightning/pytorch-lightning/pull/3145)) -- Fixed `GpuUsageLogger` to work on different platforms ([#3008](https://github.com/PyTorchLightning/pytorch-lightning/pull/3008)) -- Fixed auto-scale batch size not dumping `auto_lr_find` parameter ([#3151](https://github.com/PyTorchLightning/pytorch-lightning/pull/3151)) -- Fixed `batch_outputs` with optimizer frequencies ([#3229](https://github.com/PyTorchLightning/pytorch-lightning/pull/3229)) -- Fixed setting batch size in `LightningModule.datamodule` when using `auto_scale_batch_size` ([#3266](https://github.com/PyTorchLightning/pytorch-lightning/pull/3266)) -- Fixed Horovod distributed backend compatibility with native AMP ([#3404](https://github.com/PyTorchLightning/pytorch-lightning/pull/3404)) -- Fixed batch size auto scaling exceeding the size of the dataset ([#3271](https://github.com/PyTorchLightning/pytorch-lightning/pull/3271)) -- Fixed getting `experiment_id` from MLFlow only once instead of each training loop ([#3394](https://github.com/PyTorchLightning/pytorch-lightning/pull/3394)) -- Fixed `overfit_batches` which now correctly disables shuffling for the training loader. 
([#3501](https://github.com/PyTorchLightning/pytorch-lightning/pull/3501)) -- Fixed gradient norm tracking for `row_log_interval > 1` ([#3489](https://github.com/PyTorchLightning/pytorch-lightning/pull/3489)) -- Fixed `ModelCheckpoint` name formatting ([#3164](https://github.com/PyTorchLightning/pytorch-lightning/pull/3163)) -- Fixed example implementation of AutoEncoder ([#3190](https://github.com/PyTorchLightning/pytorch-lightning/pull/3190)) -- Fixed invalid paths when remote logging with TensorBoard ([#3236](https://github.com/PyTorchLightning/pytorch-lightning/pull/3236)) -- Fixed change `t()` to `transpose()` as XLA devices do not support `.t()` on 1-dim tensor ([#3252](https://github.com/PyTorchLightning/pytorch-lightning/pull/3252)) -- Fixed (weights only) checkpoints loading without PL ([#3287](https://github.com/PyTorchLightning/pytorch-lightning/pull/3287)) -- Fixed `gather_all_tensors` cross GPUs in DDP ([#3319](https://github.com/PyTorchLightning/pytorch-lightning/pull/3319)) -- Fixed CometML save dir ([#3419](https://github.com/PyTorchLightning/pytorch-lightning/pull/3419)) -- Fixed forward key metrics ([#3467](https://github.com/PyTorchLightning/pytorch-lightning/pull/3467)) -- Fixed normalize mode at confusion matrix (replace NaNs with zeros) ([#3465](https://github.com/PyTorchLightning/pytorch-lightning/pull/3465)) -- Fixed global step increment in training loop when `training_epoch_end` hook is used ([#3673](https://github.com/PyTorchLightning/pytorch-lightning/pull/3673)) -- Fixed dataloader shuffling not getting turned off with `overfit_batches > 0` and `distributed_backend = "ddp"` ([#3534](https://github.com/PyTorchLightning/pytorch-lightning/pull/3534)) -- Fixed determinism in `DDPSpawnBackend` when using `seed_everything` in main process ([#3335](https://github.com/PyTorchLightning/pytorch-lightning/pull/3335)) -- Fixed `ModelCheckpoint` `period` to actually save every `period` epochs ([#3630](https://github.com/PyTorchLightning/pytorch-lightning/pull/3630)) -- Fixed `val_progress_bar` total with `num_sanity_val_steps` ([#3751](https://github.com/PyTorchLightning/pytorch-lightning/pull/3751)) -- Fixed Tuner dump: add `current_epoch` to dumped_params ([#3261](https://github.com/PyTorchLightning/pytorch-lightning/pull/3261)) -- Fixed `current_epoch` and `global_step` properties mismatch between `Trainer` and `LightningModule` ([#3785](https://github.com/PyTorchLightning/pytorch-lightning/pull/3785)) -- Fixed learning rate scheduler for optimizers with internal state ([#3897](https://github.com/PyTorchLightning/pytorch-lightning/pull/3897)) -- Fixed `tbptt_reduce_fx` when non-floating tensors are logged ([#3796](https://github.com/PyTorchLightning/pytorch-lightning/pull/3796)) -- Fixed model checkpoint frequency ([#3852](https://github.com/PyTorchLightning/pytorch-lightning/pull/3852)) -- Fixed logging non-tensor scalar with result breaks subsequent epoch aggregation ([#3855](https://github.com/PyTorchLightning/pytorch-lightning/pull/3855)) -- Fixed `TrainerEvaluationLoopMixin` activates `model.train()` at the end ([#3858](https://github.com/PyTorchLightning/pytorch-lightning/pull/3858)) -- Fixed `overfit_batches` when using with multiple val/test_dataloaders ([#3857](https://github.com/PyTorchLightning/pytorch-lightning/pull/3857)) -- Fixed enables `training_step` to return `None` ([#3862](https://github.com/PyTorchLightning/pytorch-lightning/pull/3862)) -- Fixed init nan for checkpointing ([#3863](https://github.com/PyTorchLightning/pytorch-lightning/pull/3863)) -- 
Fixed for `load_from_checkpoint` ([#2776](https://github.com/PyTorchLightning/pytorch-lightning/pull/2776)) -- Fixes incorrect `batch_sizes` when Dataloader returns a dict with multiple tensors ([#3668](https://github.com/PyTorchLightning/pytorch-lightning/pull/3668)) -- Fixed unexpected signature for `validation_step` ([#3947](https://github.com/PyTorchLightning/pytorch-lightning/pull/3947)) +- Removed experimental Metric API ([#3943](https://github.com/Lightning-AI/lightning/pull/3943), + [#3949](https://github.com/Lightning-AI/lightning/pull/3949), + [#3946](https://github.com/Lightning-AI/lightning/pull/3946)), listed changes before final removal: + * Added `EmbeddingSimilarity` metric ([#3349](https://github.com/Lightning-AI/lightning/pull/3349), [#3358](https://github.com/Lightning-AI/lightning/pull/3358)) + * Added hooks to metric module interface ([#2528](https://github.com/Lightning-AI/lightning/pull/2528)) + * Added error when AUROC metric is used for multiclass problems ([#3350](https://github.com/Lightning-AI/lightning/pull/3350)) + * Fixed `ModelCheckpoint` with `save_top_k=-1` option not tracking the best models when a monitor metric is available ([#3735](https://github.com/Lightning-AI/lightning/pull/3735)) + * Fixed counter-intuitive error being thrown in `Accuracy` metric for zero target tensor ([#3764](https://github.com/Lightning-AI/lightning/pull/3764)) + * Fixed aggregation of metrics ([#3517](https://github.com/Lightning-AI/lightning/pull/3517)) + * Fixed Metric aggregation ([#3321](https://github.com/Lightning-AI/lightning/pull/3321)) + * Fixed RMSLE metric ([#3188](https://github.com/Lightning-AI/lightning/pull/3188)) + * Renamed `reduction` to `class_reduction` in classification metrics ([#3322](https://github.com/Lightning-AI/lightning/pull/3322)) + * Changed `class_reduction` similar to sklearn for classification metrics ([#3322](https://github.com/Lightning-AI/lightning/pull/3322)) + * Renaming of precision recall metric ([#3308](https://github.com/Lightning-AI/lightning/pull/3308)) + +### Fixed + +- Fixed `on_train_batch_start` hook to end epoch early ([#3700](https://github.com/Lightning-AI/lightning/pull/3700)) +- Fixed `num_sanity_val_steps` is clipped to `limit_val_batches` ([#2917](https://github.com/Lightning-AI/lightning/pull/2917)) +- Fixed ONNX model save on GPU ([#3145](https://github.com/Lightning-AI/lightning/pull/3145)) +- Fixed `GpuUsageLogger` to work on different platforms ([#3008](https://github.com/Lightning-AI/lightning/pull/3008)) +- Fixed auto-scale batch size not dumping `auto_lr_find` parameter ([#3151](https://github.com/Lightning-AI/lightning/pull/3151)) +- Fixed `batch_outputs` with optimizer frequencies ([#3229](https://github.com/Lightning-AI/lightning/pull/3229)) +- Fixed setting batch size in `LightningModule.datamodule` when using `auto_scale_batch_size` ([#3266](https://github.com/Lightning-AI/lightning/pull/3266)) +- Fixed Horovod distributed backend compatibility with native AMP ([#3404](https://github.com/Lightning-AI/lightning/pull/3404)) +- Fixed batch size auto scaling exceeding the size of the dataset ([#3271](https://github.com/Lightning-AI/lightning/pull/3271)) +- Fixed getting `experiment_id` from MLFlow only once instead of each training loop ([#3394](https://github.com/Lightning-AI/lightning/pull/3394)) +- Fixed `overfit_batches` which now correctly disables shuffling for the training loader. 
([#3501](https://github.com/Lightning-AI/lightning/pull/3501)) +- Fixed gradient norm tracking for `row_log_interval > 1` ([#3489](https://github.com/Lightning-AI/lightning/pull/3489)) +- Fixed `ModelCheckpoint` name formatting ([#3164](https://github.com/Lightning-AI/lightning/pull/3163)) +- Fixed example implementation of AutoEncoder ([#3190](https://github.com/Lightning-AI/lightning/pull/3190)) +- Fixed invalid paths when remote logging with TensorBoard ([#3236](https://github.com/Lightning-AI/lightning/pull/3236)) +- Fixed change `t()` to `transpose()` as XLA devices do not support `.t()` on 1-dim tensor ([#3252](https://github.com/Lightning-AI/lightning/pull/3252)) +- Fixed (weights only) checkpoints loading without PL ([#3287](https://github.com/Lightning-AI/lightning/pull/3287)) +- Fixed `gather_all_tensors` cross GPUs in DDP ([#3319](https://github.com/Lightning-AI/lightning/pull/3319)) +- Fixed CometML save dir ([#3419](https://github.com/Lightning-AI/lightning/pull/3419)) +- Fixed forward key metrics ([#3467](https://github.com/Lightning-AI/lightning/pull/3467)) +- Fixed normalize mode at confusion matrix (replace NaNs with zeros) ([#3465](https://github.com/Lightning-AI/lightning/pull/3465)) +- Fixed global step increment in training loop when `training_epoch_end` hook is used ([#3673](https://github.com/Lightning-AI/lightning/pull/3673)) +- Fixed dataloader shuffling not getting turned off with `overfit_batches > 0` and `distributed_backend = "ddp"` ([#3534](https://github.com/Lightning-AI/lightning/pull/3534)) +- Fixed determinism in `DDPSpawnBackend` when using `seed_everything` in main process ([#3335](https://github.com/Lightning-AI/lightning/pull/3335)) +- Fixed `ModelCheckpoint` `period` to actually save every `period` epochs ([#3630](https://github.com/Lightning-AI/lightning/pull/3630)) +- Fixed `val_progress_bar` total with `num_sanity_val_steps` ([#3751](https://github.com/Lightning-AI/lightning/pull/3751)) +- Fixed Tuner dump: add `current_epoch` to dumped_params ([#3261](https://github.com/Lightning-AI/lightning/pull/3261)) +- Fixed `current_epoch` and `global_step` properties mismatch between `Trainer` and `LightningModule` ([#3785](https://github.com/Lightning-AI/lightning/pull/3785)) +- Fixed learning rate scheduler for optimizers with internal state ([#3897](https://github.com/Lightning-AI/lightning/pull/3897)) +- Fixed `tbptt_reduce_fx` when non-floating tensors are logged ([#3796](https://github.com/Lightning-AI/lightning/pull/3796)) +- Fixed model checkpoint frequency ([#3852](https://github.com/Lightning-AI/lightning/pull/3852)) +- Fixed logging non-tensor scalar with result breaks subsequent epoch aggregation ([#3855](https://github.com/Lightning-AI/lightning/pull/3855)) +- Fixed `TrainerEvaluationLoopMixin` activates `model.train()` at the end ([#3858](https://github.com/Lightning-AI/lightning/pull/3858)) +- Fixed `overfit_batches` when using with multiple val/test_dataloaders ([#3857](https://github.com/Lightning-AI/lightning/pull/3857)) +- Fixed enables `training_step` to return `None` ([#3862](https://github.com/Lightning-AI/lightning/pull/3862)) +- Fixed init nan for checkpointing ([#3863](https://github.com/Lightning-AI/lightning/pull/3863)) +- Fixed for `load_from_checkpoint` ([#2776](https://github.com/Lightning-AI/lightning/pull/2776)) +- Fixes incorrect `batch_sizes` when Dataloader returns a dict with multiple tensors ([#3668](https://github.com/Lightning-AI/lightning/pull/3668)) +- Fixed unexpected signature for `validation_step` 
([#3947](https://github.com/Lightning-AI/lightning/pull/3947)) ## [0.9.0] - 2020-08-20 ### Added -- Added SyncBN for DDP ([#2801](https://github.com/PyTorchLightning/pytorch-lightning/pull/2801), - [#2838](https://github.com/PyTorchLightning/pytorch-lightning/pull/2838)) -- Added basic `CSVLogger` ([#2721](https://github.com/PyTorchLightning/pytorch-lightning/pull/2721)) -- Added SSIM metrics ([#2671](https://github.com/PyTorchLightning/pytorch-lightning/pull/2671)) -- Added BLEU metrics ([#2535](https://github.com/PyTorchLightning/pytorch-lightning/pull/2535)) -- Added support to export a model to ONNX format ([#2596](https://github.com/PyTorchLightning/pytorch-lightning/pull/2596)) -- Added support for `Trainer(num_sanity_val_steps=-1)` to check all validation data before training ([#2246](https://github.com/PyTorchLightning/pytorch-lightning/pull/2246)) +- Added SyncBN for DDP ([#2801](https://github.com/Lightning-AI/lightning/pull/2801), + [#2838](https://github.com/Lightning-AI/lightning/pull/2838)) +- Added basic `CSVLogger` ([#2721](https://github.com/Lightning-AI/lightning/pull/2721)) +- Added SSIM metrics ([#2671](https://github.com/Lightning-AI/lightning/pull/2671)) +- Added BLEU metrics ([#2535](https://github.com/Lightning-AI/lightning/pull/2535)) +- Added support to export a model to ONNX format ([#2596](https://github.com/Lightning-AI/lightning/pull/2596)) +- Added support for `Trainer(num_sanity_val_steps=-1)` to check all validation data before training ([#2246](https://github.com/Lightning-AI/lightning/pull/2246)) - Added struct. output: - * tests for val loop flow ([#2605](https://github.com/PyTorchLightning/pytorch-lightning/pull/2605)) - * `EvalResult` support for train and val. loop ([#2615](https://github.com/PyTorchLightning/pytorch-lightning/pull/2615), - [#2651](https://github.com/PyTorchLightning/pytorch-lightning/pull/2651)) - * weighted average in results obj ([#2930](https://github.com/PyTorchLightning/pytorch-lightning/pull/2930)) - * fix result obj DP auto reduce ([#3013](https://github.com/PyTorchLightning/pytorch-lightning/pull/3013)) -- Added class `LightningDataModule` ([#2668](https://github.com/PyTorchLightning/pytorch-lightning/pull/2668)) -- Added support for PyTorch 1.6 ([#2745](https://github.com/PyTorchLightning/pytorch-lightning/pull/2745)) -- Added call DataModule hooks implicitly in trainer ([#2755](https://github.com/PyTorchLightning/pytorch-lightning/pull/2755)) -- Added support for Mean in DDP Sync ([#2568](https://github.com/PyTorchLightning/pytorch-lightning/pull/2568)) -- Added remaining `sklearn` metrics: `AveragePrecision`, `BalancedAccuracy`, `CohenKappaScore`, `DCG`, `Hamming`, `Hinge`, `Jaccard`, `MeanAbsoluteError`, `MeanSquaredError`, `MeanSquaredLogError`, `MedianAbsoluteError`, `R2Score`, `MeanPoissonDeviance`, `MeanGammaDeviance`, `MeanTweedieDeviance`, `ExplainedVariance` ([#2562](https://github.com/PyTorchLightning/pytorch-lightning/pull/2562)) -- Added support for `limit_{mode}_batches (int)` to work with infinite dataloader (IterableDataset) ([#2840](https://github.com/PyTorchLightning/pytorch-lightning/pull/2840)) -- Added support returning python scalars in DP ([#1935](https://github.com/PyTorchLightning/pytorch-lightning/pull/1935)) -- Added support to Tensorboard logger for OmegaConf `hparams` ([#2846](https://github.com/PyTorchLightning/pytorch-lightning/pull/2846)) -- Added tracking of basic states in `Trainer` ([#2541](https://github.com/PyTorchLightning/pytorch-lightning/pull/2541)) -- Tracks all outputs including 
TBPTT and multiple optimizers ([#2890](https://github.com/PyTorchLightning/pytorch-lightning/pull/2890)) -- Added GPU Usage Logger ([#2932](https://github.com/PyTorchLightning/pytorch-lightning/pull/2932)) -- Added `strict=False` for `load_from_checkpoint` ([#2819](https://github.com/PyTorchLightning/pytorch-lightning/pull/2819)) -- Added saving test predictions on multiple GPUs ([#2926](https://github.com/PyTorchLightning/pytorch-lightning/pull/2926)) -- Auto log the computational graph for loggers that support this ([#3003](https://github.com/PyTorchLightning/pytorch-lightning/pull/3003)) -- Added warning when changing monitor and using results obj ([#3014](https://github.com/PyTorchLightning/pytorch-lightning/pull/3014)) -- Added a hook `transfer_batch_to_device` to the `LightningDataModule` ([#3038](https://github.com/PyTorchLightning/pytorch-lightning/pull/3038)) + * tests for val loop flow ([#2605](https://github.com/Lightning-AI/lightning/pull/2605)) + * `EvalResult` support for train and val. loop ([#2615](https://github.com/Lightning-AI/lightning/pull/2615), + [#2651](https://github.com/Lightning-AI/lightning/pull/2651)) + * weighted average in results obj ([#2930](https://github.com/Lightning-AI/lightning/pull/2930)) + * fix result obj DP auto reduce ([#3013](https://github.com/Lightning-AI/lightning/pull/3013)) +- Added class `LightningDataModule` ([#2668](https://github.com/Lightning-AI/lightning/pull/2668)) +- Added support for PyTorch 1.6 ([#2745](https://github.com/Lightning-AI/lightning/pull/2745)) +- Added call DataModule hooks implicitly in trainer ([#2755](https://github.com/Lightning-AI/lightning/pull/2755)) +- Added support for Mean in DDP Sync ([#2568](https://github.com/Lightning-AI/lightning/pull/2568)) +- Added remaining `sklearn` metrics: `AveragePrecision`, `BalancedAccuracy`, `CohenKappaScore`, `DCG`, `Hamming`, `Hinge`, `Jaccard`, `MeanAbsoluteError`, `MeanSquaredError`, `MeanSquaredLogError`, `MedianAbsoluteError`, `R2Score`, `MeanPoissonDeviance`, `MeanGammaDeviance`, `MeanTweedieDeviance`, `ExplainedVariance` ([#2562](https://github.com/Lightning-AI/lightning/pull/2562)) +- Added support for `limit_{mode}_batches (int)` to work with infinite dataloader (IterableDataset) ([#2840](https://github.com/Lightning-AI/lightning/pull/2840)) +- Added support returning python scalars in DP ([#1935](https://github.com/Lightning-AI/lightning/pull/1935)) +- Added support to Tensorboard logger for OmegaConf `hparams` ([#2846](https://github.com/Lightning-AI/lightning/pull/2846)) +- Added tracking of basic states in `Trainer` ([#2541](https://github.com/Lightning-AI/lightning/pull/2541)) +- Tracks all outputs including TBPTT and multiple optimizers ([#2890](https://github.com/Lightning-AI/lightning/pull/2890)) +- Added GPU Usage Logger ([#2932](https://github.com/Lightning-AI/lightning/pull/2932)) +- Added `strict=False` for `load_from_checkpoint` ([#2819](https://github.com/Lightning-AI/lightning/pull/2819)) +- Added saving test predictions on multiple GPUs ([#2926](https://github.com/Lightning-AI/lightning/pull/2926)) +- Auto log the computational graph for loggers that support this ([#3003](https://github.com/Lightning-AI/lightning/pull/3003)) +- Added warning when changing monitor and using results obj ([#3014](https://github.com/Lightning-AI/lightning/pull/3014)) +- Added a hook `transfer_batch_to_device` to the `LightningDataModule` ([#3038](https://github.com/Lightning-AI/lightning/pull/3038)) ### Changed -- Truncated long version numbers in progress bar 
([#2594](https://github.com/PyTorchLightning/pytorch-lightning/pull/2594)) -- Enabling val/test loop disabling ([#2692](https://github.com/PyTorchLightning/pytorch-lightning/pull/2692)) +- Truncated long version numbers in progress bar ([#2594](https://github.com/Lightning-AI/lightning/pull/2594)) +- Enabling val/test loop disabling ([#2692](https://github.com/Lightning-AI/lightning/pull/2692)) - Refactored into `accelerator` module: - * GPU training ([#2704](https://github.com/PyTorchLightning/pytorch-lightning/pull/2704)) - * TPU training ([#2708](https://github.com/PyTorchLightning/pytorch-lightning/pull/2708)) - * DDP(2) backend ([#2796](https://github.com/PyTorchLightning/pytorch-lightning/pull/2796)) - * Retrieve last logged val from result by key ([#3049](https://github.com/PyTorchLightning/pytorch-lightning/pull/3049)) -- Using `.comet.config` file for `CometLogger` ([#1913](https://github.com/PyTorchLightning/pytorch-lightning/pull/1913)) -- Updated hooks arguments - breaking for `setup` and `teardown` ([#2850](https://github.com/PyTorchLightning/pytorch-lightning/pull/2850)) -- Using `gfile` to support remote directories ([#2164](https://github.com/PyTorchLightning/pytorch-lightning/pull/2164)) + * GPU training ([#2704](https://github.com/Lightning-AI/lightning/pull/2704)) + * TPU training ([#2708](https://github.com/Lightning-AI/lightning/pull/2708)) + * DDP(2) backend ([#2796](https://github.com/Lightning-AI/lightning/pull/2796)) + * Retrieve last logged val from result by key ([#3049](https://github.com/Lightning-AI/lightning/pull/3049)) +- Using `.comet.config` file for `CometLogger` ([#1913](https://github.com/Lightning-AI/lightning/pull/1913)) +- Updated hooks arguments - breaking for `setup` and `teardown` ([#2850](https://github.com/Lightning-AI/lightning/pull/2850)) +- Using `gfile` to support remote directories ([#2164](https://github.com/Lightning-AI/lightning/pull/2164)) - Moved optimizer creation after device placement for DDP backends ([#2904](https://github.com/PyTorchLightning/pytorch-lighting/pull/2904)) -- Support `**DictConfig` for `hparam` serialization ([#2519](https://github.com/PyTorchLightning/pytorch-lightning/pull/2519)) -- Removed callback metrics from test results obj ([#2994](https://github.com/PyTorchLightning/pytorch-lightning/pull/2994)) -- Re-enabled naming metrics in ckpt name ([#3060](https://github.com/PyTorchLightning/pytorch-lightning/pull/3060)) -- Changed progress bar epoch counting to start from 0 ([#3061](https://github.com/PyTorchLightning/pytorch-lightning/pull/3061)) +- Support `**DictConfig` for `hparam` serialization ([#2519](https://github.com/Lightning-AI/lightning/pull/2519)) +- Removed callback metrics from test results obj ([#2994](https://github.com/Lightning-AI/lightning/pull/2994)) +- Re-enabled naming metrics in ckpt name ([#3060](https://github.com/Lightning-AI/lightning/pull/3060)) +- Changed progress bar epoch counting to start from 0 ([#3061](https://github.com/Lightning-AI/lightning/pull/3061)) ### Deprecated -- Deprecated Trainer attribute `ckpt_path`, which will now be set by `weights_save_path` ([#2681](https://github.com/PyTorchLightning/pytorch-lightning/pull/2681)) +- Deprecated Trainer attribute `ckpt_path`, which will now be set by `weights_save_path` ([#2681](https://github.com/Lightning-AI/lightning/pull/2681)) ### Removed -- Removed deprecated: ([#2760](https://github.com/PyTorchLightning/pytorch-lightning/pull/2760)) +- Removed deprecated: ([#2760](https://github.com/Lightning-AI/lightning/pull/2760)) * core 
decorator `data_loader` * Module hook `on_sanity_check_start` and loading `load_from_metrics` * package `pytorch_lightning.logging` @@ -2997,209 +2997,209 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed -- Fixed `accumulate_grad_batches` for last batch ([#2853](https://github.com/PyTorchLightning/pytorch-lightning/pull/2853)) -- Fixed setup call while testing ([#2624](https://github.com/PyTorchLightning/pytorch-lightning/pull/2624)) -- Fixed local rank zero casting ([#2640](https://github.com/PyTorchLightning/pytorch-lightning/pull/2640)) -- Fixed single scalar return from training ([#2587](https://github.com/PyTorchLightning/pytorch-lightning/pull/2587)) -- Fixed Horovod backend to scale LR schedlers with the optimizer ([#2626](https://github.com/PyTorchLightning/pytorch-lightning/pull/2626)) -- Fixed `dtype` and `device` properties not getting updated in submodules ([#2657](https://github.com/PyTorchLightning/pytorch-lightning/pull/2657)) -- Fixed `fast_dev_run` to run for all dataloaders ([#2581](https://github.com/PyTorchLightning/pytorch-lightning/pull/2581)) -- Fixed `save_dir` in loggers getting ignored by default value of `weights_save_path` when user did not specify `weights_save_path` ([#2681](https://github.com/PyTorchLightning/pytorch-lightning/pull/2681)) -- Fixed `weights_save_path` getting ignored when `logger=False` is passed to Trainer ([#2681](https://github.com/PyTorchLightning/pytorch-lightning/pull/2681)) -- Fixed TPU multi-core and Float16 ([#2632](https://github.com/PyTorchLightning/pytorch-lightning/pull/2632)) -- Fixed test metrics not being logged with `LoggerCollection` ([#2723](https://github.com/PyTorchLightning/pytorch-lightning/pull/2723)) -- Fixed data transfer to device when using `torchtext.data.Field` and `include_lengths is True` ([#2689](https://github.com/PyTorchLightning/pytorch-lightning/pull/2689)) -- Fixed shuffle argument for distributed sampler ([#2789](https://github.com/PyTorchLightning/pytorch-lightning/pull/2789)) -- Fixed logging interval ([#2694](https://github.com/PyTorchLightning/pytorch-lightning/pull/2694)) -- Fixed loss value in the progress bar is wrong when `accumulate_grad_batches > 1` ([#2738](https://github.com/PyTorchLightning/pytorch-lightning/pull/2738)) -- Fixed correct CWD for ddp sub-processes when using Hydra ([#2719](https://github.com/PyTorchLightning/pytorch-lightning/pull/2719)) -- Fixed selecting GPUs using `CUDA_VISIBLE_DEVICES` ([#2739](https://github.com/PyTorchLightning/pytorch-lightning/pull/2739)) -- Fixed false `num_classes` warning in metrics ([#2781](https://github.com/PyTorchLightning/pytorch-lightning/pull/2781)) -- Fixed shell injection vulnerability in subprocess call ([#2786](https://github.com/PyTorchLightning/pytorch-lightning/pull/2786)) -- Fixed LR finder and `hparams` compatibility ([#2821](https://github.com/PyTorchLightning/pytorch-lightning/pull/2821)) -- Fixed `ModelCheckpoint` not saving the latest information when `save_last=True` ([#2881](https://github.com/PyTorchLightning/pytorch-lightning/pull/2881)) -- Fixed ImageNet example: learning rate scheduler, number of workers and batch size when using DDP ([#2889](https://github.com/PyTorchLightning/pytorch-lightning/pull/2889)) -- Fixed apex gradient clipping ([#2829](https://github.com/PyTorchLightning/pytorch-lightning/pull/2829)) -- Fixed save apex scaler states ([#2828](https://github.com/PyTorchLightning/pytorch-lightning/pull/2828)) -- Fixed a model loading issue with inheritance and variable 
positional arguments ([#2911](https://github.com/PyTorchLightning/pytorch-lightning/pull/2911)) -- Fixed passing `non_blocking=True` when transferring a batch object that does not support it ([#2910](https://github.com/PyTorchLightning/pytorch-lightning/pull/2910)) -- Fixed checkpointing to remote file paths ([#2925](https://github.com/PyTorchLightning/pytorch-lightning/pull/2925)) -- Fixed adding val step argument to metrics ([#2986](https://github.com/PyTorchLightning/pytorch-lightning/pull/2986)) -- Fixed an issue that caused `Trainer.test()` to stall in ddp mode ([#2997](https://github.com/PyTorchLightning/pytorch-lightning/pull/2997)) -- Fixed gathering of results with tensors of varying shape ([#3020](https://github.com/PyTorchLightning/pytorch-lightning/pull/3020)) -- Fixed batch size auto-scaling feature to set the new value on the correct model attribute ([#3043](https://github.com/PyTorchLightning/pytorch-lightning/pull/3043)) -- Fixed automatic batch scaling not working with half precision ([#3045](https://github.com/PyTorchLightning/pytorch-lightning/pull/3045)) -- Fixed setting device to root gpu ([#3042](https://github.com/PyTorchLightning/pytorch-lightning/pull/3042)) +- Fixed `accumulate_grad_batches` for last batch ([#2853](https://github.com/Lightning-AI/lightning/pull/2853)) +- Fixed setup call while testing ([#2624](https://github.com/Lightning-AI/lightning/pull/2624)) +- Fixed local rank zero casting ([#2640](https://github.com/Lightning-AI/lightning/pull/2640)) +- Fixed single scalar return from training ([#2587](https://github.com/Lightning-AI/lightning/pull/2587)) +- Fixed Horovod backend to scale LR schedlers with the optimizer ([#2626](https://github.com/Lightning-AI/lightning/pull/2626)) +- Fixed `dtype` and `device` properties not getting updated in submodules ([#2657](https://github.com/Lightning-AI/lightning/pull/2657)) +- Fixed `fast_dev_run` to run for all dataloaders ([#2581](https://github.com/Lightning-AI/lightning/pull/2581)) +- Fixed `save_dir` in loggers getting ignored by default value of `weights_save_path` when user did not specify `weights_save_path` ([#2681](https://github.com/Lightning-AI/lightning/pull/2681)) +- Fixed `weights_save_path` getting ignored when `logger=False` is passed to Trainer ([#2681](https://github.com/Lightning-AI/lightning/pull/2681)) +- Fixed TPU multi-core and Float16 ([#2632](https://github.com/Lightning-AI/lightning/pull/2632)) +- Fixed test metrics not being logged with `LoggerCollection` ([#2723](https://github.com/Lightning-AI/lightning/pull/2723)) +- Fixed data transfer to device when using `torchtext.data.Field` and `include_lengths is True` ([#2689](https://github.com/Lightning-AI/lightning/pull/2689)) +- Fixed shuffle argument for distributed sampler ([#2789](https://github.com/Lightning-AI/lightning/pull/2789)) +- Fixed logging interval ([#2694](https://github.com/Lightning-AI/lightning/pull/2694)) +- Fixed loss value in the progress bar is wrong when `accumulate_grad_batches > 1` ([#2738](https://github.com/Lightning-AI/lightning/pull/2738)) +- Fixed correct CWD for ddp sub-processes when using Hydra ([#2719](https://github.com/Lightning-AI/lightning/pull/2719)) +- Fixed selecting GPUs using `CUDA_VISIBLE_DEVICES` ([#2739](https://github.com/Lightning-AI/lightning/pull/2739)) +- Fixed false `num_classes` warning in metrics ([#2781](https://github.com/Lightning-AI/lightning/pull/2781)) +- Fixed shell injection vulnerability in subprocess call ([#2786](https://github.com/Lightning-AI/lightning/pull/2786)) +- Fixed 
LR finder and `hparams` compatibility ([#2821](https://github.com/Lightning-AI/lightning/pull/2821)) +- Fixed `ModelCheckpoint` not saving the latest information when `save_last=True` ([#2881](https://github.com/Lightning-AI/lightning/pull/2881)) +- Fixed ImageNet example: learning rate scheduler, number of workers and batch size when using DDP ([#2889](https://github.com/Lightning-AI/lightning/pull/2889)) +- Fixed apex gradient clipping ([#2829](https://github.com/Lightning-AI/lightning/pull/2829)) +- Fixed save apex scaler states ([#2828](https://github.com/Lightning-AI/lightning/pull/2828)) +- Fixed a model loading issue with inheritance and variable positional arguments ([#2911](https://github.com/Lightning-AI/lightning/pull/2911)) +- Fixed passing `non_blocking=True` when transferring a batch object that does not support it ([#2910](https://github.com/Lightning-AI/lightning/pull/2910)) +- Fixed checkpointing to remote file paths ([#2925](https://github.com/Lightning-AI/lightning/pull/2925)) +- Fixed adding val step argument to metrics ([#2986](https://github.com/Lightning-AI/lightning/pull/2986)) +- Fixed an issue that caused `Trainer.test()` to stall in ddp mode ([#2997](https://github.com/Lightning-AI/lightning/pull/2997)) +- Fixed gathering of results with tensors of varying shape ([#3020](https://github.com/Lightning-AI/lightning/pull/3020)) +- Fixed batch size auto-scaling feature to set the new value on the correct model attribute ([#3043](https://github.com/Lightning-AI/lightning/pull/3043)) +- Fixed automatic batch scaling not working with half precision ([#3045](https://github.com/Lightning-AI/lightning/pull/3045)) +- Fixed setting device to root gpu ([#3042](https://github.com/Lightning-AI/lightning/pull/3042)) ## [0.8.5] - 2020-07-09 ### Added -- Added a PSNR metric: peak signal-to-noise ratio ([#2483](https://github.com/PyTorchLightning/pytorch-lightning/pull/2483)) -- Added functional regression metrics ([#2492](https://github.com/PyTorchLightning/pytorch-lightning/pull/2492)) +- Added a PSNR metric: peak signal-to-noise ratio ([#2483](https://github.com/Lightning-AI/lightning/pull/2483)) +- Added functional regression metrics ([#2492](https://github.com/Lightning-AI/lightning/pull/2492)) ### Removed -- Removed auto val reduce ([#2462](https://github.com/PyTorchLightning/pytorch-lightning/pull/2462)) +- Removed auto val reduce ([#2462](https://github.com/Lightning-AI/lightning/pull/2462)) ### Fixed -- Flattening Wandb Hyperparameters ([#2459](https://github.com/PyTorchLightning/pytorch-lightning/pull/2459)) -- Fixed using the same DDP python interpreter and actually running ([#2482](https://github.com/PyTorchLightning/pytorch-lightning/pull/2482)) -- Fixed model summary input type conversion for models that have input dtype different from model parameters ([#2510](https://github.com/PyTorchLightning/pytorch-lightning/pull/2510)) -- Made `TensorBoardLogger` and `CometLogger` pickleable ([#2518](https://github.com/PyTorchLightning/pytorch-lightning/pull/2518)) -- Fixed a problem with `MLflowLogger` creating multiple run folders ([#2502](https://github.com/PyTorchLightning/pytorch-lightning/pull/2502)) -- Fixed global_step increment ([#2455](https://github.com/PyTorchLightning/pytorch-lightning/pull/2455)) -- Fixed TPU hanging example ([#2488](https://github.com/PyTorchLightning/pytorch-lightning/pull/2488)) -- Fixed `argparse` default value bug ([#2526](https://github.com/PyTorchLightning/pytorch-lightning/pull/2526)) -- Fixed Dice and IoU to avoid NaN by adding small eps 
([#2545](https://github.com/PyTorchLightning/pytorch-lightning/pull/2545)) -- Fixed accumulate gradients schedule at epoch 0 (continued) ([#2513](https://github.com/PyTorchLightning/pytorch-lightning/pull/2513)) -- Fixed Trainer `.fit()` returning last not best weights in "ddp_spawn" ([#2565](https://github.com/PyTorchLightning/pytorch-lightning/pull/2565)) -- Fixed passing (do not pass) TPU weights back on test ([#2566](https://github.com/PyTorchLightning/pytorch-lightning/pull/2566)) -- Fixed DDP tests and `.test()` ([#2512](https://github.com/PyTorchLightning/pytorch-lightning/pull/2512), - [#2570](https://github.com/PyTorchLightning/pytorch-lightning/pull/2570)) +- Flattening Wandb Hyperparameters ([#2459](https://github.com/Lightning-AI/lightning/pull/2459)) +- Fixed using the same DDP python interpreter and actually running ([#2482](https://github.com/Lightning-AI/lightning/pull/2482)) +- Fixed model summary input type conversion for models that have input dtype different from model parameters ([#2510](https://github.com/Lightning-AI/lightning/pull/2510)) +- Made `TensorBoardLogger` and `CometLogger` pickleable ([#2518](https://github.com/Lightning-AI/lightning/pull/2518)) +- Fixed a problem with `MLflowLogger` creating multiple run folders ([#2502](https://github.com/Lightning-AI/lightning/pull/2502)) +- Fixed global_step increment ([#2455](https://github.com/Lightning-AI/lightning/pull/2455)) +- Fixed TPU hanging example ([#2488](https://github.com/Lightning-AI/lightning/pull/2488)) +- Fixed `argparse` default value bug ([#2526](https://github.com/Lightning-AI/lightning/pull/2526)) +- Fixed Dice and IoU to avoid NaN by adding small eps ([#2545](https://github.com/Lightning-AI/lightning/pull/2545)) +- Fixed accumulate gradients schedule at epoch 0 (continued) ([#2513](https://github.com/Lightning-AI/lightning/pull/2513)) +- Fixed Trainer `.fit()` returning last not best weights in "ddp_spawn" ([#2565](https://github.com/Lightning-AI/lightning/pull/2565)) +- Fixed passing (do not pass) TPU weights back on test ([#2566](https://github.com/Lightning-AI/lightning/pull/2566)) +- Fixed DDP tests and `.test()` ([#2512](https://github.com/Lightning-AI/lightning/pull/2512), + [#2570](https://github.com/Lightning-AI/lightning/pull/2570)) ## [0.8.4] - 2020-07-01 ### Added -- Added reduce ddp results on eval ([#2434](https://github.com/PyTorchLightning/pytorch-lightning/pull/2434)) -- Added a warning when an `IterableDataset` has `__len__` defined ([#2437](https://github.com/PyTorchLightning/pytorch-lightning/pull/2437)) +- Added reduce ddp results on eval ([#2434](https://github.com/Lightning-AI/lightning/pull/2434)) +- Added a warning when an `IterableDataset` has `__len__` defined ([#2437](https://github.com/Lightning-AI/lightning/pull/2437)) ### Changed -- Enabled no returns from eval ([#2446](https://github.com/PyTorchLightning/pytorch-lightning/pull/2446)) +- Enabled no returns from eval ([#2446](https://github.com/Lightning-AI/lightning/pull/2446)) ### Fixed -- Fixes train outputs ([#2428](https://github.com/PyTorchLightning/pytorch-lightning/pull/2428)) -- Fixes Conda dependencies ([#2412](https://github.com/PyTorchLightning/pytorch-lightning/pull/2412)) -- Fixed Apex scaling with decoupled backward ([#2433](https://github.com/PyTorchLightning/pytorch-lightning/pull/2433)) -- Fixed crashing or wrong displaying progressbar because of missing ipywidgets ([#2417](https://github.com/PyTorchLightning/pytorch-lightning/pull/2417)) -- Fixed TPU saving dir 
([fc26078e](https://github.com/PyTorchLightning/pytorch-lightning/commit/fc26078e395f8a001f4c6dd7b3fe7ca202f914a3), [04e68f02](https://github.com/PyTorchLightning/pytorch-lightning/commit/04e68f022fc03dd5f1555ee86dea997d42a448ad)) -- Fixed logging on rank 0 only ([#2425](https://github.com/PyTorchLightning/pytorch-lightning/pull/2425)) +- Fixes train outputs ([#2428](https://github.com/Lightning-AI/lightning/pull/2428)) +- Fixes Conda dependencies ([#2412](https://github.com/Lightning-AI/lightning/pull/2412)) +- Fixed Apex scaling with decoupled backward ([#2433](https://github.com/Lightning-AI/lightning/pull/2433)) +- Fixed crashing or wrong displaying progressbar because of missing ipywidgets ([#2417](https://github.com/Lightning-AI/lightning/pull/2417)) +- Fixed TPU saving dir ([fc26078e](https://github.com/Lightning-AI/lightning/commit/fc26078e395f8a001f4c6dd7b3fe7ca202f914a3), [04e68f02](https://github.com/Lightning-AI/lightning/commit/04e68f022fc03dd5f1555ee86dea997d42a448ad)) +- Fixed logging on rank 0 only ([#2425](https://github.com/Lightning-AI/lightning/pull/2425)) ## [0.8.3] - 2020-06-29 ### Fixed -- Fixed AMP wrong call ([593837e](https://github.com/PyTorchLightning/pytorch-lightning/commit/593837e1da24ff6c942b24ed803fc1496a304609)) -- Fixed batch typo ([92d1e75](https://github.com/PyTorchLightning/pytorch-lightning/commit/92d1e75b2638a493d9d21ed5fe00a22093888285)) +- Fixed AMP wrong call ([593837e](https://github.com/Lightning-AI/lightning/commit/593837e1da24ff6c942b24ed803fc1496a304609)) +- Fixed batch typo ([92d1e75](https://github.com/Lightning-AI/lightning/commit/92d1e75b2638a493d9d21ed5fe00a22093888285)) ## [0.8.2] - 2020-06-28 ### Added -- Added TorchText support for moving data to GPU ([#2379](https://github.com/PyTorchLightning/pytorch-lightning/pull/2379)) +- Added TorchText support for moving data to GPU ([#2379](https://github.com/Lightning-AI/lightning/pull/2379)) ### Changed -- Changed epoch indexing from 0 instead of 1 ([#2289](https://github.com/PyTorchLightning/pytorch-lightning/pull/2289)) -- Refactor Model `backward` ([#2276](https://github.com/PyTorchLightning/pytorch-lightning/pull/2276)) -- Refactored `training_batch` + tests to verify correctness ([#2327](https://github.com/PyTorchLightning/pytorch-lightning/pull/2327), - [#2328](https://github.com/PyTorchLightning/pytorch-lightning/pull/2328)) -- Refactored training loop ([#2336](https://github.com/PyTorchLightning/pytorch-lightning/pull/2336)) -- Made optimization steps for hooks ([#2363](https://github.com/PyTorchLightning/pytorch-lightning/pull/2363)) -- Changed default apex level to 'O2' ([#2362](https://github.com/PyTorchLightning/pytorch-lightning/pull/2362)) +- Changed epoch indexing from 0 instead of 1 ([#2289](https://github.com/Lightning-AI/lightning/pull/2289)) +- Refactor Model `backward` ([#2276](https://github.com/Lightning-AI/lightning/pull/2276)) +- Refactored `training_batch` + tests to verify correctness ([#2327](https://github.com/Lightning-AI/lightning/pull/2327), + [#2328](https://github.com/Lightning-AI/lightning/pull/2328)) +- Refactored training loop ([#2336](https://github.com/Lightning-AI/lightning/pull/2336)) +- Made optimization steps for hooks ([#2363](https://github.com/Lightning-AI/lightning/pull/2363)) +- Changed default apex level to 'O2' ([#2362](https://github.com/Lightning-AI/lightning/pull/2362)) ### Removed -- Moved `TrainsLogger` to Bolts ([#2384](https://github.com/PyTorchLightning/pytorch-lightning/pull/2384)) - -### Fixed - -- Fixed parsing TPU arguments and TPU 
tests ([#2094](https://github.com/PyTorchLightning/pytorch-lightning/pull/2094)) -- Fixed number batches in case of multiple dataloaders and `limit_{*}_batches` ([#1920](https://github.com/PyTorchLightning/pytorch-lightning/pull/1920), - [#2226](https://github.com/PyTorchLightning/pytorch-lightning/pull/2226)) -- Fixed an issue with forward hooks not being removed after model summary ([#2298](https://github.com/PyTorchLightning/pytorch-lightning/pull/2298)) -- Fix for `load_from_checkpoint()` not working with absolute path on Windows ([#2294](https://github.com/PyTorchLightning/pytorch-lightning/pull/2294)) -- Fixed an issue how _has_len handles `NotImplementedError` e.g. raised by `torchtext.data.Iterator` ([#2293](https://github.com/PyTorchLightning/pytorch-lightning/pull/2293)), ([#2307](https://github.com/PyTorchLightning/pytorch-lightning/pull/2307)) -- Fixed `average_precision` metric ([#2319](https://github.com/PyTorchLightning/pytorch-lightning/pull/2319)) -- Fixed ROC metric for CUDA tensors ([#2304](https://github.com/PyTorchLightning/pytorch-lightning/pull/2304)) -- Fixed lost compatibility with custom datatypes implementing `.to` ([#2335](https://github.com/PyTorchLightning/pytorch-lightning/pull/2335)) -- Fixed loading model with kwargs ([#2387](https://github.com/PyTorchLightning/pytorch-lightning/pull/2387)) -- Fixed sum(0) for `trainer.num_val_batches` ([#2268](https://github.com/PyTorchLightning/pytorch-lightning/pull/2268)) -- Fixed checking if the parameters are a `DictConfig` Object ([#2216](https://github.com/PyTorchLightning/pytorch-lightning/pull/2216)) -- Fixed SLURM weights saving ([#2341](https://github.com/PyTorchLightning/pytorch-lightning/pull/2341)) -- Fixed swaps LR scheduler order ([#2356](https://github.com/PyTorchLightning/pytorch-lightning/pull/2356)) -- Fixed adding tensorboard `hparams` logging test ([#2342](https://github.com/PyTorchLightning/pytorch-lightning/pull/2342)) -- Fixed use model ref for tear down ([#2360](https://github.com/PyTorchLightning/pytorch-lightning/pull/2360)) -- Fixed logger crash on DDP ([#2388](https://github.com/PyTorchLightning/pytorch-lightning/pull/2388)) -- Fixed several issues with early stopping and checkpoint callbacks ([#1504](https://github.com/PyTorchLightning/pytorch-lightning/pull/1504), - [#2391](https://github.com/PyTorchLightning/pytorch-lightning/pull/2391)) -- Fixed loading past checkpoints from v0.7.x ([#2405](https://github.com/PyTorchLightning/pytorch-lightning/pull/2405)) -- Fixed loading model without arguments ([#2403](https://github.com/PyTorchLightning/pytorch-lightning/pull/2403)) -- Fixed Windows compatibility issue ([#2358](https://github.com/PyTorchLightning/pytorch-lightning/pull/2358)) +- Moved `TrainsLogger` to Bolts ([#2384](https://github.com/Lightning-AI/lightning/pull/2384)) + +### Fixed + +- Fixed parsing TPU arguments and TPU tests ([#2094](https://github.com/Lightning-AI/lightning/pull/2094)) +- Fixed number batches in case of multiple dataloaders and `limit_{*}_batches` ([#1920](https://github.com/Lightning-AI/lightning/pull/1920), + [#2226](https://github.com/Lightning-AI/lightning/pull/2226)) +- Fixed an issue with forward hooks not being removed after model summary ([#2298](https://github.com/Lightning-AI/lightning/pull/2298)) +- Fix for `load_from_checkpoint()` not working with absolute path on Windows ([#2294](https://github.com/Lightning-AI/lightning/pull/2294)) +- Fixed an issue how _has_len handles `NotImplementedError` e.g. 
raised by `torchtext.data.Iterator` ([#2293](https://github.com/Lightning-AI/lightning/pull/2293)), ([#2307](https://github.com/Lightning-AI/lightning/pull/2307)) +- Fixed `average_precision` metric ([#2319](https://github.com/Lightning-AI/lightning/pull/2319)) +- Fixed ROC metric for CUDA tensors ([#2304](https://github.com/Lightning-AI/lightning/pull/2304)) +- Fixed lost compatibility with custom datatypes implementing `.to` ([#2335](https://github.com/Lightning-AI/lightning/pull/2335)) +- Fixed loading model with kwargs ([#2387](https://github.com/Lightning-AI/lightning/pull/2387)) +- Fixed sum(0) for `trainer.num_val_batches` ([#2268](https://github.com/Lightning-AI/lightning/pull/2268)) +- Fixed checking if the parameters are a `DictConfig` Object ([#2216](https://github.com/Lightning-AI/lightning/pull/2216)) +- Fixed SLURM weights saving ([#2341](https://github.com/Lightning-AI/lightning/pull/2341)) +- Fixed swaps LR scheduler order ([#2356](https://github.com/Lightning-AI/lightning/pull/2356)) +- Fixed adding tensorboard `hparams` logging test ([#2342](https://github.com/Lightning-AI/lightning/pull/2342)) +- Fixed use model ref for tear down ([#2360](https://github.com/Lightning-AI/lightning/pull/2360)) +- Fixed logger crash on DDP ([#2388](https://github.com/Lightning-AI/lightning/pull/2388)) +- Fixed several issues with early stopping and checkpoint callbacks ([#1504](https://github.com/Lightning-AI/lightning/pull/1504), + [#2391](https://github.com/Lightning-AI/lightning/pull/2391)) +- Fixed loading past checkpoints from v0.7.x ([#2405](https://github.com/Lightning-AI/lightning/pull/2405)) +- Fixed loading model without arguments ([#2403](https://github.com/Lightning-AI/lightning/pull/2403)) +- Fixed Windows compatibility issue ([#2358](https://github.com/Lightning-AI/lightning/pull/2358)) ## [0.8.1] - 2020-06-19 ### Fixed -- Fixed the `load_from_checkpoint` path detected as URL bug ([#2244](https://github.com/PyTorchLightning/pytorch-lightning/pull/2244)) -- Fixed hooks - added barrier ([#2245](https://github.com/PyTorchLightning/pytorch-lightning/pull/2245), - [#2257](https://github.com/PyTorchLightning/pytorch-lightning/pull/2257), - [#2260](https://github.com/PyTorchLightning/pytorch-lightning/pull/220)) -- Fixed `hparams` - remove frame inspection on `self.hparams` ([#2253](https://github.com/PyTorchLightning/pytorch-lightning/pull/2253)) -- Fixed setup and on fit calls ([#2252](https://github.com/PyTorchLightning/pytorch-lightning/pull/2252)) -- Fixed GPU template ([#2255](https://github.com/PyTorchLightning/pytorch-lightning/pull/2255)) +- Fixed the `load_from_checkpoint` path detected as URL bug ([#2244](https://github.com/Lightning-AI/lightning/pull/2244)) +- Fixed hooks - added barrier ([#2245](https://github.com/Lightning-AI/lightning/pull/2245), + [#2257](https://github.com/Lightning-AI/lightning/pull/2257), + [#2260](https://github.com/Lightning-AI/lightning/pull/220)) +- Fixed `hparams` - remove frame inspection on `self.hparams` ([#2253](https://github.com/Lightning-AI/lightning/pull/2253)) +- Fixed setup and on fit calls ([#2252](https://github.com/Lightning-AI/lightning/pull/2252)) +- Fixed GPU template ([#2255](https://github.com/Lightning-AI/lightning/pull/2255)) ## [0.8.0] - 2020-06-18 ### Added -- Added `overfit_batches`, `limit_{val|test}_batches` flags (overfit now uses training set for all three) ([#2213](https://github.com/PyTorchLightning/pytorch-lightning/pull/2213)) +- Added `overfit_batches`, `limit_{val|test}_batches` flags (overfit now uses training 
set for all three) ([#2213](https://github.com/Lightning-AI/lightning/pull/2213)) - Added metrics - * Base classes ([#1326](https://github.com/PyTorchLightning/pytorch-lightning/pull/1326), - [#1877](https://github.com/PyTorchLightning/pytorch-lightning/pull/1877)) - * Sklearn metrics classes ([#1327](https://github.com/PyTorchLightning/pytorch-lightning/pull/1327)) - * Native torch metrics ([#1488](https://github.com/PyTorchLightning/pytorch-lightning/pull/1488), - [#2062](https://github.com/PyTorchLightning/pytorch-lightning/pull/2062)) - * docs for all Metrics ([#2184](https://github.com/PyTorchLightning/pytorch-lightning/pull/2184), - [#2209](https://github.com/PyTorchLightning/pytorch-lightning/pull/2209)) - * Regression metrics ([#2221](https://github.com/PyTorchLightning/pytorch-lightning/pull/2221)) -- Allow dataloaders without sampler field present ([#1907](https://github.com/PyTorchLightning/pytorch-lightning/pull/1907)) -- Added option `save_last` to save the model at the end of every epoch in `ModelCheckpoint` ([#1908](https://github.com/PyTorchLightning/pytorch-lightning/pull/1908)) -- Early stopping checks `on_validation_end` ([#1458](https://github.com/PyTorchLightning/pytorch-lightning/pull/1458)) -- Speed up single-core TPU training by loading data using `ParallelLoader` ([#2033](https://github.com/PyTorchLightning/pytorch-lightning/pull/2033)) -- Added a model hook `transfer_batch_to_device` that enables moving custom data structures to the target device ([#1756](https://github.com/PyTorchLightning/pytorch-lightning/pull/1756)) -- Added [black](https://black.readthedocs.io/en/stable/) formatter for the code with code-checker on pull ([#1610](https://github.com/PyTorchLightning/pytorch-lightning/pull/1610)) -- Added back the slow spawn ddp implementation as `ddp_spawn` ([#2115](https://github.com/PyTorchLightning/pytorch-lightning/pull/2115)) -- Added loading checkpoints from URLs ([#1667](https://github.com/PyTorchLightning/pytorch-lightning/pull/1667)) -- Added a callback method `on_keyboard_interrupt` for handling KeyboardInterrupt events during training ([#2134](https://github.com/PyTorchLightning/pytorch-lightning/pull/2134)) -- Added a decorator `auto_move_data` that moves data to the correct device when using the LightningModule for inference ([#1905](https://github.com/PyTorchLightning/pytorch-lightning/pull/1905)) -- Added `ckpt_path` option to `LightningModule.test(...)` to load particular checkpoint ([#2190](https://github.com/PyTorchLightning/pytorch-lightning/pull/2190)) -- Added `setup` and `teardown` hooks for model ([#2229](https://github.com/PyTorchLightning/pytorch-lightning/pull/2229)) + * Base classes ([#1326](https://github.com/Lightning-AI/lightning/pull/1326), + [#1877](https://github.com/Lightning-AI/lightning/pull/1877)) + * Sklearn metrics classes ([#1327](https://github.com/Lightning-AI/lightning/pull/1327)) + * Native torch metrics ([#1488](https://github.com/Lightning-AI/lightning/pull/1488), + [#2062](https://github.com/Lightning-AI/lightning/pull/2062)) + * docs for all Metrics ([#2184](https://github.com/Lightning-AI/lightning/pull/2184), + [#2209](https://github.com/Lightning-AI/lightning/pull/2209)) + * Regression metrics ([#2221](https://github.com/Lightning-AI/lightning/pull/2221)) +- Allow dataloaders without sampler field present ([#1907](https://github.com/Lightning-AI/lightning/pull/1907)) +- Added option `save_last` to save the model at the end of every epoch in `ModelCheckpoint` 
([#1908](https://github.com/Lightning-AI/lightning/pull/1908)) +- Early stopping checks `on_validation_end` ([#1458](https://github.com/Lightning-AI/lightning/pull/1458)) +- Speed up single-core TPU training by loading data using `ParallelLoader` ([#2033](https://github.com/Lightning-AI/lightning/pull/2033)) +- Added a model hook `transfer_batch_to_device` that enables moving custom data structures to the target device ([#1756](https://github.com/Lightning-AI/lightning/pull/1756)) +- Added [black](https://black.readthedocs.io/en/stable/) formatter for the code with code-checker on pull ([#1610](https://github.com/Lightning-AI/lightning/pull/1610)) +- Added back the slow spawn ddp implementation as `ddp_spawn` ([#2115](https://github.com/Lightning-AI/lightning/pull/2115)) +- Added loading checkpoints from URLs ([#1667](https://github.com/Lightning-AI/lightning/pull/1667)) +- Added a callback method `on_keyboard_interrupt` for handling KeyboardInterrupt events during training ([#2134](https://github.com/Lightning-AI/lightning/pull/2134)) +- Added a decorator `auto_move_data` that moves data to the correct device when using the LightningModule for inference ([#1905](https://github.com/Lightning-AI/lightning/pull/1905)) +- Added `ckpt_path` option to `LightningModule.test(...)` to load particular checkpoint ([#2190](https://github.com/Lightning-AI/lightning/pull/2190)) +- Added `setup` and `teardown` hooks for model ([#2229](https://github.com/Lightning-AI/lightning/pull/2229)) ### Changed -- Allow user to select individual TPU core to train on ([#1729](https://github.com/PyTorchLightning/pytorch-lightning/pull/1729)) -- Removed non-finite values from loss in `LRFinder` ([#1862](https://github.com/PyTorchLightning/pytorch-lightning/pull/1862)) -- Allow passing model hyperparameters as complete kwarg list ([#1896](https://github.com/PyTorchLightning/pytorch-lightning/pull/1896)) -- Renamed `ModelCheckpoint`'s attributes `best` to `best_model_score` and `kth_best_model` to `kth_best_model_path` ([#1799](https://github.com/PyTorchLightning/pytorch-lightning/pull/1799)) -- Re-Enable Logger's `ImportError`s ([#1938](https://github.com/PyTorchLightning/pytorch-lightning/pull/1938)) -- Changed the default value of the Trainer argument `weights_summary` from `full` to `top` ([#2029](https://github.com/PyTorchLightning/pytorch-lightning/pull/2029)) -- Raise an error when lightning replaces an existing sampler ([#2020](https://github.com/PyTorchLightning/pytorch-lightning/pull/2020)) -- Enabled `prepare_data` from correct processes - clarify local vs global rank ([#2166](https://github.com/PyTorchLightning/pytorch-lightning/pull/2166)) -- Remove explicit flush from tensorboard logger ([#2126](https://github.com/PyTorchLightning/pytorch-lightning/pull/2126)) -- Changed epoch indexing from 1 instead of 0 ([#2206](https://github.com/PyTorchLightning/pytorch-lightning/pull/2206)) +- Allow user to select individual TPU core to train on ([#1729](https://github.com/Lightning-AI/lightning/pull/1729)) +- Removed non-finite values from loss in `LRFinder` ([#1862](https://github.com/Lightning-AI/lightning/pull/1862)) +- Allow passing model hyperparameters as complete kwarg list ([#1896](https://github.com/Lightning-AI/lightning/pull/1896)) +- Renamed `ModelCheckpoint`'s attributes `best` to `best_model_score` and `kth_best_model` to `kth_best_model_path` ([#1799](https://github.com/Lightning-AI/lightning/pull/1799)) +- Re-Enable Logger's `ImportError`s ([#1938](https://github.com/Lightning-AI/lightning/pull/1938)) 
+- Changed the default value of the Trainer argument `weights_summary` from `full` to `top` ([#2029](https://github.com/Lightning-AI/lightning/pull/2029)) +- Raise an error when lightning replaces an existing sampler ([#2020](https://github.com/Lightning-AI/lightning/pull/2020)) +- Enabled `prepare_data` from correct processes - clarify local vs global rank ([#2166](https://github.com/Lightning-AI/lightning/pull/2166)) +- Remove explicit flush from tensorboard logger ([#2126](https://github.com/Lightning-AI/lightning/pull/2126)) +- Changed epoch indexing from 1 instead of 0 ([#2206](https://github.com/Lightning-AI/lightning/pull/2206)) ### Deprecated -- Deprecated flags: ([#2213](https://github.com/PyTorchLightning/pytorch-lightning/pull/2213)) +- Deprecated flags: ([#2213](https://github.com/Lightning-AI/lightning/pull/2213)) * `overfit_pct` in favour of `overfit_batches` * `val_percent_check` in favour of `limit_val_batches` * `test_percent_check` in favour of `limit_test_batches` -- Deprecated `ModelCheckpoint`'s attributes `best` and `kth_best_model` ([#1799](https://github.com/PyTorchLightning/pytorch-lightning/pull/1799)) -- Dropped official support/testing for older PyTorch versions <1.3 ([#1917](https://github.com/PyTorchLightning/pytorch-lightning/pull/1917)) -- Deprecated Trainer `proc_rank` in favour of `global_rank` ([#2166](https://github.com/PyTorchLightning/pytorch-lightning/pull/2166), - [#2269](https://github.com/PyTorchLightning/pytorch-lightning/pull/2269)) +- Deprecated `ModelCheckpoint`'s attributes `best` and `kth_best_model` ([#1799](https://github.com/Lightning-AI/lightning/pull/1799)) +- Dropped official support/testing for older PyTorch versions <1.3 ([#1917](https://github.com/Lightning-AI/lightning/pull/1917)) +- Deprecated Trainer `proc_rank` in favour of `global_rank` ([#2166](https://github.com/Lightning-AI/lightning/pull/2166), + [#2269](https://github.com/Lightning-AI/lightning/pull/2269)) ### Removed -- Removed unintended Trainer argument `progress_bar_callback`, the callback should be passed in by `Trainer(callbacks=[...])` instead ([#1855](https://github.com/PyTorchLightning/pytorch-lightning/pull/1855)) -- Removed obsolete `self._device` in Trainer ([#1849](https://github.com/PyTorchLightning/pytorch-lightning/pull/1849)) -- Removed deprecated API ([#2073](https://github.com/PyTorchLightning/pytorch-lightning/pull/2073)) +- Removed unintended Trainer argument `progress_bar_callback`, the callback should be passed in by `Trainer(callbacks=[...])` instead ([#1855](https://github.com/Lightning-AI/lightning/pull/1855)) +- Removed obsolete `self._device` in Trainer ([#1849](https://github.com/Lightning-AI/lightning/pull/1849)) +- Removed deprecated API ([#2073](https://github.com/Lightning-AI/lightning/pull/2073)) * Packages: `pytorch_lightning.pt_overrides`, `pytorch_lightning.root_module` * Modules: `pytorch_lightning.logging.comet_logger`, `pytorch_lightning.logging.mlflow_logger`, `pytorch_lightning.logging.test_tube_logger`, `pytorch_lightning.overrides.override_data_parallel`, `pytorch_lightning.core.model_saving`, `pytorch_lightning.core.root_module` * Trainer arguments: `add_row_log_interval`, `default_save_path`, `gradient_clip`, `nb_gpu_nodes`, `max_nb_epochs`, `min_nb_epochs`, `nb_sanity_val_steps` @@ -3207,386 +3207,386 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Fixed -- Run graceful training teardown on interpreter exit ([#1631](https://github.com/PyTorchLightning/pytorch-lightning/pull/1631)) -- Fixed user warning when apex was used together with learning rate schedulers ([#1873](https://github.com/PyTorchLightning/pytorch-lightning/pull/1873)) -- Fixed multiple calls of `EarlyStopping` callback ([#1863](https://github.com/PyTorchLightning/pytorch-lightning/pull/1863)) -- Fixed an issue with `Trainer.from_argparse_args` when passing in unknown Trainer args ([#1932](https://github.com/PyTorchLightning/pytorch-lightning/pull/1932)) -- Fixed bug related to logger not being reset correctly for model after tuner algorithms ([#1933](https://github.com/PyTorchLightning/pytorch-lightning/pull/1933)) -- Fixed root node resolution for SLURM cluster with dash in host name ([#1954](https://github.com/PyTorchLightning/pytorch-lightning/pull/1954)) -- Fixed `LearningRateLogger` in multi-scheduler setting ([#1944](https://github.com/PyTorchLightning/pytorch-lightning/pull/1944)) -- Fixed test configuration check and testing ([#1804](https://github.com/PyTorchLightning/pytorch-lightning/pull/1804)) -- Fixed an issue with Trainer constructor silently ignoring unknown/misspelled arguments ([#1820](https://github.com/PyTorchLightning/pytorch-lightning/pull/1820)) -- Fixed `save_weights_only` in ModelCheckpoint ([#1780](https://github.com/PyTorchLightning/pytorch-lightning/pull/1780)) -- Allow use of same `WandbLogger` instance for multiple training loops ([#2055](https://github.com/PyTorchLightning/pytorch-lightning/pull/2055)) -- Fixed an issue with `_auto_collect_arguments` collecting local variables that are not constructor arguments and not working for signatures that have the instance not named `self` ([#2048](https://github.com/PyTorchLightning/pytorch-lightning/pull/2048)) -- Fixed mistake in parameters' grad norm tracking ([#2012](https://github.com/PyTorchLightning/pytorch-lightning/pull/2012)) -- Fixed CPU and hanging GPU crash ([#2118](https://github.com/PyTorchLightning/pytorch-lightning/pull/2118)) -- Fixed an issue with the model summary and `example_input_array` depending on a specific ordering of the submodules in a LightningModule ([#1773](https://github.com/PyTorchLightning/pytorch-lightning/pull/1773)) -- Fixed Tpu logging ([#2230](https://github.com/PyTorchLightning/pytorch-lightning/pull/2230)) -- Fixed Pid port + duplicate `rank_zero` logging ([#2140](https://github.com/PyTorchLightning/pytorch-lightning/pull/2140), - [#2231](https://github.com/PyTorchLightning/pytorch-lightning/pull/2231)) +- Run graceful training teardown on interpreter exit ([#1631](https://github.com/Lightning-AI/lightning/pull/1631)) +- Fixed user warning when apex was used together with learning rate schedulers ([#1873](https://github.com/Lightning-AI/lightning/pull/1873)) +- Fixed multiple calls of `EarlyStopping` callback ([#1863](https://github.com/Lightning-AI/lightning/pull/1863)) +- Fixed an issue with `Trainer.from_argparse_args` when passing in unknown Trainer args ([#1932](https://github.com/Lightning-AI/lightning/pull/1932)) +- Fixed bug related to logger not being reset correctly for model after tuner algorithms ([#1933](https://github.com/Lightning-AI/lightning/pull/1933)) +- Fixed root node resolution for SLURM cluster with dash in host name ([#1954](https://github.com/Lightning-AI/lightning/pull/1954)) +- Fixed `LearningRateLogger` in multi-scheduler setting ([#1944](https://github.com/Lightning-AI/lightning/pull/1944)) +- Fixed test configuration 
check and testing ([#1804](https://github.com/Lightning-AI/lightning/pull/1804)) +- Fixed an issue with Trainer constructor silently ignoring unknown/misspelled arguments ([#1820](https://github.com/Lightning-AI/lightning/pull/1820)) +- Fixed `save_weights_only` in ModelCheckpoint ([#1780](https://github.com/Lightning-AI/lightning/pull/1780)) +- Allow use of same `WandbLogger` instance for multiple training loops ([#2055](https://github.com/Lightning-AI/lightning/pull/2055)) +- Fixed an issue with `_auto_collect_arguments` collecting local variables that are not constructor arguments and not working for signatures that have the instance not named `self` ([#2048](https://github.com/Lightning-AI/lightning/pull/2048)) +- Fixed mistake in parameters' grad norm tracking ([#2012](https://github.com/Lightning-AI/lightning/pull/2012)) +- Fixed CPU and hanging GPU crash ([#2118](https://github.com/Lightning-AI/lightning/pull/2118)) +- Fixed an issue with the model summary and `example_input_array` depending on a specific ordering of the submodules in a LightningModule ([#1773](https://github.com/Lightning-AI/lightning/pull/1773)) +- Fixed Tpu logging ([#2230](https://github.com/Lightning-AI/lightning/pull/2230)) +- Fixed Pid port + duplicate `rank_zero` logging ([#2140](https://github.com/Lightning-AI/lightning/pull/2140), + [#2231](https://github.com/Lightning-AI/lightning/pull/2231)) ## [0.7.6] - 2020-05-16 ### Added -- Added callback for logging learning rates ([#1498](https://github.com/PyTorchLightning/pytorch-lightning/pull/1498)) -- Added transfer learning example (for a binary classification task in computer vision) ([#1564](https://github.com/PyTorchLightning/pytorch-lightning/pull/1564)) -- Added type hints in `Trainer.fit()` and `Trainer.test()` to reflect that also a list of dataloaders can be passed in ([#1723](https://github.com/PyTorchLightning/pytorch-lightning/pull/1723)). 
-- Added auto scaling of batch size ([#1638](https://github.com/PyTorchLightning/pytorch-lightning/pull/1638)) -- The progress bar metrics now also get updated in `training_epoch_end` ([#1724](https://github.com/PyTorchLightning/pytorch-lightning/pull/1724)) -- Enable `NeptuneLogger` to work with `distributed_backend=ddp` ([#1753](https://github.com/PyTorchLightning/pytorch-lightning/pull/1753)) -- Added option to provide seed to random generators to ensure reproducibility ([#1572](https://github.com/PyTorchLightning/pytorch-lightning/pull/1572)) -- Added override for hparams in `load_from_ckpt` ([#1797](https://github.com/PyTorchLightning/pytorch-lightning/pull/1797)) -- Added support multi-node distributed execution under `torchelastic` ([#1811](https://github.com/PyTorchLightning/pytorch-lightning/pull/1811), - [#1818](https://github.com/PyTorchLightning/pytorch-lightning/pull/1818)) -- Added using `store_true` for bool args ([#1822](https://github.com/PyTorchLightning/pytorch-lightning/pull/1822), - [#1842](https://github.com/PyTorchLightning/pytorch-lightning/pull/1842)) -- Added dummy logger for internally disabling logging for some features ([#1836](https://github.com/PyTorchLightning/pytorch-lightning/pull/1836)) +- Added callback for logging learning rates ([#1498](https://github.com/Lightning-AI/lightning/pull/1498)) +- Added transfer learning example (for a binary classification task in computer vision) ([#1564](https://github.com/Lightning-AI/lightning/pull/1564)) +- Added type hints in `Trainer.fit()` and `Trainer.test()` to reflect that also a list of dataloaders can be passed in ([#1723](https://github.com/Lightning-AI/lightning/pull/1723)). +- Added auto scaling of batch size ([#1638](https://github.com/Lightning-AI/lightning/pull/1638)) +- The progress bar metrics now also get updated in `training_epoch_end` ([#1724](https://github.com/Lightning-AI/lightning/pull/1724)) +- Enable `NeptuneLogger` to work with `distributed_backend=ddp` ([#1753](https://github.com/Lightning-AI/lightning/pull/1753)) +- Added option to provide seed to random generators to ensure reproducibility ([#1572](https://github.com/Lightning-AI/lightning/pull/1572)) +- Added override for hparams in `load_from_ckpt` ([#1797](https://github.com/Lightning-AI/lightning/pull/1797)) +- Added support multi-node distributed execution under `torchelastic` ([#1811](https://github.com/Lightning-AI/lightning/pull/1811), + [#1818](https://github.com/Lightning-AI/lightning/pull/1818)) +- Added using `store_true` for bool args ([#1822](https://github.com/Lightning-AI/lightning/pull/1822), + [#1842](https://github.com/Lightning-AI/lightning/pull/1842)) +- Added dummy logger for internally disabling logging for some features ([#1836](https://github.com/Lightning-AI/lightning/pull/1836)) ### Changed -- Enable `non-blocking` for device transfers to GPU ([#1843](https://github.com/PyTorchLightning/pytorch-lightning/pull/1843)) -- Replace mata_tags.csv with hparams.yaml ([#1271](https://github.com/PyTorchLightning/pytorch-lightning/pull/1271)) -- Reduction when `batch_size < num_gpus` ([#1609](https://github.com/PyTorchLightning/pytorch-lightning/pull/1609)) -- Updated LightningTemplateModel to look more like Colab example ([#1577](https://github.com/PyTorchLightning/pytorch-lightning/pull/1577)) -- Don't convert `namedtuple` to `tuple` when transferring the batch to target device ([#1589](https://github.com/PyTorchLightning/pytorch-lightning/pull/1589)) -- Allow passing hparams as keyword argument to LightningModule when 
loading from checkpoint ([#1639](https://github.com/PyTorchLightning/pytorch-lightning/pull/1639)) -- Args should come after the last positional argument ([#1807](https://github.com/PyTorchLightning/pytorch-lightning/pull/1807)) -- Made ddp the default if no backend specified with multiple GPUs ([#1789](https://github.com/PyTorchLightning/pytorch-lightning/pull/1789)) +- Enable `non-blocking` for device transfers to GPU ([#1843](https://github.com/Lightning-AI/lightning/pull/1843)) +- Replace mata_tags.csv with hparams.yaml ([#1271](https://github.com/Lightning-AI/lightning/pull/1271)) +- Reduction when `batch_size < num_gpus` ([#1609](https://github.com/Lightning-AI/lightning/pull/1609)) +- Updated LightningTemplateModel to look more like Colab example ([#1577](https://github.com/Lightning-AI/lightning/pull/1577)) +- Don't convert `namedtuple` to `tuple` when transferring the batch to target device ([#1589](https://github.com/Lightning-AI/lightning/pull/1589)) +- Allow passing hparams as keyword argument to LightningModule when loading from checkpoint ([#1639](https://github.com/Lightning-AI/lightning/pull/1639)) +- Args should come after the last positional argument ([#1807](https://github.com/Lightning-AI/lightning/pull/1807)) +- Made ddp the default if no backend specified with multiple GPUs ([#1789](https://github.com/Lightning-AI/lightning/pull/1789)) ### Deprecated -- Deprecated `tags_csv` in favor of `hparams_file` ([#1271](https://github.com/PyTorchLightning/pytorch-lightning/pull/1271)) +- Deprecated `tags_csv` in favor of `hparams_file` ([#1271](https://github.com/Lightning-AI/lightning/pull/1271)) ### Fixed -- Fixed broken link in PR template ([#1675](https://github.com/PyTorchLightning/pytorch-lightning/pull/1675)) -- Fixed ModelCheckpoint not None checking filepath ([#1654](https://github.com/PyTorchLightning/pytorch-lightning/pull/1654)) -- Trainer now calls `on_load_checkpoint()` when resuming from a checkpoint ([#1666](https://github.com/PyTorchLightning/pytorch-lightning/pull/1666)) -- Fixed sampler logic for ddp with iterable dataset ([#1734](https://github.com/PyTorchLightning/pytorch-lightning/pull/1734)) -- Fixed `_reset_eval_dataloader()` for IterableDataset ([#1560](https://github.com/PyTorchLightning/pytorch-lightning/pull/1560)) -- Fixed Horovod distributed backend to set the `root_gpu` property ([#1669](https://github.com/PyTorchLightning/pytorch-lightning/pull/1669)) -- Fixed wandb logger `global_step` affects other loggers ([#1492](https://github.com/PyTorchLightning/pytorch-lightning/pull/1492)) -- Fixed disabling progress bar on non-zero ranks using Horovod backend ([#1709](https://github.com/PyTorchLightning/pytorch-lightning/pull/1709)) -- Fixed bugs that prevent lr finder to be used together with early stopping and validation dataloaders ([#1676](https://github.com/PyTorchLightning/pytorch-lightning/pull/1676)) -- Fixed a bug in Trainer that prepended the checkpoint path with `version_` when it shouldn't ([#1748](https://github.com/PyTorchLightning/pytorch-lightning/pull/1748)) -- Fixed lr key name in case of param groups in LearningRateLogger ([#1719](https://github.com/PyTorchLightning/pytorch-lightning/pull/1719)) -- Fixed accumulation parameter and suggestion method for learning rate finder ([#1801](https://github.com/PyTorchLightning/pytorch-lightning/pull/1801)) -- Fixed num processes wasn't being set properly and auto sampler was ddp failing ([#1819](https://github.com/PyTorchLightning/pytorch-lightning/pull/1819)) -- Fixed bugs in semantic 
segmentation example ([#1824](https://github.com/PyTorchLightning/pytorch-lightning/pull/1824)) -- Fixed saving native AMP scaler state ([#1777](https://github.com/PyTorchLightning/pytorch-lightning/pull/1777)) -- Fixed native amp + ddp ([#1788](https://github.com/PyTorchLightning/pytorch-lightning/pull/1788)) -- Fixed `hparam` logging with metrics ([#1647](https://github.com/PyTorchLightning/pytorch-lightning/pull/1647)) +- Fixed broken link in PR template ([#1675](https://github.com/Lightning-AI/lightning/pull/1675)) +- Fixed ModelCheckpoint not None checking filepath ([#1654](https://github.com/Lightning-AI/lightning/pull/1654)) +- Trainer now calls `on_load_checkpoint()` when resuming from a checkpoint ([#1666](https://github.com/Lightning-AI/lightning/pull/1666)) +- Fixed sampler logic for ddp with iterable dataset ([#1734](https://github.com/Lightning-AI/lightning/pull/1734)) +- Fixed `_reset_eval_dataloader()` for IterableDataset ([#1560](https://github.com/Lightning-AI/lightning/pull/1560)) +- Fixed Horovod distributed backend to set the `root_gpu` property ([#1669](https://github.com/Lightning-AI/lightning/pull/1669)) +- Fixed wandb logger `global_step` affects other loggers ([#1492](https://github.com/Lightning-AI/lightning/pull/1492)) +- Fixed disabling progress bar on non-zero ranks using Horovod backend ([#1709](https://github.com/Lightning-AI/lightning/pull/1709)) +- Fixed bugs that prevent lr finder to be used together with early stopping and validation dataloaders ([#1676](https://github.com/Lightning-AI/lightning/pull/1676)) +- Fixed a bug in Trainer that prepended the checkpoint path with `version_` when it shouldn't ([#1748](https://github.com/Lightning-AI/lightning/pull/1748)) +- Fixed lr key name in case of param groups in LearningRateLogger ([#1719](https://github.com/Lightning-AI/lightning/pull/1719)) +- Fixed accumulation parameter and suggestion method for learning rate finder ([#1801](https://github.com/Lightning-AI/lightning/pull/1801)) +- Fixed num processes wasn't being set properly and auto sampler was ddp failing ([#1819](https://github.com/Lightning-AI/lightning/pull/1819)) +- Fixed bugs in semantic segmentation example ([#1824](https://github.com/Lightning-AI/lightning/pull/1824)) +- Fixed saving native AMP scaler state ([#1777](https://github.com/Lightning-AI/lightning/pull/1777)) +- Fixed native amp + ddp ([#1788](https://github.com/Lightning-AI/lightning/pull/1788)) +- Fixed `hparam` logging with metrics ([#1647](https://github.com/Lightning-AI/lightning/pull/1647)) ## [0.7.5] - 2020-04-27 ### Changed -- Allow logging of metrics together with `hparams` ([#1630](https://github.com/PyTorchLightning/pytorch-lightning/pull/1630)) +- Allow logging of metrics together with `hparams` ([#1630](https://github.com/Lightning-AI/lightning/pull/1630)) ### Removed -- Removed Warning from trainer loop ([#1634](https://github.com/PyTorchLightning/pytorch-lightning/pull/1634)) +- Removed Warning from trainer loop ([#1634](https://github.com/Lightning-AI/lightning/pull/1634)) ### Fixed -- Fixed ModelCheckpoint not being fixable ([#1632](https://github.com/PyTorchLightning/pytorch-lightning/pull/1632)) -- Fixed CPU DDP breaking change and DDP change ([#1635](https://github.com/PyTorchLightning/pytorch-lightning/pull/1635)) -- Tested pickling ([#1636](https://github.com/PyTorchLightning/pytorch-lightning/pull/1636)) +- Fixed ModelCheckpoint not being fixable ([#1632](https://github.com/Lightning-AI/lightning/pull/1632)) +- Fixed CPU DDP breaking change and DDP change 
([#1635](https://github.com/Lightning-AI/lightning/pull/1635)) +- Tested pickling ([#1636](https://github.com/Lightning-AI/lightning/pull/1636)) ## [0.7.4] - 2020-04-26 ### Added -- Added flag `replace_sampler_ddp` to manually disable sampler replacement in DDP ([#1513](https://github.com/PyTorchLightning/pytorch-lightning/pull/1513)) +- Added flag `replace_sampler_ddp` to manually disable sampler replacement in DDP ([#1513](https://github.com/Lightning-AI/lightning/pull/1513)) - Added `auto_select_gpus` flag to trainer that enables automatic selection of available GPUs on exclusive mode systems. -- Added learning rate finder ([#1347](https://github.com/PyTorchLightning/pytorch-lightning/pull/1347)) -- Added support for DDP mode in clusters without SLURM ([#1387](https://github.com/PyTorchLightning/pytorch-lightning/pull/1387)) -- Added `test_dataloaders` parameter to `Trainer.test()` ([#1434](https://github.com/PyTorchLightning/pytorch-lightning/pull/1434)) -- Added `terminate_on_nan` flag to trainer that performs a NaN check with each training iteration when set to `True` ([#1475](https://github.com/PyTorchLightning/pytorch-lightning/pull/1475)) -- Added speed parity tests (max 1 sec difference per epoch)([#1482](https://github.com/PyTorchLightning/pytorch-lightning/pull/1482)) -- Added `ddp_cpu` backend for testing ddp without GPUs ([#1158](https://github.com/PyTorchLightning/pytorch-lightning/pull/1158)) -- Added [Horovod](http://horovod.ai) support as a distributed backend `Trainer(distributed_backend='horovod')` ([#1529](https://github.com/PyTorchLightning/pytorch-lightning/pull/1529)) -- Added support for 8 core distributed training on Kaggle TPU's ([#1568](https://github.com/PyTorchLightning/pytorch-lightning/pull/1568)) -- Added support for native AMP ([#1561](https://github.com/PyTorchLightning/pytorch-lightning/pull/1561), - [#1580](https://github.com/PyTorchLightning/pytorch-lightning/pull/1580)) +- Added learning rate finder ([#1347](https://github.com/Lightning-AI/lightning/pull/1347)) +- Added support for DDP mode in clusters without SLURM ([#1387](https://github.com/Lightning-AI/lightning/pull/1387)) +- Added `test_dataloaders` parameter to `Trainer.test()` ([#1434](https://github.com/Lightning-AI/lightning/pull/1434)) +- Added `terminate_on_nan` flag to trainer that performs a NaN check with each training iteration when set to `True` ([#1475](https://github.com/Lightning-AI/lightning/pull/1475)) +- Added speed parity tests (max 1 sec difference per epoch)([#1482](https://github.com/Lightning-AI/lightning/pull/1482)) +- Added `ddp_cpu` backend for testing ddp without GPUs ([#1158](https://github.com/Lightning-AI/lightning/pull/1158)) +- Added [Horovod](http://horovod.ai) support as a distributed backend `Trainer(distributed_backend='horovod')` ([#1529](https://github.com/Lightning-AI/lightning/pull/1529)) +- Added support for 8 core distributed training on Kaggle TPU's ([#1568](https://github.com/Lightning-AI/lightning/pull/1568)) +- Added support for native AMP ([#1561](https://github.com/Lightning-AI/lightning/pull/1561), + [#1580](https://github.com/Lightning-AI/lightning/pull/1580)) ### Changed -- Changed the default behaviour to no longer include a NaN check with each training iteration ([#1475](https://github.com/PyTorchLightning/pytorch-lightning/pull/1475)) -- Decoupled the progress bar from trainer` it is a callback now and can be customized or even be replaced entirely ([#1450](https://github.com/PyTorchLightning/pytorch-lightning/pull/1450)). 
-- Changed lr schedule step interval behavior to update every backwards pass instead of every forwards pass ([#1477](https://github.com/PyTorchLightning/pytorch-lightning/pull/1477)) -- Defines shared proc. rank, remove rank from instances (e.g. loggers) ([#1408](https://github.com/PyTorchLightning/pytorch-lightning/pull/1408)) -- Updated semantic segmentation example with custom U-Net and logging ([#1371](https://github.com/PyTorchLightning/pytorch-lightning/pull/1371)) -- Disabled val and test shuffling ([#1600](https://github.com/PyTorchLightning/pytorch-lightning/pull/1600)) +- Changed the default behaviour to no longer include a NaN check with each training iteration ([#1475](https://github.com/Lightning-AI/lightning/pull/1475)) +- Decoupled the progress bar from trainer` it is a callback now and can be customized or even be replaced entirely ([#1450](https://github.com/Lightning-AI/lightning/pull/1450)). +- Changed lr schedule step interval behavior to update every backwards pass instead of every forwards pass ([#1477](https://github.com/Lightning-AI/lightning/pull/1477)) +- Defines shared proc. rank, remove rank from instances (e.g. loggers) ([#1408](https://github.com/Lightning-AI/lightning/pull/1408)) +- Updated semantic segmentation example with custom U-Net and logging ([#1371](https://github.com/Lightning-AI/lightning/pull/1371)) +- Disabled val and test shuffling ([#1600](https://github.com/Lightning-AI/lightning/pull/1600)) ### Deprecated -- Deprecated `training_tqdm_dict` in favor of `progress_bar_dict` ([#1450](https://github.com/PyTorchLightning/pytorch-lightning/pull/1450)). +- Deprecated `training_tqdm_dict` in favor of `progress_bar_dict` ([#1450](https://github.com/Lightning-AI/lightning/pull/1450)). ### Removed -- Removed `test_dataloaders` parameter from `Trainer.fit()` ([#1434](https://github.com/PyTorchLightning/pytorch-lightning/pull/1434)) +- Removed `test_dataloaders` parameter from `Trainer.fit()` ([#1434](https://github.com/Lightning-AI/lightning/pull/1434)) ### Fixed -- Added the possibility to pass nested metrics dictionaries to loggers ([#1582](https://github.com/PyTorchLightning/pytorch-lightning/pull/1582)) -- Fixed memory leak from opt return ([#1528](https://github.com/PyTorchLightning/pytorch-lightning/pull/1528)) -- Fixed saving checkpoint before deleting old ones ([#1453](https://github.com/PyTorchLightning/pytorch-lightning/pull/1453)) -- Fixed loggers - flushing last logged metrics even before continue, e.g. `trainer.test()` results ([#1459](https://github.com/PyTorchLightning/pytorch-lightning/pull/1459)) -- Fixed optimizer configuration when `configure_optimizers` returns dict without `lr_scheduler` ([#1443](https://github.com/PyTorchLightning/pytorch-lightning/pull/1443)) -- Fixed `LightningModule` - mixing hparams and arguments in `LightningModule.__init__()` crashes load_from_checkpoint() ([#1505](https://github.com/PyTorchLightning/pytorch-lightning/pull/1505)) -- Added a missing call to the `on_before_zero_grad` model hook ([#1493](https://github.com/PyTorchLightning/pytorch-lightning/pull/1493)). -- Allow use of sweeps with `WandbLogger` ([#1512](https://github.com/PyTorchLightning/pytorch-lightning/pull/1512)) -- Fixed a bug that caused the `callbacks` Trainer argument to reference a global variable ([#1534](https://github.com/PyTorchLightning/pytorch-lightning/pull/1534)). 
-- Fixed a bug that set all boolean CLI arguments from `Trainer.add_argparse_args` always to True ([#1571](https://github.com/PyTorchLightning/pytorch-lightning/pull/1571)) -- Fixed do not copy the batch when training on a single GPU ([#1576](https://github.com/PyTorchLightning/pytorch-lightning/pull/1576), - [#1579](https://github.com/PyTorchLightning/pytorch-lightning/pull/1579)) -- Fixed soft checkpoint removing on DDP ([#1408](https://github.com/PyTorchLightning/pytorch-lightning/pull/1408)) -- Fixed automatic parser bug ([#1585](https://github.com/PyTorchLightning/pytorch-lightning/pull/1585)) -- Fixed bool conversion from string ([#1606](https://github.com/PyTorchLightning/pytorch-lightning/pull/1606)) +- Added the possibility to pass nested metrics dictionaries to loggers ([#1582](https://github.com/Lightning-AI/lightning/pull/1582)) +- Fixed memory leak from opt return ([#1528](https://github.com/Lightning-AI/lightning/pull/1528)) +- Fixed saving checkpoint before deleting old ones ([#1453](https://github.com/Lightning-AI/lightning/pull/1453)) +- Fixed loggers - flushing last logged metrics even before continue, e.g. `trainer.test()` results ([#1459](https://github.com/Lightning-AI/lightning/pull/1459)) +- Fixed optimizer configuration when `configure_optimizers` returns dict without `lr_scheduler` ([#1443](https://github.com/Lightning-AI/lightning/pull/1443)) +- Fixed `LightningModule` - mixing hparams and arguments in `LightningModule.__init__()` crashes load_from_checkpoint() ([#1505](https://github.com/Lightning-AI/lightning/pull/1505)) +- Added a missing call to the `on_before_zero_grad` model hook ([#1493](https://github.com/Lightning-AI/lightning/pull/1493)). +- Allow use of sweeps with `WandbLogger` ([#1512](https://github.com/Lightning-AI/lightning/pull/1512)) +- Fixed a bug that caused the `callbacks` Trainer argument to reference a global variable ([#1534](https://github.com/Lightning-AI/lightning/pull/1534)). 
+- Fixed a bug that set all boolean CLI arguments from `Trainer.add_argparse_args` always to True ([#1571](https://github.com/Lightning-AI/lightning/pull/1571)) +- Fixed do not copy the batch when training on a single GPU ([#1576](https://github.com/Lightning-AI/lightning/pull/1576), + [#1579](https://github.com/Lightning-AI/lightning/pull/1579)) +- Fixed soft checkpoint removing on DDP ([#1408](https://github.com/Lightning-AI/lightning/pull/1408)) +- Fixed automatic parser bug ([#1585](https://github.com/Lightning-AI/lightning/pull/1585)) +- Fixed bool conversion from string ([#1606](https://github.com/Lightning-AI/lightning/pull/1606)) ## [0.7.3] - 2020-04-09 ### Added -- Added `rank_zero_warn` for warning only in rank 0 ([#1428](https://github.com/PyTorchLightning/pytorch-lightning/pull/1428)) +- Added `rank_zero_warn` for warning only in rank 0 ([#1428](https://github.com/Lightning-AI/lightning/pull/1428)) ### Fixed -- Fixed default `DistributedSampler` for DDP training ([#1425](https://github.com/PyTorchLightning/pytorch-lightning/pull/1425)) -- Fixed workers warning not on windows ([#1430](https://github.com/PyTorchLightning/pytorch-lightning/pull/1430)) -- Fixed returning tuple from `run_training_batch` ([#1431](https://github.com/PyTorchLightning/pytorch-lightning/pull/1431)) -- Fixed gradient clipping ([#1438](https://github.com/PyTorchLightning/pytorch-lightning/pull/1438)) -- Fixed pretty print ([#1441](https://github.com/PyTorchLightning/pytorch-lightning/pull/1441)) +- Fixed default `DistributedSampler` for DDP training ([#1425](https://github.com/Lightning-AI/lightning/pull/1425)) +- Fixed workers warning not on windows ([#1430](https://github.com/Lightning-AI/lightning/pull/1430)) +- Fixed returning tuple from `run_training_batch` ([#1431](https://github.com/Lightning-AI/lightning/pull/1431)) +- Fixed gradient clipping ([#1438](https://github.com/Lightning-AI/lightning/pull/1438)) +- Fixed pretty print ([#1441](https://github.com/Lightning-AI/lightning/pull/1441)) ## [0.7.2] - 2020-04-07 ### Added -- Added same step loggers' metrics aggregation ([#1278](https://github.com/PyTorchLightning/pytorch-lightning/pull/1278)) -- Added parity test between a vanilla MNIST model and lightning model ([#1284](https://github.com/PyTorchLightning/pytorch-lightning/pull/1284)) -- Added parity test between a vanilla RNN model and lightning model ([#1351](https://github.com/PyTorchLightning/pytorch-lightning/pull/1351)) -- Added Reinforcement Learning - Deep Q-network (DQN) lightning example ([#1232](https://github.com/PyTorchLightning/pytorch-lightning/pull/1232)) -- Added support for hierarchical `dict` ([#1152](https://github.com/PyTorchLightning/pytorch-lightning/pull/1152)) -- Added `TrainsLogger` class ([#1122](https://github.com/PyTorchLightning/pytorch-lightning/pull/1122)) -- Added type hints to `pytorch_lightning.core` ([#946](https://github.com/PyTorchLightning/pytorch-lightning/pull/946)) -- Added support for `IterableDataset` in validation and testing ([#1104](https://github.com/PyTorchLightning/pytorch-lightning/pull/1104)) -- Added support for non-primitive types in `hparams` for `TensorboardLogger` ([#1130](https://github.com/PyTorchLightning/pytorch-lightning/pull/1130)) -- Added a check that stops the training when loss or weights contain `NaN` or `inf` values. ([#1097](https://github.com/PyTorchLightning/pytorch-lightning/pull/1097)) -- Added support for `IterableDataset` when `val_check_interval=1.0` (default), this will trigger validation at the end of each epoch. 
([#1283](https://github.com/PyTorchLightning/pytorch-lightning/pull/1283)) -- Added `summary` method to Profilers. ([#1259](https://github.com/PyTorchLightning/pytorch-lightning/pull/1259)) -- Added informative errors if user defined dataloader has zero length ([#1280](https://github.com/PyTorchLightning/pytorch-lightning/pull/1280)) -- Added testing for python 3.8 ([#915](https://github.com/PyTorchLightning/pytorch-lightning/pull/915)) -- Added model configuration checking ([#1199](https://github.com/PyTorchLightning/pytorch-lightning/pull/1199)) -- Added support for optimizer frequencies through `LightningModule.configure_optimizers()` ([#1269](https://github.com/PyTorchLightning/pytorch-lightning/pull/1269)) -- Added option to run without an optimizer by returning `None` from `configure_optimizers`. ([#1279](https://github.com/PyTorchLightning/pytorch-lightning/pull/1279)) -- Added a warning when the number of data loader workers is small. ([#1378](https://github.com/PyTorchLightning/pytorch-lightning/pull/1378)) +- Added same step loggers' metrics aggregation ([#1278](https://github.com/Lightning-AI/lightning/pull/1278)) +- Added parity test between a vanilla MNIST model and lightning model ([#1284](https://github.com/Lightning-AI/lightning/pull/1284)) +- Added parity test between a vanilla RNN model and lightning model ([#1351](https://github.com/Lightning-AI/lightning/pull/1351)) +- Added Reinforcement Learning - Deep Q-network (DQN) lightning example ([#1232](https://github.com/Lightning-AI/lightning/pull/1232)) +- Added support for hierarchical `dict` ([#1152](https://github.com/Lightning-AI/lightning/pull/1152)) +- Added `TrainsLogger` class ([#1122](https://github.com/Lightning-AI/lightning/pull/1122)) +- Added type hints to `pytorch_lightning.core` ([#946](https://github.com/Lightning-AI/lightning/pull/946)) +- Added support for `IterableDataset` in validation and testing ([#1104](https://github.com/Lightning-AI/lightning/pull/1104)) +- Added support for non-primitive types in `hparams` for `TensorboardLogger` ([#1130](https://github.com/Lightning-AI/lightning/pull/1130)) +- Added a check that stops the training when loss or weights contain `NaN` or `inf` values. ([#1097](https://github.com/Lightning-AI/lightning/pull/1097)) +- Added support for `IterableDataset` when `val_check_interval=1.0` (default), this will trigger validation at the end of each epoch. ([#1283](https://github.com/Lightning-AI/lightning/pull/1283)) +- Added `summary` method to Profilers. ([#1259](https://github.com/Lightning-AI/lightning/pull/1259)) +- Added informative errors if user defined dataloader has zero length ([#1280](https://github.com/Lightning-AI/lightning/pull/1280)) +- Added testing for python 3.8 ([#915](https://github.com/Lightning-AI/lightning/pull/915)) +- Added model configuration checking ([#1199](https://github.com/Lightning-AI/lightning/pull/1199)) +- Added support for optimizer frequencies through `LightningModule.configure_optimizers()` ([#1269](https://github.com/Lightning-AI/lightning/pull/1269)) +- Added option to run without an optimizer by returning `None` from `configure_optimizers`. ([#1279](https://github.com/Lightning-AI/lightning/pull/1279)) +- Added a warning when the number of data loader workers is small. ([#1378](https://github.com/Lightning-AI/lightning/pull/1378)) ### Changed -- Changed (renamed and refatored) `TensorRunningMean` -> `TensorRunningAccum`: running accumulations were generalized. 
([#1278](https://github.com/PyTorchLightning/pytorch-lightning/pull/1278)) -- Changed `progress_bar_refresh_rate` trainer flag to disable progress bar when set to 0. ([#1108](https://github.com/PyTorchLightning/pytorch-lightning/pull/1108)) -- Enhanced `load_from_checkpoint` to also forward params to the model ([#1307](https://github.com/PyTorchLightning/pytorch-lightning/pull/1307)) -- Updated references to `self.forward()` to instead use the `__call__` interface. ([#1211](https://github.com/PyTorchLightning/pytorch-lightning/pull/1211)) -- Changed default behaviour of `configure_optimizers` to use no optimizer rather than Adam. ([#1279](https://github.com/PyTorchLightning/pytorch-lightning/pull/1279)) -- Allow to upload models on W&B ([#1339](https://github.com/PyTorchLightning/pytorch-lightning/pull/1339)) -- On DP and DDP2 unsqueeze is automated now ([#1319](https://github.com/PyTorchLightning/pytorch-lightning/pull/1319)) -- Did not always create a DataLoader during reinstantiation, but the same type as before (if subclass of DataLoader) ([#1346](https://github.com/PyTorchLightning/pytorch-lightning/pull/1346)) -- Did not interfere with a default sampler ([#1318](https://github.com/PyTorchLightning/pytorch-lightning/pull/1318)) -- Remove default Adam optimizer ([#1317](https://github.com/PyTorchLightning/pytorch-lightning/pull/1317)) -- Give warnings for unimplemented required lightning methods ([#1317](https://github.com/PyTorchLightning/pytorch-lightning/pull/1317)) -- Made `evaluate` method private >> `Trainer._evaluate(...)`. ([#1260](https://github.com/PyTorchLightning/pytorch-lightning/pull/1260)) -- Simplify the PL examples structure (shallower and more readable) ([#1247](https://github.com/PyTorchLightning/pytorch-lightning/pull/1247)) -- Changed min max gpu memory to be on their own plots ([#1358](https://github.com/PyTorchLightning/pytorch-lightning/pull/1358)) -- Remove `.item` which causes sync issues ([#1254](https://github.com/PyTorchLightning/pytorch-lightning/pull/1254)) -- Changed smoothing in TQDM to decrease variability of time remaining between training / eval ([#1194](https://github.com/PyTorchLightning/pytorch-lightning/pull/1194)) -- Change default logger to dedicated one ([#1064](https://github.com/PyTorchLightning/pytorch-lightning/pull/1064)) +- Changed (renamed and refatored) `TensorRunningMean` -> `TensorRunningAccum`: running accumulations were generalized. ([#1278](https://github.com/Lightning-AI/lightning/pull/1278)) +- Changed `progress_bar_refresh_rate` trainer flag to disable progress bar when set to 0. ([#1108](https://github.com/Lightning-AI/lightning/pull/1108)) +- Enhanced `load_from_checkpoint` to also forward params to the model ([#1307](https://github.com/Lightning-AI/lightning/pull/1307)) +- Updated references to `self.forward()` to instead use the `__call__` interface. ([#1211](https://github.com/Lightning-AI/lightning/pull/1211)) +- Changed default behaviour of `configure_optimizers` to use no optimizer rather than Adam. 
([#1279](https://github.com/Lightning-AI/lightning/pull/1279)) +- Allow to upload models on W&B ([#1339](https://github.com/Lightning-AI/lightning/pull/1339)) +- On DP and DDP2 unsqueeze is automated now ([#1319](https://github.com/Lightning-AI/lightning/pull/1319)) +- Did not always create a DataLoader during reinstantiation, but the same type as before (if subclass of DataLoader) ([#1346](https://github.com/Lightning-AI/lightning/pull/1346)) +- Did not interfere with a default sampler ([#1318](https://github.com/Lightning-AI/lightning/pull/1318)) +- Remove default Adam optimizer ([#1317](https://github.com/Lightning-AI/lightning/pull/1317)) +- Give warnings for unimplemented required lightning methods ([#1317](https://github.com/Lightning-AI/lightning/pull/1317)) +- Made `evaluate` method private >> `Trainer._evaluate(...)`. ([#1260](https://github.com/Lightning-AI/lightning/pull/1260)) +- Simplify the PL examples structure (shallower and more readable) ([#1247](https://github.com/Lightning-AI/lightning/pull/1247)) +- Changed min max gpu memory to be on their own plots ([#1358](https://github.com/Lightning-AI/lightning/pull/1358)) +- Remove `.item` which causes sync issues ([#1254](https://github.com/Lightning-AI/lightning/pull/1254)) +- Changed smoothing in TQDM to decrease variability of time remaining between training / eval ([#1194](https://github.com/Lightning-AI/lightning/pull/1194)) +- Change default logger to dedicated one ([#1064](https://github.com/Lightning-AI/lightning/pull/1064)) ### Deprecated -- Deprecated Trainer argument `print_nan_grads` ([#1097](https://github.com/PyTorchLightning/pytorch-lightning/pull/1097)) -- Deprecated Trainer argument `show_progress_bar` ([#1108](https://github.com/PyTorchLightning/pytorch-lightning/pull/1108)) +- Deprecated Trainer argument `print_nan_grads` ([#1097](https://github.com/Lightning-AI/lightning/pull/1097)) +- Deprecated Trainer argument `show_progress_bar` ([#1108](https://github.com/Lightning-AI/lightning/pull/1108)) ### Removed -- Removed test for no test dataloader in .fit ([#1495](https://github.com/PyTorchLightning/pytorch-lightning/pull/1495)) -- Removed duplicated module `pytorch_lightning.utilities.arg_parse` for loading CLI arguments ([#1167](https://github.com/PyTorchLightning/pytorch-lightning/pull/1167)) -- Removed wandb logger's `finalize` method ([#1193](https://github.com/PyTorchLightning/pytorch-lightning/pull/1193)) -- Dropped `torchvision` dependency in tests and added own MNIST dataset class instead ([#986](https://github.com/PyTorchLightning/pytorch-lightning/pull/986)) - -### Fixed - -- Fixed `model_checkpoint` when saving all models ([#1359](https://github.com/PyTorchLightning/pytorch-lightning/pull/1359)) -- `Trainer.add_argparse_args` classmethod fixed. 
Now it adds a type for the arguments ([#1147](https://github.com/PyTorchLightning/pytorch-lightning/pull/1147)) -- Fixed bug related to type checking of `ReduceLROnPlateau` lr schedulers([#1126](https://github.com/PyTorchLightning/pytorch-lightning/pull/1126)) -- Fixed a bug to ensure lightning checkpoints to be backward compatible ([#1132](https://github.com/PyTorchLightning/pytorch-lightning/pull/1132)) -- Fixed a bug that created an extra dataloader with active `reload_dataloaders_every_epoch` ([#1196](https://github.com/PyTorchLightning/pytorch-lightning/pull/1196)) -- Fixed all warnings and errors in the docs build process ([#1191](https://github.com/PyTorchLightning/pytorch-lightning/pull/1191)) -- Fixed an issue where `val_percent_check=0` would not disable validation ([#1251](https://github.com/PyTorchLightning/pytorch-lightning/pull/1251)) -- Fixed average of incomplete `TensorRunningMean` ([#1309](https://github.com/PyTorchLightning/pytorch-lightning/pull/1309)) -- Fixed `WandbLogger.watch` with `wandb.init()` ([#1311](https://github.com/PyTorchLightning/pytorch-lightning/pull/1311)) -- Fixed an issue with early stopping that would prevent it from monitoring training metrics when validation is disabled / not implemented ([#1235](https://github.com/PyTorchLightning/pytorch-lightning/pull/1235)). -- Fixed a bug that would cause `trainer.test()` to run on the validation set when overloading `validation_epoch_end` and `test_end` ([#1353](https://github.com/PyTorchLightning/pytorch-lightning/pull/1353)) -- Fixed `WandbLogger.watch` - use of the watch method without importing `wandb` ([#1311](https://github.com/PyTorchLightning/pytorch-lightning/pull/1311)) -- Fixed `WandbLogger` to be used with 'ddp' - allow reinits in sub-processes ([#1149](https://github.com/PyTorchLightning/pytorch-lightning/pull/1149), - [#1360](https://github.com/PyTorchLightning/pytorch-lightning/pull/1360)) -- Made `training_epoch_end` behave like `validation_epoch_end` ([#1357](https://github.com/PyTorchLightning/pytorch-lightning/pull/1357)) -- Fixed `fast_dev_run` running validation twice ([#1365](https://github.com/PyTorchLightning/pytorch-lightning/pull/1365)) -- Fixed pickle error from quick patch `__code__` ([#1352](https://github.com/PyTorchLightning/pytorch-lightning/pull/1352)) -- Fixed memory leak on GPU0 ([#1094](https://github.com/PyTorchLightning/pytorch-lightning/pull/1094), - [#1349](https://github.com/PyTorchLightning/pytorch-lightning/pull/1349)) -- Fixed checkpointing interval ([#1272](https://github.com/PyTorchLightning/pytorch-lightning/pull/1272)) -- Fixed validation and training loops run the partial dataset ([#1192](https://github.com/PyTorchLightning/pytorch-lightning/pull/1192)) -- Fixed running `on_validation_end` only on main process in DDP ([#1125](https://github.com/PyTorchLightning/pytorch-lightning/pull/1125)) -- Fixed `load_spawn_weights` only in proc rank 0 ([#1385](https://github.com/PyTorchLightning/pytorch-lightning/pull/1385)) -- Fixes using deprecated `use_amp` attribute ([#1145](https://github.com/PyTorchLightning/pytorch-lightning/pull/1145)) -- Fixed Tensorboard logger error: lightning_logs directory not exists in multi-node DDP on nodes with rank != 0 ([#1377](https://github.com/PyTorchLightning/pytorch-lightning/pull/1377)) -- Fixed `Unimplemented backend XLA` error on TPU ([#1387](https://github.com/PyTorchLightning/pytorch-lightning/pull/1387)) +- Removed test for no test dataloader in .fit ([#1495](https://github.com/Lightning-AI/lightning/pull/1495)) +- Removed 
duplicated module `pytorch_lightning.utilities.arg_parse` for loading CLI arguments ([#1167](https://github.com/Lightning-AI/lightning/pull/1167)) +- Removed wandb logger's `finalize` method ([#1193](https://github.com/Lightning-AI/lightning/pull/1193)) +- Dropped `torchvision` dependency in tests and added own MNIST dataset class instead ([#986](https://github.com/Lightning-AI/lightning/pull/986)) + +### Fixed + +- Fixed `model_checkpoint` when saving all models ([#1359](https://github.com/Lightning-AI/lightning/pull/1359)) +- `Trainer.add_argparse_args` classmethod fixed. Now it adds a type for the arguments ([#1147](https://github.com/Lightning-AI/lightning/pull/1147)) +- Fixed bug related to type checking of `ReduceLROnPlateau` lr schedulers([#1126](https://github.com/Lightning-AI/lightning/pull/1126)) +- Fixed a bug to ensure lightning checkpoints to be backward compatible ([#1132](https://github.com/Lightning-AI/lightning/pull/1132)) +- Fixed a bug that created an extra dataloader with active `reload_dataloaders_every_epoch` ([#1196](https://github.com/Lightning-AI/lightning/pull/1196)) +- Fixed all warnings and errors in the docs build process ([#1191](https://github.com/Lightning-AI/lightning/pull/1191)) +- Fixed an issue where `val_percent_check=0` would not disable validation ([#1251](https://github.com/Lightning-AI/lightning/pull/1251)) +- Fixed average of incomplete `TensorRunningMean` ([#1309](https://github.com/Lightning-AI/lightning/pull/1309)) +- Fixed `WandbLogger.watch` with `wandb.init()` ([#1311](https://github.com/Lightning-AI/lightning/pull/1311)) +- Fixed an issue with early stopping that would prevent it from monitoring training metrics when validation is disabled / not implemented ([#1235](https://github.com/Lightning-AI/lightning/pull/1235)). 
+- Fixed a bug that would cause `trainer.test()` to run on the validation set when overloading `validation_epoch_end` and `test_end` ([#1353](https://github.com/Lightning-AI/lightning/pull/1353)) +- Fixed `WandbLogger.watch` - use of the watch method without importing `wandb` ([#1311](https://github.com/Lightning-AI/lightning/pull/1311)) +- Fixed `WandbLogger` to be used with 'ddp' - allow reinits in sub-processes ([#1149](https://github.com/Lightning-AI/lightning/pull/1149), + [#1360](https://github.com/Lightning-AI/lightning/pull/1360)) +- Made `training_epoch_end` behave like `validation_epoch_end` ([#1357](https://github.com/Lightning-AI/lightning/pull/1357)) +- Fixed `fast_dev_run` running validation twice ([#1365](https://github.com/Lightning-AI/lightning/pull/1365)) +- Fixed pickle error from quick patch `__code__` ([#1352](https://github.com/Lightning-AI/lightning/pull/1352)) +- Fixed memory leak on GPU0 ([#1094](https://github.com/Lightning-AI/lightning/pull/1094), + [#1349](https://github.com/Lightning-AI/lightning/pull/1349)) +- Fixed checkpointing interval ([#1272](https://github.com/Lightning-AI/lightning/pull/1272)) +- Fixed validation and training loops run the partial dataset ([#1192](https://github.com/Lightning-AI/lightning/pull/1192)) +- Fixed running `on_validation_end` only on main process in DDP ([#1125](https://github.com/Lightning-AI/lightning/pull/1125)) +- Fixed `load_spawn_weights` only in proc rank 0 ([#1385](https://github.com/Lightning-AI/lightning/pull/1385)) +- Fixes using deprecated `use_amp` attribute ([#1145](https://github.com/Lightning-AI/lightning/pull/1145)) +- Fixed Tensorboard logger error: lightning_logs directory not exists in multi-node DDP on nodes with rank != 0 ([#1377](https://github.com/Lightning-AI/lightning/pull/1377)) +- Fixed `Unimplemented backend XLA` error on TPU ([#1387](https://github.com/Lightning-AI/lightning/pull/1387)) ## [0.7.1] - 2020-03-07 ### Fixed -- Fixes `print` issues and `data_loader` ([#1080](https://github.com/PyTorchLightning/pytorch-lightning/pull/1080)) +- Fixes `print` issues and `data_loader` ([#1080](https://github.com/Lightning-AI/lightning/pull/1080)) ## [0.7.0] - 2020-03-06 ### Added -- Added automatic sampler setup. Depending on DDP or TPU, lightning configures the sampler correctly (user needs to do nothing) ([#926](https://github.com/PyTorchLightning/pytorch-lightning/pull/926)) -- Added `reload_dataloaders_every_epoch=False` flag for trainer. Some users require reloading data every epoch ([#926](https://github.com/PyTorchLightning/pytorch-lightning/pull/926)) -- Added `progress_bar_refresh_rate=50` flag for trainer. Throttle refresh rate on notebooks ([#926](https://github.com/PyTorchLightning/pytorch-lightning/pull/926)) +- Added automatic sampler setup. Depending on DDP or TPU, lightning configures the sampler correctly (user needs to do nothing) ([#926](https://github.com/Lightning-AI/lightning/pull/926)) +- Added `reload_dataloaders_every_epoch=False` flag for trainer. Some users require reloading data every epoch ([#926](https://github.com/Lightning-AI/lightning/pull/926)) +- Added `progress_bar_refresh_rate=50` flag for trainer. 
Throttle refresh rate on notebooks ([#926](https://github.com/Lightning-AI/lightning/pull/926)) - Updated governance docs -- Added a check to ensure that the metric used for early stopping exists before training commences ([#542](https://github.com/PyTorchLightning/pytorch-lightning/pull/542)) -- Added `optimizer_idx` argument to `backward` hook ([#733](https://github.com/PyTorchLightning/pytorch-lightning/pull/733)) -- Added `entity` argument to `WandbLogger` to be passed to `wandb.init` ([#783](https://github.com/PyTorchLightning/pytorch-lightning/pull/783)) -- Added a tool for profiling training runs ([#782](https://github.com/PyTorchLightning/pytorch-lightning/pull/782)) -- Improved flexibility for naming of TensorBoard logs, can now set `version` to a `str` to just save to that directory, and use `name=''` to prevent experiment-name directory ([#804](https://github.com/PyTorchLightning/pytorch-lightning/pull/804)) -- Added option to specify `step` key when logging metrics ([#808](https://github.com/PyTorchLightning/pytorch-lightning/pull/808)) -- Added `train_dataloader`, `val_dataloader` and `test_dataloader` arguments to `Trainer.fit()`, for alternative data parsing ([#759](https://github.com/PyTorchLightning/pytorch-lightning/pull/759)) -- Added Tensor Processing Unit (TPU) support ([#868](https://github.com/PyTorchLightning/pytorch-lightning/pull/868)) -- Added semantic segmentation example ([#751](https://github.com/PyTorchLightning/pytorch-lightning/pull/751),[#876](https://github.com/PyTorchLightning/pytorch-lightning/pull/876), - [#881](https://github.com/PyTorchLightning/pytorch-lightning/pull/881)) -- Split callbacks in multiple files ([#849](https://github.com/PyTorchLightning/pytorch-lightning/pull/849)) -- Support for user defined callbacks ([#889](https://github.com/PyTorchLightning/pytorch-lightning/pull/889) and [#950](https://github.com/PyTorchLightning/pytorch-lightning/pull/950)) -- Added support for multiple loggers to be passed to `Trainer` as an iterable (e.g. list, tuple, etc.) ([#903](https://github.com/PyTorchLightning/pytorch-lightning/pull/903)) -- Added support for step-based learning rate scheduling ([#941](https://github.com/PyTorchLightning/pytorch-lightning/pull/941)) -- Added support for logging `hparams` as dict ([#1029](https://github.com/PyTorchLightning/pytorch-lightning/pull/1029)) -- Checkpoint and early stopping now work without val. 
step ([#1041](https://github.com/PyTorchLightning/pytorch-lightning/pull/1041)) -- Support graceful training cleanup after Keyboard Interrupt ([#856](https://github.com/PyTorchLightning/pytorch-lightning/pull/856), - [#1019](https://github.com/PyTorchLightning/pytorch-lightning/pull/1019)) -- Added type hints for function arguments ([#912](https://github.com/PyTorchLightning/pytorch-lightning/pull/912), ) -- Added default `argparser` for `Trainer` ([#952](https://github.com/PyTorchLightning/pytorch-lightning/pull/1023), - [#1023](https://github.com/PyTorchLightning/pytorch-lightning/pull/1023)) -- Added TPU gradient clipping ([#963](https://github.com/PyTorchLightning/pytorch-lightning/pull/963)) -- Added max/min number of steps in `Trainer` ([#728](https://github.com/PyTorchLightning/pytorch-lightning/pull/728)) +- Added a check to ensure that the metric used for early stopping exists before training commences ([#542](https://github.com/Lightning-AI/lightning/pull/542)) +- Added `optimizer_idx` argument to `backward` hook ([#733](https://github.com/Lightning-AI/lightning/pull/733)) +- Added `entity` argument to `WandbLogger` to be passed to `wandb.init` ([#783](https://github.com/Lightning-AI/lightning/pull/783)) +- Added a tool for profiling training runs ([#782](https://github.com/Lightning-AI/lightning/pull/782)) +- Improved flexibility for naming of TensorBoard logs, can now set `version` to a `str` to just save to that directory, and use `name=''` to prevent experiment-name directory ([#804](https://github.com/Lightning-AI/lightning/pull/804)) +- Added option to specify `step` key when logging metrics ([#808](https://github.com/Lightning-AI/lightning/pull/808)) +- Added `train_dataloader`, `val_dataloader` and `test_dataloader` arguments to `Trainer.fit()`, for alternative data parsing ([#759](https://github.com/Lightning-AI/lightning/pull/759)) +- Added Tensor Processing Unit (TPU) support ([#868](https://github.com/Lightning-AI/lightning/pull/868)) +- Added semantic segmentation example ([#751](https://github.com/Lightning-AI/lightning/pull/751),[#876](https://github.com/Lightning-AI/lightning/pull/876), + [#881](https://github.com/Lightning-AI/lightning/pull/881)) +- Split callbacks in multiple files ([#849](https://github.com/Lightning-AI/lightning/pull/849)) +- Support for user defined callbacks ([#889](https://github.com/Lightning-AI/lightning/pull/889) and [#950](https://github.com/Lightning-AI/lightning/pull/950)) +- Added support for multiple loggers to be passed to `Trainer` as an iterable (e.g. list, tuple, etc.) ([#903](https://github.com/Lightning-AI/lightning/pull/903)) +- Added support for step-based learning rate scheduling ([#941](https://github.com/Lightning-AI/lightning/pull/941)) +- Added support for logging `hparams` as dict ([#1029](https://github.com/Lightning-AI/lightning/pull/1029)) +- Checkpoint and early stopping now work without val. 
step ([#1041](https://github.com/Lightning-AI/lightning/pull/1041)) +- Support graceful training cleanup after Keyboard Interrupt ([#856](https://github.com/Lightning-AI/lightning/pull/856), + [#1019](https://github.com/Lightning-AI/lightning/pull/1019)) +- Added type hints for function arguments ([#912](https://github.com/Lightning-AI/lightning/pull/912), ) +- Added default `argparser` for `Trainer` ([#952](https://github.com/Lightning-AI/lightning/pull/1023), + [#1023](https://github.com/Lightning-AI/lightning/pull/1023)) +- Added TPU gradient clipping ([#963](https://github.com/Lightning-AI/lightning/pull/963)) +- Added max/min number of steps in `Trainer` ([#728](https://github.com/Lightning-AI/lightning/pull/728)) ### Changed -- Improved `NeptuneLogger` by adding `close_after_fit` argument to allow logging after training([#908](https://github.com/PyTorchLightning/pytorch-lightning/pull/1084)) -- Changed default TQDM to use `tqdm.auto` for prettier outputs in IPython notebooks ([#752](https://github.com/PyTorchLightning/pytorch-lightning/pull/752)) -- Changed `pytorch_lightning.logging` to `pytorch_lightning.loggers` ([#767](https://github.com/PyTorchLightning/pytorch-lightning/pull/767)) -- Moved the default `tqdm_dict` definition from Trainer to `LightningModule`, so it can be overridden by the user ([#749](https://github.com/PyTorchLightning/pytorch-lightning/pull/749)) -- Moved functionality of `LightningModule.load_from_metrics` into `LightningModule.load_from_checkpoint` ([#995](https://github.com/PyTorchLightning/pytorch-lightning/pull/995)) -- Changed Checkpoint path parameter from `filepath` to `dirpath` ([#1016](https://github.com/PyTorchLightning/pytorch-lightning/pull/1016)) -- Freezed models `hparams` as `Namespace` property ([#1029](https://github.com/PyTorchLightning/pytorch-lightning/pull/1029)) -- Dropped `logging` config in package init ([#1015](https://github.com/PyTorchLightning/pytorch-lightning/pull/1015)) -- Renames model steps ([#1051](https://github.com/PyTorchLightning/pytorch-lightning/pull/1051)) +- Improved `NeptuneLogger` by adding `close_after_fit` argument to allow logging after training([#908](https://github.com/Lightning-AI/lightning/pull/1084)) +- Changed default TQDM to use `tqdm.auto` for prettier outputs in IPython notebooks ([#752](https://github.com/Lightning-AI/lightning/pull/752)) +- Changed `pytorch_lightning.logging` to `pytorch_lightning.loggers` ([#767](https://github.com/Lightning-AI/lightning/pull/767)) +- Moved the default `tqdm_dict` definition from Trainer to `LightningModule`, so it can be overridden by the user ([#749](https://github.com/Lightning-AI/lightning/pull/749)) +- Moved functionality of `LightningModule.load_from_metrics` into `LightningModule.load_from_checkpoint` ([#995](https://github.com/Lightning-AI/lightning/pull/995)) +- Changed Checkpoint path parameter from `filepath` to `dirpath` ([#1016](https://github.com/Lightning-AI/lightning/pull/1016)) +- Freezed models `hparams` as `Namespace` property ([#1029](https://github.com/Lightning-AI/lightning/pull/1029)) +- Dropped `logging` config in package init ([#1015](https://github.com/Lightning-AI/lightning/pull/1015)) +- Renames model steps ([#1051](https://github.com/Lightning-AI/lightning/pull/1051)) - `training_end` >> `training_epoch_end` - `validation_end` >> `validation_epoch_end` - `test_end` >> `test_epoch_end` -- Refactor dataloading, supports infinite dataloader ([#955](https://github.com/PyTorchLightning/pytorch-lightning/pull/955)) -- Create single file in 
`TensorBoardLogger` ([#777](https://github.com/PyTorchLightning/pytorch-lightning/pull/777)) +- Refactor dataloading, supports infinite dataloader ([#955](https://github.com/Lightning-AI/lightning/pull/955)) +- Create single file in `TensorBoardLogger` ([#777](https://github.com/Lightning-AI/lightning/pull/777)) ### Deprecated -- Deprecated `pytorch_lightning.logging` ([#767](https://github.com/PyTorchLightning/pytorch-lightning/pull/767)) -- Deprecated `LightningModule.load_from_metrics` in favour of `LightningModule.load_from_checkpoint` ([#995](https://github.com/PyTorchLightning/pytorch-lightning/pull/995), - [#1079](https://github.com/PyTorchLightning/pytorch-lightning/pull/1079)) -- Deprecated `@data_loader` decorator ([#926](https://github.com/PyTorchLightning/pytorch-lightning/pull/926)) -- Deprecated model steps `training_end`, `validation_end` and `test_end` ([#1051](https://github.com/PyTorchLightning/pytorch-lightning/pull/1051), - [#1056](https://github.com/PyTorchLightning/pytorch-lightning/pull/1056)) +- Deprecated `pytorch_lightning.logging` ([#767](https://github.com/Lightning-AI/lightning/pull/767)) +- Deprecated `LightningModule.load_from_metrics` in favour of `LightningModule.load_from_checkpoint` ([#995](https://github.com/Lightning-AI/lightning/pull/995), + [#1079](https://github.com/Lightning-AI/lightning/pull/1079)) +- Deprecated `@data_loader` decorator ([#926](https://github.com/Lightning-AI/lightning/pull/926)) +- Deprecated model steps `training_end`, `validation_end` and `test_end` ([#1051](https://github.com/Lightning-AI/lightning/pull/1051), + [#1056](https://github.com/Lightning-AI/lightning/pull/1056)) ### Removed -- Removed dependency on `pandas` ([#736](https://github.com/PyTorchLightning/pytorch-lightning/pull/736)) -- Removed dependency on `torchvision` ([#797](https://github.com/PyTorchLightning/pytorch-lightning/pull/797)) -- Removed dependency on `scikit-learn` ([#801](https://github.com/PyTorchLightning/pytorch-lightning/pull/801)) +- Removed dependency on `pandas` ([#736](https://github.com/Lightning-AI/lightning/pull/736)) +- Removed dependency on `torchvision` ([#797](https://github.com/Lightning-AI/lightning/pull/797)) +- Removed dependency on `scikit-learn` ([#801](https://github.com/Lightning-AI/lightning/pull/801)) ### Fixed -- Fixed a bug where early stopping `on_end_epoch` would be called inconsistently when `check_val_every_n_epoch == 0` ([#743](https://github.com/PyTorchLightning/pytorch-lightning/pull/743)) -- Fixed a bug where the model checkpointer didn't write to the same directory as the logger ([#771](https://github.com/PyTorchLightning/pytorch-lightning/pull/771)) -- Fixed a bug where the `TensorBoardLogger` class would create an additional empty log file during fitting ([#777](https://github.com/PyTorchLightning/pytorch-lightning/pull/777)) -- Fixed a bug where `global_step` was advanced incorrectly when using `accumulate_grad_batches > 1` ([#832](https://github.com/PyTorchLightning/pytorch-lightning/pull/832)) -- Fixed a bug when calling `self.logger.experiment` with multiple loggers ([#1009](https://github.com/PyTorchLightning/pytorch-lightning/pull/1009)) -- Fixed a bug when calling `logger.append_tags` on a `NeptuneLogger` with a single tag ([#1009](https://github.com/PyTorchLightning/pytorch-lightning/pull/1009)) -- Fixed sending back data from `.spawn` by saving and loading the trained model in/out of the process ([#1017](https://github.com/PyTorchLightning/pytorch-lightning/pull/1017) -- Fixed port collision on DDP 
([#1010](https://github.com/PyTorchLightning/pytorch-lightning/pull/1010)) -- Fixed/tested pass overrides ([#918](https://github.com/PyTorchLightning/pytorch-lightning/pull/918)) -- Fixed comet logger to log after train ([#892](https://github.com/PyTorchLightning/pytorch-lightning/pull/892)) -- Remove deprecated args to learning rate step function ([#890](https://github.com/PyTorchLightning/pytorch-lightning/pull/890)) +- Fixed a bug where early stopping `on_end_epoch` would be called inconsistently when `check_val_every_n_epoch == 0` ([#743](https://github.com/Lightning-AI/lightning/pull/743)) +- Fixed a bug where the model checkpointer didn't write to the same directory as the logger ([#771](https://github.com/Lightning-AI/lightning/pull/771)) +- Fixed a bug where the `TensorBoardLogger` class would create an additional empty log file during fitting ([#777](https://github.com/Lightning-AI/lightning/pull/777)) +- Fixed a bug where `global_step` was advanced incorrectly when using `accumulate_grad_batches > 1` ([#832](https://github.com/Lightning-AI/lightning/pull/832)) +- Fixed a bug when calling `self.logger.experiment` with multiple loggers ([#1009](https://github.com/Lightning-AI/lightning/pull/1009)) +- Fixed a bug when calling `logger.append_tags` on a `NeptuneLogger` with a single tag ([#1009](https://github.com/Lightning-AI/lightning/pull/1009)) +- Fixed sending back data from `.spawn` by saving and loading the trained model in/out of the process ([#1017](https://github.com/Lightning-AI/lightning/pull/1017) +- Fixed port collision on DDP ([#1010](https://github.com/Lightning-AI/lightning/pull/1010)) +- Fixed/tested pass overrides ([#918](https://github.com/Lightning-AI/lightning/pull/918)) +- Fixed comet logger to log after train ([#892](https://github.com/Lightning-AI/lightning/pull/892)) +- Remove deprecated args to learning rate step function ([#890](https://github.com/Lightning-AI/lightning/pull/890)) ## [0.6.0] - 2020-01-21 ### Added -- Added support for resuming from a specific checkpoint via `resume_from_checkpoint` argument ([#516](https://github.com/PyTorchLightning/pytorch-lightning/pull/516)) -- Added support for `ReduceLROnPlateau` scheduler ([#320](https://github.com/PyTorchLightning/pytorch-lightning/pull/320)) -- Added support for Apex mode `O2` in conjunction with Data Parallel ([#493](https://github.com/PyTorchLightning/pytorch-lightning/pull/493)) -- Added option (`save_top_k`) to save the top k models in the `ModelCheckpoint` class ([#128](https://github.com/PyTorchLightning/pytorch-lightning/pull/128)) -- Added `on_train_start` and `on_train_end` hooks to `ModelHooks` ([#598](https://github.com/PyTorchLightning/pytorch-lightning/pull/598)) -- Added `TensorBoardLogger` ([#607](https://github.com/PyTorchLightning/pytorch-lightning/pull/607)) -- Added support for weight summary of model with multiple inputs ([#543](https://github.com/PyTorchLightning/pytorch-lightning/pull/543)) -- Added `map_location` argument to `load_from_metrics` and `load_from_checkpoint` ([#625](https://github.com/PyTorchLightning/pytorch-lightning/pull/625)) -- Added option to disable validation by setting `val_percent_check=0` ([#649](https://github.com/PyTorchLightning/pytorch-lightning/pull/649)) -- Added `NeptuneLogger` class ([#648](https://github.com/PyTorchLightning/pytorch-lightning/pull/648)) -- Added `WandbLogger` class ([#627](https://github.com/PyTorchLightning/pytorch-lightning/pull/627)) +- Added support for resuming from a specific checkpoint via `resume_from_checkpoint` 
argument ([#516](https://github.com/Lightning-AI/lightning/pull/516)) +- Added support for `ReduceLROnPlateau` scheduler ([#320](https://github.com/Lightning-AI/lightning/pull/320)) +- Added support for Apex mode `O2` in conjunction with Data Parallel ([#493](https://github.com/Lightning-AI/lightning/pull/493)) +- Added option (`save_top_k`) to save the top k models in the `ModelCheckpoint` class ([#128](https://github.com/Lightning-AI/lightning/pull/128)) +- Added `on_train_start` and `on_train_end` hooks to `ModelHooks` ([#598](https://github.com/Lightning-AI/lightning/pull/598)) +- Added `TensorBoardLogger` ([#607](https://github.com/Lightning-AI/lightning/pull/607)) +- Added support for weight summary of model with multiple inputs ([#543](https://github.com/Lightning-AI/lightning/pull/543)) +- Added `map_location` argument to `load_from_metrics` and `load_from_checkpoint` ([#625](https://github.com/Lightning-AI/lightning/pull/625)) +- Added option to disable validation by setting `val_percent_check=0` ([#649](https://github.com/Lightning-AI/lightning/pull/649)) +- Added `NeptuneLogger` class ([#648](https://github.com/Lightning-AI/lightning/pull/648)) +- Added `WandbLogger` class ([#627](https://github.com/Lightning-AI/lightning/pull/627)) ### Changed -- Changed the default progress bar to print to stdout instead of stderr ([#531](https://github.com/PyTorchLightning/pytorch-lightning/pull/531)) -- Renamed `step_idx` to `step`, `epoch_idx` to `epoch`, `max_num_epochs` to `max_epochs` and `min_num_epochs` to `min_epochs` ([#589](https://github.com/PyTorchLightning/pytorch-lightning/pull/589)) -- Renamed `total_batch_nb` to `total_batches`, `nb_val_batches` to `num_val_batches`, `nb_training_batches` to `num_training_batches`, `max_nb_epochs` to `max_epochs`, `min_nb_epochs` to `min_epochs`, `nb_test_batches` to `num_test_batches`, and `nb_val_batches` to `num_val_batches` ([#567](https://github.com/PyTorchLightning/pytorch-lightning/pull/567)) -- Changed gradient logging to use parameter names instead of indexes ([#660](https://github.com/PyTorchLightning/pytorch-lightning/pull/660)) -- Changed the default logger to `TensorBoardLogger` ([#609](https://github.com/PyTorchLightning/pytorch-lightning/pull/609)) -- Changed the directory for tensorboard logging to be the same as model checkpointing ([#706](https://github.com/PyTorchLightning/pytorch-lightning/pull/706)) +- Changed the default progress bar to print to stdout instead of stderr ([#531](https://github.com/Lightning-AI/lightning/pull/531)) +- Renamed `step_idx` to `step`, `epoch_idx` to `epoch`, `max_num_epochs` to `max_epochs` and `min_num_epochs` to `min_epochs` ([#589](https://github.com/Lightning-AI/lightning/pull/589)) +- Renamed `total_batch_nb` to `total_batches`, `nb_val_batches` to `num_val_batches`, `nb_training_batches` to `num_training_batches`, `max_nb_epochs` to `max_epochs`, `min_nb_epochs` to `min_epochs`, `nb_test_batches` to `num_test_batches`, and `nb_val_batches` to `num_val_batches` ([#567](https://github.com/Lightning-AI/lightning/pull/567)) +- Changed gradient logging to use parameter names instead of indexes ([#660](https://github.com/Lightning-AI/lightning/pull/660)) +- Changed the default logger to `TensorBoardLogger` ([#609](https://github.com/Lightning-AI/lightning/pull/609)) +- Changed the directory for tensorboard logging to be the same as model checkpointing ([#706](https://github.com/Lightning-AI/lightning/pull/706)) ### Deprecated -- Deprecated `max_nb_epochs` and `min_nb_epochs` 
([#567](https://github.com/PyTorchLightning/pytorch-lightning/pull/567)) -- Deprecated the `on_sanity_check_start` hook in `ModelHooks` ([#598](https://github.com/PyTorchLightning/pytorch-lightning/pull/598)) +- Deprecated `max_nb_epochs` and `min_nb_epochs` ([#567](https://github.com/Lightning-AI/lightning/pull/567)) +- Deprecated the `on_sanity_check_start` hook in `ModelHooks` ([#598](https://github.com/Lightning-AI/lightning/pull/598)) ### Removed -- Removed the `save_best_only` argument from `ModelCheckpoint`, use `save_top_k=1` instead ([#128](https://github.com/PyTorchLightning/pytorch-lightning/pull/128)) - -### Fixed - -- Fixed a bug which occurred when using Adagrad with cuda ([#554](https://github.com/PyTorchLightning/pytorch-lightning/pull/554)) -- Fixed a bug where training would be on the GPU despite setting `gpus=0` or `gpus=[]` ([#561](https://github.com/PyTorchLightning/pytorch-lightning/pull/561)) -- Fixed an error with `print_nan_gradients` when some parameters do not require gradient ([#579](https://github.com/PyTorchLightning/pytorch-lightning/pull/579)) -- Fixed a bug where the progress bar would show an incorrect number of total steps during the validation sanity check when using multiple validation data loaders ([#597](https://github.com/PyTorchLightning/pytorch-lightning/pull/597)) -- Fixed support for PyTorch 1.1.0 ([#552](https://github.com/PyTorchLightning/pytorch-lightning/pull/552)) -- Fixed an issue with early stopping when using a `val_check_interval < 1.0` in `Trainer` ([#492](https://github.com/PyTorchLightning/pytorch-lightning/pull/492)) -- Fixed bugs relating to the `CometLogger` object that would cause it to not work properly ([#481](https://github.com/PyTorchLightning/pytorch-lightning/pull/481)) -- Fixed a bug that would occur when returning `-1` from `on_batch_start` following an early exit or when the batch was `None` ([#509](https://github.com/PyTorchLightning/pytorch-lightning/pull/509)) -- Fixed a potential race condition with several processes trying to create checkpoint directories ([#530](https://github.com/PyTorchLightning/pytorch-lightning/pull/530)) -- Fixed a bug where batch 'segments' would remain on the GPU when using `truncated_bptt > 1` ([#532](https://github.com/PyTorchLightning/pytorch-lightning/pull/532)) -- Fixed a bug when using `IterableDataset` ([#547](https://github.com/PyTorchLightning/pytorch-lightning/pull/547)) -- Fixed a bug where `.item` was called on non-tensor objects ([#602](https://github.com/PyTorchLightning/pytorch-lightning/pull/602)) -- Fixed a bug where `Trainer.train` would crash on an uninitialized variable if the trainer was run after resuming from a checkpoint that was already at `max_epochs` ([#608](https://github.com/PyTorchLightning/pytorch-lightning/pull/608)) -- Fixed a bug where early stopping would begin two epochs early ([#617](https://github.com/PyTorchLightning/pytorch-lightning/pull/617)) -- Fixed a bug where `num_training_batches` and `num_test_batches` would sometimes be rounded down to zero ([#649](https://github.com/PyTorchLightning/pytorch-lightning/pull/649)) -- Fixed a bug where an additional batch would be processed when manually setting `num_training_batches` ([#653](https://github.com/PyTorchLightning/pytorch-lightning/pull/653)) -- Fixed a bug when batches did not have a `.copy` method ([#701](https://github.com/PyTorchLightning/pytorch-lightning/pull/701)) -- Fixed a bug when using `log_gpu_memory=True` in Python 3.6 
([#715](https://github.com/PyTorchLightning/pytorch-lightning/pull/715)) -- Fixed a bug where checkpoint writing could exit before completion, giving incomplete checkpoints ([#689](https://github.com/PyTorchLightning/pytorch-lightning/pull/689)) -- Fixed a bug where `on_train_end` was not called when ealy stopping ([#723](https://github.com/PyTorchLightning/pytorch-lightning/pull/723)) +- Removed the `save_best_only` argument from `ModelCheckpoint`, use `save_top_k=1` instead ([#128](https://github.com/Lightning-AI/lightning/pull/128)) + +### Fixed + +- Fixed a bug which occurred when using Adagrad with cuda ([#554](https://github.com/Lightning-AI/lightning/pull/554)) +- Fixed a bug where training would be on the GPU despite setting `gpus=0` or `gpus=[]` ([#561](https://github.com/Lightning-AI/lightning/pull/561)) +- Fixed an error with `print_nan_gradients` when some parameters do not require gradient ([#579](https://github.com/Lightning-AI/lightning/pull/579)) +- Fixed a bug where the progress bar would show an incorrect number of total steps during the validation sanity check when using multiple validation data loaders ([#597](https://github.com/Lightning-AI/lightning/pull/597)) +- Fixed support for PyTorch 1.1.0 ([#552](https://github.com/Lightning-AI/lightning/pull/552)) +- Fixed an issue with early stopping when using a `val_check_interval < 1.0` in `Trainer` ([#492](https://github.com/Lightning-AI/lightning/pull/492)) +- Fixed bugs relating to the `CometLogger` object that would cause it to not work properly ([#481](https://github.com/Lightning-AI/lightning/pull/481)) +- Fixed a bug that would occur when returning `-1` from `on_batch_start` following an early exit or when the batch was `None` ([#509](https://github.com/Lightning-AI/lightning/pull/509)) +- Fixed a potential race condition with several processes trying to create checkpoint directories ([#530](https://github.com/Lightning-AI/lightning/pull/530)) +- Fixed a bug where batch 'segments' would remain on the GPU when using `truncated_bptt > 1` ([#532](https://github.com/Lightning-AI/lightning/pull/532)) +- Fixed a bug when using `IterableDataset` ([#547](https://github.com/Lightning-AI/lightning/pull/547)) +- Fixed a bug where `.item` was called on non-tensor objects ([#602](https://github.com/Lightning-AI/lightning/pull/602)) +- Fixed a bug where `Trainer.train` would crash on an uninitialized variable if the trainer was run after resuming from a checkpoint that was already at `max_epochs` ([#608](https://github.com/Lightning-AI/lightning/pull/608)) +- Fixed a bug where early stopping would begin two epochs early ([#617](https://github.com/Lightning-AI/lightning/pull/617)) +- Fixed a bug where `num_training_batches` and `num_test_batches` would sometimes be rounded down to zero ([#649](https://github.com/Lightning-AI/lightning/pull/649)) +- Fixed a bug where an additional batch would be processed when manually setting `num_training_batches` ([#653](https://github.com/Lightning-AI/lightning/pull/653)) +- Fixed a bug when batches did not have a `.copy` method ([#701](https://github.com/Lightning-AI/lightning/pull/701)) +- Fixed a bug when using `log_gpu_memory=True` in Python 3.6 ([#715](https://github.com/Lightning-AI/lightning/pull/715)) +- Fixed a bug where checkpoint writing could exit before completion, giving incomplete checkpoints ([#689](https://github.com/Lightning-AI/lightning/pull/689)) +- Fixed a bug where `on_train_end` was not called when ealy stopping ([#723](https://github.com/Lightning-AI/lightning/pull/723)) 
## [0.5.3] - 2019-11-06 diff --git a/src/pytorch_lightning/README.md b/src/pytorch_lightning/README.md index d05e37dbc6fb2..eb1a42730b5f0 100644 --- a/src/pytorch_lightning/README.md +++ b/src/pytorch_lightning/README.md @@ -25,14 +25,14 @@ ______________________________________________________________________ [![PyPI Status](https://pepy.tech/badge/pytorch-lightning)](https://pepy.tech/project/pytorch-lightning) [![Conda](https://img.shields.io/conda/v/conda-forge/pytorch-lightning?label=conda&color=success)](https://anaconda.org/conda-forge/pytorch-lightning) [![DockerHub](https://img.shields.io/docker/pulls/pytorchlightning/pytorch_lightning.svg)](https://hub.docker.com/r/pytorchlightning/pytorch_lightning) -[![codecov](https://codecov.io/gh/PyTorchLightning/pytorch-lightning/branch/master/graph/badge.svg)](https://codecov.io/gh/PyTorchLightning/pytorch-lightning) +[![codecov](https://codecov.io/gh/Lightning-AI/lightning/branch/master/graph/badge.svg)](https://codecov.io/gh/Lightning-AI/lightning) [![ReadTheDocs](https://readthedocs.org/projects/pytorch-lightning/badge/?version=stable)](https://pytorch-lightning.readthedocs.io/en/stable/) [![Slack](https://img.shields.io/badge/slack-chat-green.svg?logo=slack)](https://www.pytorchlightning.ai/community) -[![license](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/PytorchLightning/pytorch-lightning/blob/master/LICENSE) +[![license](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/Lightning-AI/lightning/blob/master/LICENSE) @@ -65,7 +65,7 @@ Lightning forces the following structure to your code which makes it reusable an Once you do this, you can train on multiple-GPUs, TPUs, CPUs, IPUs, HPUs and even in 16-bit precision without changing your code! -Get started with our [2 step guide](https://pytorch-lightning.readthedocs.io/en/latest/starter/new-project.html) +[Get started in just 15 minutes](https://pytorch-lightning.readthedocs.io/en/latest/starter/introduction.html) ______________________________________________________________________ @@ -78,17 +78,17 @@ Lightning is rigorously tested across multiple CPUs, GPUs, TPUs, IPUs, and HPUs
-| System / PyTorch ver. | 1.9 | 1.10 | 1.12 (latest) | -| :------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| Linux py3.7 \[GPUs\*\*\] | - | - | - | -| Linux py3.7 \[TPUs\*\*\*\] | [![CircleCI](https://circleci.com/gh/PyTorchLightning/pytorch-lightning/tree/master.svg?style=svg)](https://circleci.com/gh/PyTorchLightning/pytorch-lightning/tree/master) | - | - | -| Linux py3.8 \[IPUs\] | [![Build Status]()](https://dev.azure.com/PytorchLightning/pytorch-lightning/_build/latest?definitionId=6&branchName=master) | - | - | -| Linux py3.8 \[HPUs\] | - | [![Build Status]()](https://dev.azure.com/PytorchLightning/pytorch-lightning/_build/latest?definitionId=6&branchName=master) | - | -| Linux py3.8 (with Conda) | [![Test](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-conda.yml) | [![Test](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-conda.yml) | - | -| Linux py3.9 (with Conda) | - | - | [![Test](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-conda.yml) | -| Linux py3.{7,9} | - | - | [![Test](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-full.yml) | -| OSX py3.{7,9} | - | - | [![Test](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-full.yml) | -| Windows py3.{7,9} | - | - | [![Test](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/PyTorchLightning/pytorch-lightning/actions/workflows/ci_test-full.yml) | +| System / PyTorch ver. 
| 1.9 | 1.10 | 1.12 (latest) | +| :------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Linux py3.7 \[GPUs\*\*\] | - | - | - | +| Linux py3.7 \[TPUs\*\*\*\] | [![CircleCI](https://circleci.com/gh/Lightning-AI/lightning/tree/master.svg?style=svg)](https://circleci.com/gh/Lightning-AI/lightning/tree/master) | - | - | +| Linux py3.8 \[IPUs\] | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=6&branchName=master) | - | - | +| Linux py3.8 \[HPUs\] | - | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=6&branchName=master) | - | +| Linux py3.8 (with Conda) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | - | +| Linux py3.9 (with Conda) | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | +| Linux py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) | +| OSX py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) | +| Windows py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) | - _\*\* tests run on two NVIDIA P100_ - _\*\*\* tests run on Google GKE TPUv2/3. 
TPU py3.7 means we support Colab and Kaggle env._ @@ -126,20 +126,21 @@ pip install pytorch-lightning['extra'] conda install pytorch-lightning -c conda-forge ``` -#### Install stable 1.6.x +#### Install stable version -the actual status of 1.6 \[stable\] is following: +The actual status of stable is the following: -![CI basic testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20basic%20testing/badge.svg?branch=release%2F1.5.x&event=push) -![CI complete testing](https://github.com/PyTorchLightning/pytorch-lightning/workflows/CI%20complete%20testing/badge.svg?branch=release%2F1.5.x&event=push) -![PyTorch & Conda](https://github.com/PyTorchLightning/pytorch-lightning/workflows/PyTorch%20&%20Conda/badge.svg?branch=release%2F1.5.x&event=push) -![TPU tests](https://github.com/PyTorchLightning/pytorch-lightning/workflows/TPU%20tests/badge.svg?branch=release%2F1.5.x&event=push) -![Docs check](https://github.com/PyTorchLightning/pytorch-lightning/workflows/Docs%20check/badge.svg?branch=release%2F1.5.x&event=push) +[![Test PyTorch full](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-full.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-full.yml) +[![Test PyTorch with Conda](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-conda.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-conda.yml) +[![GPU]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=24&branchName=release%2Fpytorch) +[![TPU](https://dl.circleci.com/status-badge/img/gh/Lightning-AI/lightning/tree/release%2Fpytorch.svg?style=svg)](https://dl.circleci.com/status-badge/redirect/gh/Lightning-AI/lightning/tree/release%2Fpytorch) +[![IPU]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=25&branchName=release%2Fpytorch) +[![HPU]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=26&branchName=release%2Fpytorch) Install future release from the source ```bash -pip install git+https://github.com/PytorchLightning/pytorch-lightning.git@release/1.5.x --upgrade +pip install https://github.com/Lightning-AI/lightning/archive/refs/heads/release/pytorch.zip -U ``` #### Install bleeding-edge - future 1.7 @@ -147,7 +148,7 @@ pip install git+https://github.com/PytorchLightning/pytorch-lightning.git@releas Install nightly from the source (no guarantees) ```bash -pip install https://github.com/PyTorchLightning/pytorch-lightning/archive/master.zip +pip install https://github.com/Lightning-AI/lightning/archive/refs/heads/master.zip -U ``` or from testing PyPI @@ -346,7 +347,7 @@ ______________________________________________________________________ - Make fewer mistakes because lightning handles the tricky engineering - Keeps all the flexibility (LightningModules are still PyTorch modules), but removes a ton of boilerplate - Lightning has dozens of integrations with popular machine learning tools. -- [Tested rigorously with every new PR](https://github.com/PyTorchLightning/pytorch-lightning/tree/master/tests). We test every combination of PyTorch and Python supported versions, every OS, multi GPUs and even TPUs. +- [Tested rigorously with every new PR](https://github.com/Lightning-AI/lightning/tree/master/tests). We test every combination of PyTorch and Python supported versions, every OS, multi GPUs and even TPUs. 
- Minimal running speed overhead (about 300 ms per epoch compared with pure PyTorch). ______________________________________________________________________ diff --git a/tests/README.md b/tests/README.md index 3b40c32d755dd..e56131fb459b0 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1,6 +1,6 @@ # PyTorch-Lightning Tests -Most of the tests in PyTorch Lightning train a [BoringModel](https://github.com/PyTorchLightning/pytorch-lightning/blob/master/tests/helpers/boring_model.py) under various trainer conditions (ddp, ddp2+amp, etc...). Want to add a new test case and not sure how? [Talk to us!](https://www.pytorchlightning.ai/community) +Most of the tests in PyTorch Lightning train a [BoringModel](https://github.com/Lightning-AI/lightning/blob/master/src/pytorch_lightning/demos/boring_classes.py) under various trainer conditions (ddp, ddp2+amp, etc...). Want to add a new test case and not sure how? [Talk to us!](https://www.pytorchlightning.ai/community) ## Running tests @@ -26,7 +26,7 @@ Additionally, for testing backward compatibility with older versions of PyTorch bash .actions/pull_legacy_checkpoints.sh ``` -Note: These checkpoints are generated to set baselines for maintaining backward compatibility with legacy versions of PyTorch Lightning. Details of checkpoints for back-compatibility can be found [here](https://github.com/PyTorchLightning/pytorch-lightning/blob/master/legacy/README.md). +Note: These checkpoints are generated to set baselines for maintaining backward compatibility with legacy versions of PyTorch Lightning. Details of checkpoints for back-compatibility can be found [here](https://github.com/Lightning-AI/lightning/blob/master/tests/legacy/README.md). You can run the full test suite in your terminal via this make script: From f835e3831464a8083675573c4a67ca48a035a29d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 28 Jul 2022 22:50:56 +0200 Subject: [PATCH 040/230] Update version for rc1 release (#13910) --- src/pytorch_lightning/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/__version__.py b/src/pytorch_lightning/__version__.py index 6213a2ac0b758..748f8f4eaea0c 100644 --- a/src/pytorch_lightning/__version__.py +++ b/src/pytorch_lightning/__version__.py @@ -1 +1 @@ -version = "1.7.0rc0" +version = "1.7.0rc1" From caaf35689c585fba1bb33b70243e7ddd2cf5d13c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 29 Jul 2022 01:33:22 +0200 Subject: [PATCH 041/230] Improvements to standalone scripts (#13840) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- dockers/tpu-tests/tpu_test_cases.jsonnet | 3 ++- tests/tests_pytorch/run_standalone_tasks.sh | 17 ++++------------- tests/tests_pytorch/run_standalone_tests.sh | 15 ++------------- 3 files changed, 8 insertions(+), 27 deletions(-) diff --git a/dockers/tpu-tests/tpu_test_cases.jsonnet b/dockers/tpu-tests/tpu_test_cases.jsonnet index 18a0c894c31a2..48536817920c9 100644 --- a/dockers/tpu-tests/tpu_test_cases.jsonnet +++ b/dockers/tpu-tests/tpu_test_cases.jsonnet @@ -31,6 +31,7 @@ local tputests = base.BaseTest { git checkout {SHA} export PACKAGE_NAME=pytorch export FREEZE_REQUIREMENTS=1 + export PL_STANDALONE_TESTS_BATCH_SIZE=1 pip install -e .[test] echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}" @@ -38,7 +39,7 @@ local tputests = base.BaseTest { cd tests/tests_pytorch coverage 
run --source=pytorch_lightning -m pytest -vv --durations=0 ./ echo "\n||| Running standalone tests |||\n" - bash run_standalone_tests.sh -b 1 + bash run_standalone_tests.sh test_exit_code=$? echo "\n||| END PYTEST LOGS |||\n" coverage xml diff --git a/tests/tests_pytorch/run_standalone_tasks.sh b/tests/tests_pytorch/run_standalone_tasks.sh index 4d433399e5736..960bd867ceaa4 100644 --- a/tests/tests_pytorch/run_standalone_tasks.sh +++ b/tests/tests_pytorch/run_standalone_tasks.sh @@ -15,34 +15,25 @@ set -e # THIS FILE ASSUMES IT IS RUN INSIDE THE tests/tests_pytorch DIRECTORY -report='' - if nvcc --version; then + echo "Running profilers/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx" nvprof --profile-from-start off -o trace_name.prof -- python -m coverage run --source pytorch_lightning --append -m pytest --no-header profilers/test_profiler.py::test_pytorch_profiler_nested_emit_nvtx fi # needs to run outside of `pytest` +echo "Running utilities/test_warnings.py" python utilities/test_warnings.py -if [ $? -eq 0 ]; then - report+="Ran\tutilities/test_warnings.py\n" -fi # test deadlock is properly handled with TorchElastic. +echo "Running plugins/environments/torch_elastic_deadlock.py" LOGS=$(PL_RUN_STANDALONE_TESTS=1 PL_RECONCILE_PROCESS=1 python -m torch.distributed.run --nproc_per_node=2 --max_restarts 0 -m coverage run --source pytorch_lightning -a plugins/environments/torch_elastic_deadlock.py | grep "SUCCEEDED") if [ -z "$LOGS" ]; then exit 1 fi -report+="Ran\tplugins/environments/torch_elastic_deadlock.py\n" # test that a user can manually launch individual processes +echo "Running manual ddp launch test" export PYTHONPATH="${PYTHONPATH}:$(pwd)" args="--trainer.accelerator gpu --trainer.devices 2 --trainer.strategy ddp --trainer.max_epochs=1 --trainer.limit_train_batches=1 --trainer.limit_val_batches=1 --trainer.limit_test_batches=1" MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=1 python ../../examples/convert_from_pt_to_pl/image_classifier_5_lightning_datamodule.py ${args} & MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=0 python ../../examples/convert_from_pt_to_pl/image_classifier_5_lightning_datamodule.py ${args} -report+="Ran\tmanual ddp launch test\n" - -# echo test report -printf '=%.s' {1..80} -printf "\n$report" -printf '=%.s' {1..80} -printf '\n' diff --git a/tests/tests_pytorch/run_standalone_tests.sh b/tests/tests_pytorch/run_standalone_tests.sh index 55a0d330f6188..7e9292f4458db 100644 --- a/tests/tests_pytorch/run_standalone_tests.sh +++ b/tests/tests_pytorch/run_standalone_tests.sh @@ -16,18 +16,8 @@ set -e # THIS FILE ASSUMES IT IS RUN INSIDE THE tests/tests_pytorch DIRECTORY # Batch size for testing: Determines how many standalone test invocations run in parallel -test_batch_size=6 - -while getopts "b:" opt; do - case $opt in - b) - test_batch_size=$OPTARG;; - *) - echo "Usage: $(basename $0) [-b batch_size]" - exit 1;; - esac -done -shift $((OPTIND-1)) +# It can be set through the env variable PL_STANDALONE_TESTS_BATCH_SIZE and defaults to 6 if not set +test_batch_size="${PL_STANDALONE_TESTS_BATCH_SIZE:-6}" # this environment variable allows special tests to run export PL_RUN_STANDALONE_TESTS=1 @@ -93,7 +83,6 @@ done # wait for leftover tests for pid in ${pids[*]}; do wait $pid; done show_batched_output -echo "Batched mode finished. End of standalone tests." 
# echo test report printf '=%.s' {1..80} From c019fc633d0b4e1c5b7e384216aab13149c90c9e Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 29 Jul 2022 12:50:26 +0200 Subject: [PATCH 042/230] meta pkg: wrap imports for traceability (#13924) --- .actions/setup_tools.py | 37 +++++++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/.actions/setup_tools.py b/.actions/setup_tools.py index 3a105f508fd45..2aff3bdf9a141 100644 --- a/.actions/setup_tools.py +++ b/.actions/setup_tools.py @@ -270,6 +270,37 @@ def prune_comments_docstrings(lines: List[str]) -> List[str]: return body +def wrap_try_except(body: List[str], pkg: str, ver: str) -> List[str]: + """Wrap the file with try/except for better traceability of import misalignment.""" + not_empty = sum(1 for ln in body if ln) + if not_empty == 0: + return body + body = ["try:"] + [f" {ln}" if ln else "" for ln in body] + body += [ + "", + "except ImportError as err:", + "", + " from os import linesep", + f" from {pkg} import __version__", + f" msg = f'Your `lightning` package was built for `{pkg}=={ver}`," + " but you are running {__version__}'", + " raise type(err)(str(err) + linesep + msg)", + ] + return body + + +def parse_version_from_file(pkg_root: str) -> str: + """Loading the package version from file.""" + file_ver = os.path.join(pkg_root, "__version__.py") + file_about = os.path.join(pkg_root, "__about__.py") + if os.path.isfile(file_ver): + ver = _load_py_module("version", file_ver).version + elif os.path.isfile(file_about): + ver = _load_py_module("about", file_about).__version__ + else: # this covers case you have build only meta-package so not additional source files are present + ver = "" + return ver + + def create_meta_package(src_folder: str, pkg_name: str = "pytorch_lightning", lit_name: str = "pytorch"): """Parse the real python package and for each module create a mirroe version with repalcing all function and class implementations by cross-imports to the true package. @@ -279,6 +310,7 @@ class implementations by cross-imports to the true package. >>> create_meta_package(os.path.join(_PROJECT_ROOT, "src")) """ package_dir = os.path.join(src_folder, pkg_name) + pkg_ver = parse_version_from_file(package_dir) # shutil.rmtree(os.path.join(src_folder, "lightning", lit_name)) py_files = glob.glob(os.path.join(src_folder, pkg_name, "**", "*.py"), recursive=True) for py_file in py_files: @@ -310,15 +342,16 @@ class implementations by cross-imports to the true package. while body_len != len(body): body_len = len(body) body = prune_empty_statements(body) - # TODO: add try/catch wrapper for whole body, + # add try/catch wrapper for whole body, # so when import fails it tells you what is the package version this meta package was generated for... 
+ body = wrap_try_except(body, pkg_name, pkg_ver) # todo: apply pre-commit formatting body = [ln for ln, _group in groupby(body)] lines = [] # drop duplicated lines for ln in body: - if ln + os.linesep not in lines or ln in (")", ""): + if ln + os.linesep not in lines or ln.lstrip() in (")", ""): lines.append(ln + os.linesep) # compose the target file name new_file = os.path.join(src_folder, "lightning", lit_name, local_path) From c136ef5b9a963e3c509b65a160867726e435bd56 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 29 Jul 2022 14:37:33 +0200 Subject: [PATCH 043/230] meta pkg: set version as today (#13906) --- .actions/setup_tools.py | 15 +++++++++++++++ setup.py | 4 +++- src/lightning/__version__.py | 2 +- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/.actions/setup_tools.py b/.actions/setup_tools.py index 2aff3bdf9a141..3e6a0ee0cbf84 100644 --- a/.actions/setup_tools.py +++ b/.actions/setup_tools.py @@ -20,6 +20,7 @@ import tarfile import tempfile import urllib.request +from datetime import datetime from importlib.util import module_from_spec, spec_from_file_location from itertools import groupby from types import ModuleType @@ -360,6 +361,20 @@ class implementations by cross-imports to the true package. fp.writelines(lines) +def set_version_today(fpath: str) -> None: + """Replace the template date with today.""" + with open(fpath) as fp: + lines = fp.readlines() + + def _replace_today(ln): + today = datetime.now() + return ln.replace("YYYY.-M.-D", f"{today.year}.{today.month}.{today.day}") + + lines = list(map(_replace_today, lines)) + with open(fpath, "w") as fp: + fp.writelines(lines) + + def _download_frontend(root: str = _PROJECT_ROOT): """Downloads an archive file for a specific release of the Lightning frontend and extracts it to the correct directory.""" diff --git a/setup.py b/setup.py index a542b3c1e0291..013729af02aa9 100755 --- a/setup.py +++ b/setup.py @@ -59,7 +59,8 @@ # https://packaging.python.org/guides/single-sourcing-package-version/ # http://blog.ionelmc.ro/2014/05/25/python-packaging/ _PATH_ROOT = os.path.dirname(__file__) -_PATH_SETUP = os.path.join(_PATH_ROOT, "src", _REAL_PKG_NAME or "lightning", "__setup__.py") +_PATH_SRC = os.path.join(_PATH_ROOT, "src") +_PATH_SETUP = os.path.join(_PATH_SRC, _REAL_PKG_NAME or "lightning", "__setup__.py") # Hardcode the env variable from time of package creation, otherwise it fails during installation @@ -88,6 +89,7 @@ def _load_py_module(name: str, location: str) -> ModuleType: # engineer specific practices if __name__ == "__main__": _SETUP_TOOLS = _load_py_module(name="setup_tools", location=os.path.join(".actions", "setup_tools.py")) + _SETUP_TOOLS.set_version_today(os.path.join(_PATH_SRC, "lightning", "__version__.py")) for lit_name, pkg_name in _PACKAGE_MAPPING.items(): # fixme: if we run creation of meta pkg against stable we shall pull the source _SETUP_TOOLS.create_meta_package(os.path.join(_PATH_ROOT, "src"), pkg_name, lit_name) diff --git a/src/lightning/__version__.py b/src/lightning/__version__.py index 8f2d7b1403a7b..1a929693c0e53 100644 --- a/src/lightning/__version__.py +++ b/src/lightning/__version__.py @@ -1 +1 @@ -version = "2022.7.18" +version = "YYYY.-M.-D" From cd92e3541eab86a837b07515814c93e0ed1cac9f Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 29 Jul 2022 15:04:57 +0200 Subject: [PATCH 044/230] prune func calls in meta pkg init (#13742) * prune func calls in meta pkg init * move calling * prune * coped Co-authored-by: pre-commit-ci[bot] 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .actions/setup_tools.py | 61 ++++++++++++++++++----- .github/actions/pkg-check/action.yml | 12 +++-- src/pytorch_lightning/loggers/__init__.py | 18 +++---- 3 files changed, 66 insertions(+), 25 deletions(-) diff --git a/.actions/setup_tools.py b/.actions/setup_tools.py index 3e6a0ee0cbf84..08678b43848bd 100644 --- a/.actions/setup_tools.py +++ b/.actions/setup_tools.py @@ -151,6 +151,7 @@ def replace_vars_with_imports(lines: List[str], import_path: str) -> List[str]: ... lines = [ln.rstrip() for ln in fp.readlines()] >>> lines = replace_vars_with_imports(lines, import_path) """ + copied = [] body, tracking, skip_offset = [], False, 0 for ln in lines: offset = len(ln) - len(ln.lstrip()) @@ -161,8 +162,9 @@ def replace_vars_with_imports(lines: List[str], import_path: str) -> List[str]: if var: name = var.groups()[0] # skip private or apply white-list for allowed vars - if not name.startswith("__") or name in ("__all__",): + if name not in copied and (not name.startswith("__") or name in ("__all__",)): body.append(f"{' ' * offset}from {import_path} import {name} # noqa: F401") + copied.append(name) tracking, skip_offset = True, offset continue if not tracking: @@ -197,6 +199,31 @@ def prune_imports_callables(lines: List[str]) -> List[str]: return body +def prune_func_calls(lines: List[str]) -> List[str]: + """Prune calling functions from a file, even multi-line. + + >>> py_file = os.path.join(_PROJECT_ROOT, "src", "pytorch_lightning", "loggers", "__init__.py") + >>> import_path = ".".join(["pytorch_lightning", "loggers"]) + >>> with open(py_file, encoding="utf-8") as fp: + ... lines = [ln.rstrip() for ln in fp.readlines()] + >>> lines = prune_func_calls(lines) + """ + body, tracking, score = [], False, 0 + for ln in lines: + # catching callable + calling = re.match(r"^@?[\w_\d\.]+ *\(", ln.lstrip()) + if calling and " import " not in ln: + tracking = True + score = 0 + if tracking: + score += ln.count("(") - ln.count(")") + if score == 0: + tracking = False + else: + body.append(ln) + return body + + def prune_empty_statements(lines: List[str]) -> List[str]: """Prune emprty if/else and try/except. @@ -302,6 +329,15 @@ def parse_version_from_file(pkg_root: str) -> str: return ver +def prune_duplicate_lines(body): + body_ = [] + # drop duplicated lines + for ln in body: + if ln.lstrip() not in body_ or ln.lstrip() in (")", ""): + body_.append(ln) + return body_ + + def create_meta_package(src_folder: str, pkg_name: str = "pytorch_lightning", lit_name: str = "pytorch"): """Parse the real python package and for each module create a mirroe version with repalcing all function and class implementations by cross-imports to the true package. @@ -331,34 +367,36 @@ class implementations by cross-imports to the true package. 
logging.warning(f"unsupported file: {local_path}") continue # ToDO: perform some smarter parsing - preserve Constants, lambdas, etc - body = prune_comments_docstrings(lines) + body = prune_comments_docstrings([ln.rstrip() for ln in lines]) if fname not in ("__init__.py", "__main__.py"): body = prune_imports_callables(body) - body = replace_block_with_imports([ln.rstrip() for ln in body], import_path, "class") - body = replace_block_with_imports(body, import_path, "def") - body = replace_block_with_imports(body, import_path, "async def") + for key_word in ("class", "def", "async def"): + body = replace_block_with_imports(body, import_path, key_word) + # TODO: fix reimporting which is artefact after replacing var assignment with import; + # after fixing , update CI by remove F811 from CI/check pkg body = replace_vars_with_imports(body, import_path) + if fname not in ("__main__.py",): + body = prune_func_calls(body) body_len = -1 # in case of several in-depth statements while body_len != len(body): body_len = len(body) + body = prune_duplicate_lines(body) body = prune_empty_statements(body) # add try/catch wrapper for whole body, # so when import fails it tells you what is the package version this meta package was generated for... body = wrap_try_except(body, pkg_name, pkg_ver) # todo: apply pre-commit formatting + # clean to many empty lines body = [ln for ln, _group in groupby(body)] - lines = [] # drop duplicated lines - for ln in body: - if ln + os.linesep not in lines or ln.lstrip() in (")", ""): - lines.append(ln + os.linesep) + body = prune_duplicate_lines(body) # compose the target file name new_file = os.path.join(src_folder, "lightning", lit_name, local_path) os.makedirs(os.path.dirname(new_file), exist_ok=True) with open(new_file, "w", encoding="utf-8") as fp: - fp.writelines(lines) + fp.writelines([ln + os.linesep for ln in body]) def set_version_today(fpath: str) -> None: @@ -380,7 +418,6 @@ def _download_frontend(root: str = _PROJECT_ROOT): directory.""" try: - build_dir = "build" frontend_dir = pathlib.Path(root, "src", "lightning_app", "ui") download_dir = tempfile.mkdtemp() @@ -390,7 +427,7 @@ def _download_frontend(root: str = _PROJECT_ROOT): file = tarfile.open(fileobj=response, mode="r|gz") file.extractall(path=download_dir) - shutil.move(os.path.join(download_dir, build_dir), frontend_dir) + shutil.move(os.path.join(download_dir, "build"), frontend_dir) print("The Lightning UI has successfully been downloaded!") # If installing from source without internet connection, we don't want to break the installation diff --git a/.github/actions/pkg-check/action.yml b/.github/actions/pkg-check/action.yml index aa0ecd3db4968..26ae8ddc88a7c 100644 --- a/.github/actions/pkg-check/action.yml +++ b/.github/actions/pkg-check/action.yml @@ -14,13 +14,19 @@ runs: run: pip install "twine==4.0.1" setuptools wheel flake8 shell: bash - - name: Create package + - name: Source check env: PACKAGE_NAME: ${{ inputs.pkg-name }} run: | python setup.py check --metadata --strict - flake8 src/lightning/ --ignore E402,F401,E501,W391,E303 - python setup.py sdist bdist_wheel + # TODO: fix reimporting (F811) which is aftefact after rplacing var assigne with import in meta package + flake8 src/lightning/ --ignore E402,F401,E501,W391,E303,F811 + shell: bash + + - name: Create package + env: + PACKAGE_NAME: ${{ inputs.pkg-name }} + run: python setup.py sdist bdist_wheel shell: bash - name: Check package diff --git a/src/pytorch_lightning/loggers/__init__.py b/src/pytorch_lightning/loggers/__init__.py index 
b5afba4bba9e3..c97a7a09d9e7f 100644 --- a/src/pytorch_lightning/loggers/__init__.py +++ b/src/pytorch_lightning/loggers/__init__.py @@ -11,26 +11,24 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from os import environ +import os -from pytorch_lightning.loggers.base import ( # LightningLoggerBase imported for backward compatibility - LightningLoggerBase, -) +# LightningLoggerBase imported for backward compatibility +from pytorch_lightning.loggers.base import LightningLoggerBase +from pytorch_lightning.loggers.comet import _COMET_AVAILABLE, CometLogger # noqa: F401 from pytorch_lightning.loggers.csv_logs import CSVLogger from pytorch_lightning.loggers.logger import Logger, LoggerCollection -from pytorch_lightning.loggers.tensorboard import TensorBoardLogger - -__all__ = ["CSVLogger", "LightningLoggerBase", "Logger", "LoggerCollection", "TensorBoardLogger"] - -from pytorch_lightning.loggers.comet import _COMET_AVAILABLE, CometLogger # noqa: F401 from pytorch_lightning.loggers.mlflow import _MLFLOW_AVAILABLE, MLFlowLogger # noqa: F401 from pytorch_lightning.loggers.neptune import NeptuneLogger # noqa: F401 +from pytorch_lightning.loggers.tensorboard import TensorBoardLogger from pytorch_lightning.loggers.wandb import WandbLogger # noqa: F401 +__all__ = ["CSVLogger", "LightningLoggerBase", "Logger", "LoggerCollection", "TensorBoardLogger"] + if _COMET_AVAILABLE: __all__.append("CometLogger") # needed to prevent ModuleNotFoundError and duplicated logs. - environ["COMET_DISABLE_AUTO_LOGGING"] = "1" + os.environ["COMET_DISABLE_AUTO_LOGGING"] = "1" if _MLFLOW_AVAILABLE: __all__.append("MLFlowLogger") From aefb9ab43f9a8e6704558a346dbae1a00044bb45 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Fri, 29 Jul 2022 16:44:52 +0200 Subject: [PATCH 045/230] (app) Introduce LightningTrainingComponent (#13830) --- .gitignore | 1 + docs/source-app/api_reference/components.rst | 1 + examples/app_multi_node/app.py | 11 + examples/app_multi_node/{ => bare}/.gitignore | 0 .../app_multi_node/{ => bare}/multi_node.py | 0 .../{ => bare}/requirements.txt | 0 examples/app_multi_node/train.py | 7 + src/lightning_app/CHANGELOG.md | 2 + src/lightning_app/components/python/tracer.py | 48 ++++- src/lightning_app/components/training.py | 192 ++++++++++++++++++ src/lightning_app/core/flow.py | 1 + src/lightning_app/runners/backends/backend.py | 1 - src/lightning_app/source_code/local.py | 1 + src/lightning_app/structures/dict.py | 7 +- src/lightning_app/testing/testing.py | 3 + src/lightning_app/utilities/network.py | 2 +- .../utilities/packaging/tarfile.py | 39 ++++ src/lightning_app/utilities/state.py | 57 +++++- .../components/python/test_python.py | 52 +++++ tests/tests_app/utilities/test_state.py | 39 ++++ tests/tests_app_examples/test_multi_node.py | 29 +++ 21 files changed, 485 insertions(+), 8 deletions(-) create mode 100644 examples/app_multi_node/app.py rename examples/app_multi_node/{ => bare}/.gitignore (100%) rename examples/app_multi_node/{ => bare}/multi_node.py (100%) rename examples/app_multi_node/{ => bare}/requirements.txt (100%) create mode 100644 examples/app_multi_node/train.py create mode 100644 src/lightning_app/components/training.py create mode 100644 src/lightning_app/utilities/packaging/tarfile.py create mode 100644 tests/tests_app_examples/test_multi_node.py diff --git a/.gitignore b/.gitignore index 7040a912974e1..0f03c69600bed 100644 --- a/.gitignore +++ 
b/.gitignore @@ -163,3 +163,4 @@ src/lightning_app/ui/* *examples/template_react_ui* hars* artifacts/* +*docs/examples* diff --git a/docs/source-app/api_reference/components.rst b/docs/source-app/api_reference/components.rst index 76a99402ddecc..c5f99f0f96629 100644 --- a/docs/source-app/api_reference/components.rst +++ b/docs/source-app/api_reference/components.rst @@ -20,5 +20,6 @@ ___________________ ~python.popen.PopenPythonScript ~python.tracer.TracerPythonScript + ~training.LightningTrainingComponent ~serve.gradio.ServeGradio ~serve.serve.ModelInferenceAPI diff --git a/examples/app_multi_node/app.py b/examples/app_multi_node/app.py new file mode 100644 index 0000000000000..6e405a346a143 --- /dev/null +++ b/examples/app_multi_node/app.py @@ -0,0 +1,11 @@ +from lightning import LightningApp +from lightning.app.components.training import LightningTrainingComponent +from lightning.app.utilities.packaging.cloud_compute import CloudCompute + +app = LightningApp( + LightningTrainingComponent( + "train.py", + num_nodes=2, + cloud_compute=CloudCompute("gpu-fast-multi"), + ), +) diff --git a/examples/app_multi_node/.gitignore b/examples/app_multi_node/bare/.gitignore similarity index 100% rename from examples/app_multi_node/.gitignore rename to examples/app_multi_node/bare/.gitignore diff --git a/examples/app_multi_node/multi_node.py b/examples/app_multi_node/bare/multi_node.py similarity index 100% rename from examples/app_multi_node/multi_node.py rename to examples/app_multi_node/bare/multi_node.py diff --git a/examples/app_multi_node/requirements.txt b/examples/app_multi_node/bare/requirements.txt similarity index 100% rename from examples/app_multi_node/requirements.txt rename to examples/app_multi_node/bare/requirements.txt diff --git a/examples/app_multi_node/train.py b/examples/app_multi_node/train.py new file mode 100644 index 0000000000000..f14809354f405 --- /dev/null +++ b/examples/app_multi_node/train.py @@ -0,0 +1,7 @@ +from lightning.pytorch import Trainer +from lightning.pytorch.demos.boring_classes import BoringModel + +if __name__ == "__main__": + model = BoringModel() + trainer = Trainer(max_epochs=1) + trainer.fit(model) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 95a7000818b78..89fcd615430aa 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -10,6 +10,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Add support for `Lightning App Commands` through the `configure_commands` hook on the Lightning Flow and the `ClientCommand` ([#13602](https://github.com/Lightning-AI/lightning/pull/13602)) +- Adds `LightningTrainingComponent`. 
`LightningTrainingComponent` orchestrates multi-node training in the cloud ([#13830](https://github.com/Lightning-AI/lightning/pull/13830)) + ### Changed - Update the Lightning App docs ([#13537](https://github.com/Lightning-AI/lightning/pull/13537)) diff --git a/src/lightning_app/components/python/tracer.py b/src/lightning_app/components/python/tracer.py index fa955646acbbf..b98c782e138e4 100644 --- a/src/lightning_app/components/python/tracer.py +++ b/src/lightning_app/components/python/tracer.py @@ -2,16 +2,24 @@ import os import signal import sys -from typing import Any, Dict, List, Optional, Union +from copy import deepcopy +from typing import Any, Dict, List, Optional, TypedDict, Union from lightning_app import LightningWork +from lightning_app.storage.drive import Drive from lightning_app.storage.payload import Payload from lightning_app.utilities.app_helpers import _collect_child_process_pids +from lightning_app.utilities.packaging.tarfile import clean_tarfile, extract_tarfile from lightning_app.utilities.tracer import Tracer logger = logging.getLogger(__name__) +class Code(TypedDict): + drive: Drive + name: str + + class TracerPythonScript(LightningWork): def on_before_run(self): """Called before the python script is executed.""" @@ -31,6 +39,7 @@ def __init__( script_args: Optional[Union[list, str]] = None, outputs: Optional[List[str]] = None, env: Optional[Dict] = None, + code: Optional[Code] = None, **kwargs, ): """The TracerPythonScript class enables to easily run a python script. @@ -97,17 +106,46 @@ def __init__( if isinstance(script_args, str): script_args = script_args.split(" ") self.script_args = script_args if script_args else [] + self.original_args = deepcopy(self.script_args) self.env = env self.outputs = outputs or [] for name in self.outputs: setattr(self, name, None) + self.params = None + self.drive = code.get("drive") if code else None + self.code_name = code.get("name") if code else None + self.restart_count = 0 + + def run(self, params: Optional[Dict[str, Any]] = None, restart_count: Optional[int] = None, **kwargs): + """ + Arguments: + params: A dictionary of arguments to be added to script_args. + restart_count: Passes an incrementing counter to enable the re-execution of LightningWorks.
+ """ + if restart_count: + self.restart_count = restart_count + + if params: + self.params = params + self.script_args = self.original_args + [self._to_script_args(k, v) for k, v in params.items()] + + if self.drive: + assert self.code_name + if os.path.exists(self.code_name): + clean_tarfile(self.code_name, "r:gz") + + if self.code_name in self.drive.list(): + self.drive.get(self.code_name) + extract_tarfile(self.code_name, ".", "r:gz") - def run(self, **kwargs): if not os.path.exists(self.script_path): raise FileNotFoundError(f"The provided `script_path` {self.script_path}` wasn't found.") + kwargs = {k: v.value if isinstance(v, Payload) else v for k, v in kwargs.items()} + init_globals = globals() init_globals.update(kwargs) + self.on_before_run() env_copy = os.environ.copy() if self.env: @@ -125,5 +163,11 @@ def on_exit(self): for child_pid in _collect_child_process_pids(os.getpid()): os.kill(child_pid, signal.SIGTERM) + @staticmethod + def _to_script_args(k: str, v: str) -> str: + if k.startswith("--"): + return f"{k}={v}" + return f"--{k}={v}" + __all__ = ["TracerPythonScript"] diff --git a/src/lightning_app/components/training.py b/src/lightning_app/components/training.py new file mode 100644 index 0000000000000..9773fe9670e52 --- /dev/null +++ b/src/lightning_app/components/training.py @@ -0,0 +1,192 @@ +import logging +import os +from typing import Any, Dict, List, Optional, Tuple, Type, Union + +from lightning import CloudCompute +from lightning_app import LightningFlow, structures +from lightning_app.components.python import TracerPythonScript +from lightning_app.storage.path import Path + +_logger = logging.getLogger(__name__) + + +class PyTorchLightningScriptRunner(TracerPythonScript): + def __init__( + self, + script_path: str, + script_args: Optional[Union[list, str]] = None, + node_rank: int = 1, + num_nodes: int = 1, + sanity_serving: bool = False, + cloud_compute: Optional[CloudCompute] = None, + parallel: bool = True, + raise_exception: bool = True, + env: Optional[Dict[str, Any]] = None, + **kwargs, + ): + super().__init__( + script_path, + script_args, + raise_exception=raise_exception, + parallel=parallel, + cloud_compute=cloud_compute, + **kwargs, + ) + self.node_rank = node_rank + self.num_nodes = num_nodes + self.best_model_path = None + self.best_model_score = None + self.monitor = None + self.sanity_serving = sanity_serving + self.has_finished = False + self.env = env + + def configure_tracer(self): + from pytorch_lightning import Trainer + + tracer = super().configure_tracer() + tracer.add_traced(Trainer, "__init__", pre_fn=self._trainer_init_pre_middleware) + return tracer + + def run(self, internal_urls: Optional[List[Tuple[str, str]]] = None, **kwargs) -> None: + if not internal_urls: + # Note: This is called only once. 
+ _logger.info(f"The node {self.node_rank} started !") + return None + + if self.env: + os.environ.update(self.env) + + distributed_env_vars = { + "MASTER_ADDR": internal_urls[0][0], + "MASTER_PORT": str(internal_urls[0][1]), + "NODE_RANK": str(self.node_rank), + "PL_TRAINER_NUM_NODES": str(self.num_nodes), + "PL_TRAINER_DEVICES": "auto", + "PL_TRAINER_ACCELERATOR": "auto", + } + + os.environ.update(distributed_env_vars) + return super().run(**kwargs) + + def on_after_run(self, script_globals): + from pytorch_lightning import Trainer + from pytorch_lightning.cli import LightningCLI + + for v in script_globals.values(): + if isinstance(v, LightningCLI): + trainer = v.trainer + break + elif isinstance(v, Trainer): + trainer = v + break + else: + raise RuntimeError("No trainer instance found.") + + self.monitor = trainer.checkpoint_callback.monitor + + if trainer.checkpoint_callback.best_model_score: + self.best_model_path = Path(trainer.checkpoint_callback.best_model_path) + self.best_model_score = float(trainer.checkpoint_callback.best_model_score) + else: + self.best_model_path = Path(trainer.checkpoint_callback.last_model_path) + + self.has_finished = True + + def _trainer_init_pre_middleware(self, trainer, *args, **kwargs): + if self.node_rank != 0: + return {}, args, kwargs + + from pytorch_lightning.serve import ServableModuleValidator + + callbacks = kwargs.get("callbacks", []) + if self.sanity_serving: + callbacks = callbacks + [ServableModuleValidator()] + kwargs["callbacks"] = callbacks + return {}, args, kwargs + + @property + def is_running_in_cloud(self) -> bool: + return "LIGHTNING_APP_STATE_URL" in os.environ + + +class LightningTrainingComponent(LightningFlow): + def __init__( + self, + script_path: str, + script_args: Optional[Union[list, str]] = None, + num_nodes: int = 1, + cloud_compute: CloudCompute = CloudCompute("default"), + sanity_serving: bool = False, + script_runner: Type[TracerPythonScript] = PyTorchLightningScriptRunner, + **script_runner_kwargs, + ): + """This component enables performing distributed multi-node multi-device training. + + Example:: + + from lightning import LightningApp + from lightning.app.components.training import LightningTrainingComponent + from lightning.app.utilities.packaging.cloud_compute import CloudCompute + + app = LightningApp( + LightningTrainingComponent( + "train.py", + num_nodes=2, + cloud_compute=CloudCompute("gpu"), + ), + ) + + Arguments: + script_path: Path to the script to be executed. + script_args: The arguments to be pass to the script. + num_nodes: Number of nodes. + cloud_compute: The cloud compute object used in the cloud. 
+ sanity_serving: Whether to validate that the model correctly implements + the ServableModule API + """ + super().__init__() + self.ws = structures.List() + self.has_initialized = False + self.script_path = script_path + self.script_args = script_args + self.num_nodes = num_nodes + self._cloud_compute = cloud_compute # TODO: Add support for cloudCompute + self.sanity_serving = sanity_serving + self._script_runner = script_runner + self._script_runner_kwargs = script_runner_kwargs + + def run(self, **run_kwargs): + if not self.has_initialized: + for node_rank in range(self.num_nodes): + self.ws.append( + self._script_runner( + script_path=self.script_path, + script_args=self.script_args, + cloud_compute=self._cloud_compute, + node_rank=node_rank, + sanity_serving=self.sanity_serving, + num_nodes=self.num_nodes, + **self._script_runner_kwargs, + ) + ) + + self.has_initialized = True + + for work in self.ws: + if all(w.internal_ip for w in self.ws): + internal_urls = [(w.internal_ip, w.port) for w in self.ws] + work.run(internal_urls=internal_urls, **run_kwargs) + if all(w.has_finished for w in self.ws): + for w in self.ws: + w.stop() + else: + work.run() + + @property + def best_model_score(self) -> Optional[float]: + return self.ws[0].best_model_score + + @property + def best_model_paths(self) -> List[Optional[Path]]: + return [self.ws[node_idx].best_mode_path for node_idx in range(len(self.ws))] diff --git a/src/lightning_app/core/flow.py b/src/lightning_app/core/flow.py index d1af891476a02..f6b6e34e81538 100644 --- a/src/lightning_app/core/flow.py +++ b/src/lightning_app/core/flow.py @@ -209,6 +209,7 @@ def _attach_backend(flow: "LightningFlow", backend): LightningFlow._attach_backend(flow, backend) for work in structure.works: backend._wrap_run_method(_LightningAppRef().get_current(), work) + work._backend = backend for name in flow._structures: getattr(flow, name)._backend = backend diff --git a/src/lightning_app/runners/backends/backend.py b/src/lightning_app/runners/backends/backend.py index c370c7098b778..87bb103823fd2 100644 --- a/src/lightning_app/runners/backends/backend.py +++ b/src/lightning_app/runners/backends/backend.py @@ -87,7 +87,6 @@ def _prepare_queues(self, app): app.commands_metadata_queue = self.queues.get_commands_metadata_queue(**kw) app.error_queue = self.queues.get_error_queue(**kw) app.delta_queue = self.queues.get_delta_queue(**kw) - app.error_queue = self.queues.get_error_queue(**kw) app.api_publish_state_queue = self.queues.get_api_state_publish_queue(**kw) app.api_delta_queue = self.queues.get_api_delta_queue(**kw) app.request_queues = {} diff --git a/src/lightning_app/source_code/local.py b/src/lightning_app/source_code/local.py index a42347ac42101..05669dff2f6a5 100644 --- a/src/lightning_app/source_code/local.py +++ b/src/lightning_app/source_code/local.py @@ -94,6 +94,7 @@ def upload(self, url: str) -> None: raise OSError( "cannot upload directory code whose total fize size is greater than 2GB (2e9 bytes)" ) from None + uploader = FileUploader( presigned_url=url, source_file=str(self.package_path), diff --git a/src/lightning_app/structures/dict.py b/src/lightning_app/structures/dict.py index 2aa02d4ebfa50..93e2b161b2e7a 100644 --- a/src/lightning_app/structures/dict.py +++ b/src/lightning_app/structures/dict.py @@ -58,7 +58,10 @@ def __init__(self, **kwargs: T): def __setitem__(self, k, v): from lightning_app import LightningFlow, LightningWork - if "." 
in k: + if not isinstance(k, str): + raise Exception("The provided key should be an string") + + if isinstance(k, str) and "." in k: raise Exception(f"The provided name {k} contains . which is forbidden.") if self._backend: @@ -67,7 +70,7 @@ def __setitem__(self, k, v): _set_child_name(self, v, k) elif isinstance(v, LightningWork): self._backend._wrap_run_method(_LightningAppRef().get_current(), v) - v._name = f"{self.name}.{k}" + v._name = f"{self.name}.{k}" super().__setitem__(k, v) @property diff --git a/src/lightning_app/testing/testing.py b/src/lightning_app/testing/testing.py index bdf37cacf04a7..cc03f5badec2b 100644 --- a/src/lightning_app/testing/testing.py +++ b/src/lightning_app/testing/testing.py @@ -23,6 +23,7 @@ from lightning_app.utilities.cloud import _get_project from lightning_app.utilities.imports import _is_playwright_available, requires from lightning_app.utilities.network import _configure_session, LightningClient +from lightning_app.utilities.proxies import ProxyWorkRun if _is_playwright_available(): import playwright @@ -114,6 +115,8 @@ def run_work_isolated(work, *args, start_server: bool = False, **kwargs): # pop the stopped status. call_hash = work._calls["latest_call_hash"] work._calls[call_hash]["statuses"].pop(-1) + if isinstance(work.run, ProxyWorkRun): + work.run = work.run.work_run def browser_context_args(browser_context_args: Dict) -> Dict: diff --git a/src/lightning_app/utilities/network.py b/src/lightning_app/utilities/network.py index 98c7db3d46ff8..a9ebcf37ab564 100644 --- a/src/lightning_app/utilities/network.py +++ b/src/lightning_app/utilities/network.py @@ -48,7 +48,7 @@ def _configure_session() -> Session: return http -def _check_service_url_is_ready(url: str, timeout: float = 0.5) -> bool: +def _check_service_url_is_ready(url: str, timeout: float = 1) -> bool: try: response = requests.get(url, timeout=timeout) return response.status_code in (200, 404) diff --git a/src/lightning_app/utilities/packaging/tarfile.py b/src/lightning_app/utilities/packaging/tarfile.py new file mode 100644 index 0000000000000..123e4e2e0942a --- /dev/null +++ b/src/lightning_app/utilities/packaging/tarfile.py @@ -0,0 +1,39 @@ +import os +import shutil +import tarfile + + +def clean_tarfile(file_path: str, mode: str) -> None: + """This utility removes all files extracted from a tarfile.""" + + if not os.path.exists(file_path): + return None + + with tarfile.open(file_path, mode=mode) as tar_ref: + for member in tar_ref.getmembers(): + p = member.path + if p == "." 
or not os.path.exists(p): + continue + try: + if os.path.isfile(p): + os.remove(p) + else: + shutil.rmtree(p) + except (FileNotFoundError, OSError, PermissionError): + pass + + if os.path.exists(file_path): + os.remove(file_path) + + +def extract_tarfile(file_path: str, extract_path: str, mode: str) -> None: + """This utility extracts all files from a tarfile.""" + if not os.path.exists(file_path): + return None + + with tarfile.open(file_path, mode=mode) as tar_ref: + for member in tar_ref.getmembers(): + try: + tar_ref.extract(member, path=extract_path, set_attrs=False) + except PermissionError: + raise PermissionError(f"Could not extract tar file {file_path}") diff --git a/src/lightning_app/utilities/state.py b/src/lightning_app/utilities/state.py index 0802a426e7349..5cd7979de09d9 100644 --- a/src/lightning_app/utilities/state.py +++ b/src/lightning_app/utilities/state.py @@ -3,7 +3,7 @@ import logging import os from copy import deepcopy -from typing import Any, Dict, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union from deepdiff import DeepDiff from requests import Session @@ -168,7 +168,7 @@ def __getattr__(self, name: str) -> Union[Any, "AppState"]: # The state needs to be fetched on access if it doesn't exist. self._request_state() - if name in self._state["vars"]: + if name in self._state.get("vars", {}): value = self._state["vars"][name] if isinstance(value, dict): return _maybe_create_drive("root." + ".".join(self._my_affiliation), value) @@ -187,12 +187,23 @@ def __getattr__(self, name: str) -> Union[Any, "AppState"]: state=self._state["flows"][name], ) + elif name in self._state.get("structures", {}): + return AppState( + self._host, + self._port, + last_state=self._last_state["structures"][name], + state=self._state["structures"][name], + ) + raise AttributeError( f"Failed to access '{name}' through `AppState`. The state provides:" f" Variables: {list(self._state['vars'].keys())}," f" Components: {list(self._state.get('flows', {}).keys()) + list(self._state.get('works', {}).keys())}", ) + def __getitem__(self, key: str): + return self.__getattr__(key) + def __setattr__(self, name: str, value: Any) -> None: if name in self._APP_PRIVATE_KEYS: object.__setattr__(self, name, value) @@ -226,6 +237,48 @@ def __repr__(self) -> str: def __bool__(self) -> bool: return bool(self._state) + def __len__(self) -> int: + # The state needs to be fetched on access if it doesn't exist. + self._request_state() + + keys = [] + for component in ["flows", "works", "structures"]: + keys.extend(list(self._state.get(component, {}))) + return len(keys) + + def items(self) -> List[Dict[str, Any]]: + # The state needs to be fetched on access if it doesn't exist. 
+ self._request_state() + + items = [] + for component in ["flows", "works"]: + state = self._state.get(component, {}) + last_state = self._last_state.get(component, {}) + for name, state_value in state.items(): + v = AppState( + self._host, + self._port, + last_state=last_state[name], + state=state_value, + ) + items.append((name, v)) + + structures = self._state.get("structures", {}) + last_structures = self._last_state.get("structures", {}) + if structures: + for component in ["flows", "works"]: + state = structures.get(component, {}) + last_state = last_structures.get(component, {}) + for name, state_value in state.items(): + v = AppState( + self._host, + self._port, + last_state=last_state[name], + state=state_value, + ) + items.append((name, v)) + return items + @staticmethod def _configure_session() -> Session: return _configure_session() diff --git a/tests/tests_app/components/python/test_python.py b/tests/tests_app/components/python/test_python.py index 61969ef1c4c51..678655d6ee908 100644 --- a/tests/tests_app/components/python/test_python.py +++ b/tests/tests_app/components/python/test_python.py @@ -1,11 +1,15 @@ import os +import tarfile import pytest from tests_app import _PROJECT_ROOT from lightning_app.components.python import PopenPythonScript, TracerPythonScript +from lightning_app.components.python.tracer import Code +from lightning_app.storage.drive import Drive from lightning_app.testing.helpers import RunIf from lightning_app.testing.testing import run_work_isolated +from lightning_app.utilities.component import _set_work_context COMPONENTS_SCRIPTS_FOLDER = str(os.path.join(_PROJECT_ROOT, "tests/tests_app/components/python/scripts/")) @@ -69,3 +73,51 @@ def test_tracer_python_script_with_kwargs(): ) run_work_isolated(python_script) assert python_script.has_failed + + +def test_tracer_component_with_code(): + """This test ensures the Tracer Component gets the latest code from the code object that is provided and + arguments are cleaned.""" + + drive = Drive("lit://code") + drive.component_name = "something" + code = Code(drive=drive, name="sample.tar.gz") + + with open("file.py", "w") as f: + f.write('raise Exception("An error")') + + with tarfile.open("sample.tar.gz", "w:gz") as tar: + tar.add("file.py") + + drive.put("sample.tar.gz") + os.remove("file.py") + os.remove("sample.tar.gz") + + python_script = TracerPythonScript("file.py", script_args=["--b=1"], raise_exception=False, code=code) + run_work_isolated(python_script, params={"a": "1"}, restart_count=0) + assert python_script.status.message == "An error" + + with open("file.py", "w") as f: + f.write("import sys\n") + f.write("print(sys.argv)\n") + + with tarfile.open("sample.tar.gz", "w:gz") as tar: + tar.add("file.py") + + _set_work_context() + drive.put("sample.tar.gz") + os.remove("file.py") + os.remove("sample.tar.gz") + + with open("file.py", "w") as f: + f.write('raise Exception("An error")') + + call_hash = python_script._calls["latest_call_hash"] + python_script._calls[call_hash]["statuses"].pop(-1) + python_script._calls[call_hash]["statuses"].pop(-1) + + run_work_isolated(python_script, params={"a": "1"}, restart_count=1) + assert python_script.has_succeeded + assert python_script.script_args == ["--b=1", "--a=1"] + os.remove("file.py") + os.remove("sample.tar.gz") diff --git a/tests/tests_app/utilities/test_state.py b/tests/tests_app/utilities/test_state.py index 0740ffc615b87..3b9f1b790cfc7 100644 --- a/tests/tests_app/utilities/test_state.py +++ b/tests/tests_app/utilities/test_state.py @@ -7,6 +7,7 
@@ import lightning_app from lightning_app import LightningApp, LightningFlow, LightningWork +from lightning_app.structures import Dict, List from lightning_app.utilities.app_helpers import AppStatePlugin, BaseStatePlugin from lightning_app.utilities.state import AppState @@ -280,3 +281,41 @@ def test_app_state_with_no_env_var(**__): assert state._host == "http://127.0.0.1" assert state._port == 7501 assert state._url == "http://127.0.0.1:7501" + + +class FlowStructures(LightningFlow): + def __init__(self): + super().__init__() + self.w_list = List(Work(), Work()) + self.w_dict = Dict(**{"toto": Work(), "toto_2": Work()}) + + def run(self): + self._exit() + + +class FlowStructuresEmpty(LightningFlow): + def __init__(self): + super().__init__() + self.w_list = List() + self.w_dict = Dict() + + def run(self): + self._exit() + + +def test_app_state_with_structures(): + app = LightningApp(FlowStructures()) + state = AppState() + state._last_state = app.state + state._state = app.state + assert state.w_list["0"].counter == 0 + assert len(state.w_list) == 2 + assert state.w_dict["toto"].counter == 0 + assert [k for k, _ in state.w_dict.items()] == ["toto", "toto_2"] + assert [k for k, _ in state.w_list.items()] == ["0", "1"] + + app = LightningApp(FlowStructuresEmpty()) + state = AppState() + state._last_state = app.state + state._state = app.state + assert state.w_list diff --git a/tests/tests_app_examples/test_multi_node.py b/tests/tests_app_examples/test_multi_node.py new file mode 100644 index 0000000000000..4b5c80c0cd9cb --- /dev/null +++ b/tests/tests_app_examples/test_multi_node.py @@ -0,0 +1,29 @@ +import os + +from tests_app import _PROJECT_ROOT + +from lightning_app.testing.testing import application_testing, LightningTestApp + + +class LightningTestMultiNodeApp(LightningTestApp): + def on_before_run_once(self): + res = super().on_before_run_once() + if all(w.has_finished for w in self.works): + return True + return res + + +def test_multi_node_example(): + cwd = os.getcwd() + new_cwd = os.path.join(_PROJECT_ROOT, "examples/app_multi_node") + os.chdir(new_cwd) + command_line = [ + "app.py", + "--blocking", + "False", + "--open-ui", + "False", + ] + result = application_testing(LightningTestMultiNodeApp, command_line) + assert result.exit_code == 0 + os.chdir(cwd) From 0f6caffa577e2c8d9602dc8fba37c13e128b2120 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Sat, 30 Jul 2022 02:06:51 +0530 Subject: [PATCH 046/230] Fix deepspeed default precision plugin `amp_level` to O2 (#13897) Co-authored-by: Akihiro Nitta --- src/pytorch_lightning/CHANGELOG.md | 3 +++ .../plugins/precision/apex_amp.py | 2 +- .../plugins/precision/deepspeed.py | 13 +++++++++++-- .../precision/test_deepspeed_precision.py | 18 ++++++++++++++++++ .../tests_pytorch/plugins/test_amp_plugins.py | 2 +- 5 files changed, 34 insertions(+), 4 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index aa66df9b54a8b..ea649a9b65236 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -396,6 +396,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed an issue that caused the learning rate finder to set the model's learning rate to None when no suggestion was possible ([#13845](https://github.com/Lightning-AI/lightning/pull/13845)) +- Fixed default `amp_level` for `DeepSpeedPrecisionPlugin` to `O2` ([#13897](https://github.com/PyTorchLightning/pytorch-lightning/pull/13897)) + + ## [1.6.5] - 2022-07-13 diff --git a/src/pytorch_lightning/plugins/precision/apex_amp.py b/src/pytorch_lightning/plugins/precision/apex_amp.py index e18f82dc27f6e..2077f2072ab95 100644 --- a/src/pytorch_lightning/plugins/precision/apex_amp.py +++ b/src/pytorch_lightning/plugins/precision/apex_amp.py @@ -35,7 +35,7 @@ class ApexMixedPrecisionPlugin(MixedPrecisionPlugin): def __init__(self, amp_level: str = "O2") -> None: if not _APEX_AVAILABLE: raise MisconfigurationException( - "You have asked for Apex AMP but you have not installed it." + "You have asked for Apex AMP but `apex` is not installed." " Install `apex` using this guide: https://github.com/NVIDIA/apex" ) super().__init__() diff --git a/src/pytorch_lightning/plugins/precision/deepspeed.py b/src/pytorch_lightning/plugins/precision/deepspeed.py index fa948520e1fd6..791a08a87d107 100644 --- a/src/pytorch_lightning/plugins/precision/deepspeed.py +++ b/src/pytorch_lightning/plugins/precision/deepspeed.py @@ -20,9 +20,9 @@ import pytorch_lightning as pl from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin from pytorch_lightning.utilities import GradClipAlgorithmType -from pytorch_lightning.utilities.enums import PrecisionType +from pytorch_lightning.utilities.enums import AMPType, PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _RequirementAvailable +from pytorch_lightning.utilities.imports import _APEX_AVAILABLE, _RequirementAvailable from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.warnings import WarningCache @@ -51,6 +51,15 @@ class DeepSpeedPrecisionPlugin(PrecisionPlugin): """ def __init__(self, precision: Union[str, int], amp_type: str, amp_level: Optional[str] = None) -> None: + if amp_type == AMPType.APEX: + if not _APEX_AVAILABLE: + raise MisconfigurationException( + "You have asked for Apex AMP but `apex` is not installed." + " Install `apex` using this guide: https://github.com/NVIDIA/apex" + ) + + amp_level = amp_level or "O2" + supported_precision = (PrecisionType.HALF, PrecisionType.FLOAT, PrecisionType.BFLOAT, PrecisionType.MIXED) if precision not in supported_precision: raise ValueError( diff --git a/tests/tests_pytorch/plugins/precision/test_deepspeed_precision.py b/tests/tests_pytorch/plugins/precision/test_deepspeed_precision.py index a4698e7c19c97..c1f7979ea8482 100644 --- a/tests/tests_pytorch/plugins/precision/test_deepspeed_precision.py +++ b/tests/tests_pytorch/plugins/precision/test_deepspeed_precision.py @@ -11,11 +11,29 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from unittest import mock + import pytest from pytorch_lightning.plugins.precision.deepspeed import DeepSpeedPrecisionPlugin +from pytorch_lightning.utilities.exceptions import MisconfigurationException def test_invalid_precision_with_deepspeed_precision(): with pytest.raises(ValueError, match="is not supported. 
`precision` must be one of"): DeepSpeedPrecisionPlugin(precision=64, amp_type="native") + + +def test_deepspeed_precision_apex_not_installed(monkeypatch): + import pytorch_lightning.plugins.precision.deepspeed as deepspeed_apex + + monkeypatch.setattr(deepspeed_apex, "_APEX_AVAILABLE", False) + with pytest.raises(MisconfigurationException, match="You have asked for Apex AMP but `apex` is not installed."): + DeepSpeedPrecisionPlugin(precision=16, amp_type="apex") + + +@mock.patch("pytorch_lightning.plugins.precision.deepspeed._APEX_AVAILABLE", return_value=True) +def test_deepspeed_precision_apex_default_level(_): + precision_plugin = DeepSpeedPrecisionPlugin(precision=16, amp_type="apex") + assert isinstance(precision_plugin, DeepSpeedPrecisionPlugin) + assert precision_plugin.amp_level == "O2" diff --git a/tests/tests_pytorch/plugins/test_amp_plugins.py b/tests/tests_pytorch/plugins/test_amp_plugins.py index b02e3e29e9539..974964e5b9101 100644 --- a/tests/tests_pytorch/plugins/test_amp_plugins.py +++ b/tests/tests_pytorch/plugins/test_amp_plugins.py @@ -289,5 +289,5 @@ def test_precision_selection_raises(monkeypatch): monkeypatch.setattr(apex, "_APEX_AVAILABLE", False) with mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=1), mock.patch( "pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True - ), pytest.raises(MisconfigurationException, match="asked for Apex AMP but you have not installed it"): + ), pytest.raises(MisconfigurationException, match="asked for Apex AMP but `apex` is not installed"): Trainer(amp_backend="apex", precision=16, accelerator="gpu", devices=1) From c65bbe045f19188167d1ca6618d1929e35060195 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 1 Aug 2022 10:00:01 +0200 Subject: [PATCH 047/230] fixing build meta pkg flow (#13926) * debug pulled version * pre flags * only meta --- .github/actions/pkg-install/action.yml | 2 +- .github/workflows/ci_pkg-install.yml | 1 + setup.py | 3 ++- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/actions/pkg-install/action.yml b/.github/actions/pkg-install/action.yml index 652a82d76155b..9f144807ce8a7 100644 --- a/.github/actions/pkg-install/action.yml +++ b/.github/actions/pkg-install/action.yml @@ -28,7 +28,7 @@ runs: - name: Install | Uninstall package - archive working-directory: ./dist run: | - pip install *.tar.gz ${PKG_NAME} + pip install *.tar.gz ${PKG_NAME} ${{ inputs.pip-flags }} pip list | grep lightning python -c "import ${PKG_NAME} ; print(${PKG_NAME}.__version__)" pip uninstall -y ${PKG_NAME} diff --git a/.github/workflows/ci_pkg-install.yml b/.github/workflows/ci_pkg-install.yml index 5d09047663c58..2ecd9a920c6a7 100644 --- a/.github/workflows/ci_pkg-install.yml +++ b/.github/workflows/ci_pkg-install.yml @@ -139,3 +139,4 @@ jobs: - uses: ./.github/actions/pkg-install with: pkg-name: "lightning" + pip-flags: "-U --pre --find-links ../pypi/" diff --git a/setup.py b/setup.py index 013729af02aa9..519829acee02e 100755 --- a/setup.py +++ b/setup.py @@ -89,7 +89,8 @@ def _load_py_module(name: str, location: str) -> ModuleType: # engineer specific practices if __name__ == "__main__": _SETUP_TOOLS = _load_py_module(name="setup_tools", location=os.path.join(".actions", "setup_tools.py")) - _SETUP_TOOLS.set_version_today(os.path.join(_PATH_SRC, "lightning", "__version__.py")) + if _PACKAGE_NAME not in _PACKAGE_MAPPING: + _SETUP_TOOLS.set_version_today(os.path.join(_PATH_SRC, "lightning", "__version__.py")) for lit_name, pkg_name in 
_PACKAGE_MAPPING.items(): # fixme: if we run creation of meta pkg against stable we shall pull the source _SETUP_TOOLS.create_meta_package(os.path.join(_PATH_ROOT, "src"), pkg_name, lit_name) From 8abf32e33a7997e665f58963022590ea589384b7 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 1 Aug 2022 12:52:46 +0200 Subject: [PATCH 048/230] update codecov badge (#13852) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ce2c71db653b8..2fef343425f17 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ ______________________________________________________________________ [![PyPI Status](https://pepy.tech/badge/pytorch-lightning)](https://pepy.tech/project/pytorch-lightning) [![Conda](https://img.shields.io/conda/v/conda-forge/pytorch-lightning?label=conda&color=success)](https://anaconda.org/conda-forge/pytorch-lightning) [![DockerHub](https://img.shields.io/docker/pulls/pytorchlightning/pytorch_lightning.svg)](https://hub.docker.com/r/pytorchlightning/pytorch_lightning) -[![codecov](https://codecov.io/gh/Lightning-AI/lightning/branch/master/graph/badge.svg)](https://codecov.io/gh/Lightning-AI/lightning) +[![codecov](https://codecov.io/gh/Lightning-AI/lightning/branch/master/graph/badge.svg?token=SmzX8mnKlA)](https://codecov.io/gh/Lightning-AI/lightning) [![ReadTheDocs](https://readthedocs.org/projects/pytorch-lightning/badge/?version=stable)](https://pytorch-lightning.readthedocs.io/en/stable/) [![Slack](https://img.shields.io/badge/slack-chat-green.svg?logo=slack)](https://www.pytorchlightning.ai/community) From 47833dfd1adddf0fea6421cd317d903ea49ebda0 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 1 Aug 2022 15:08:43 +0200 Subject: [PATCH 049/230] pkg: parse local versions (#13933) * pkg: parse local versions * offline * str * manifest * ci --- .actions/setup_tools.py | 34 ++++++++++++++++++++++++-- .github/actions/pkg-install/action.yml | 2 +- requirements/base.txt | 2 ++ setup.py | 10 ++++++++ src/lightning/__setup__.py | 29 ++++++++-------------- 5 files changed, 55 insertions(+), 22 deletions(-) create mode 100644 requirements/base.txt diff --git a/.actions/setup_tools.py b/.actions/setup_tools.py index 08678b43848bd..5088be2020738 100644 --- a/.actions/setup_tools.py +++ b/.actions/setup_tools.py @@ -22,7 +22,7 @@ import urllib.request from datetime import datetime from importlib.util import module_from_spec, spec_from_file_location -from itertools import groupby +from itertools import chain, groupby from types import ModuleType from typing import List @@ -45,7 +45,7 @@ def _load_py_module(name: str, location: str) -> ModuleType: def load_requirements( path_dir: str, file_name: str = "base.txt", comment_char: str = "#", unfreeze: bool = True ) -> List[str]: - """Load requirements from a file. + """Loading requirements from a file. 
>>> path_req = os.path.join(_PROJECT_ROOT, "requirements") >>> load_requirements(path_req) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE @@ -433,3 +433,33 @@ def _download_frontend(root: str = _PROJECT_ROOT): # If installing from source without internet connection, we don't want to break the installation except Exception: print("The Lightning UI downloading has failed!") + + +def _adjust_require_versions(source_dir: str = "src", req_dir: str = "requirements") -> None: + """Parse the base requirements and append as version adjustments if needed `pkg>=X1.Y1.Z1,==X2.Y2.*`.""" + reqs = load_requirements(req_dir, file_name="base.txt") + for i, req in enumerate(reqs): + pkg_name = req[: min(req.index(c) for c in ">=" if c in req)] + ver_ = parse_version_from_file(os.path.join(source_dir, pkg_name)) + if not ver_: + continue + ver2 = ".".join(ver_.split(".")[:2] + ["*"]) + reqs[i] = f"{req}, =={ver2}" + + with open(os.path.join(req_dir, "base.txt"), "w") as fp: + fp.writelines([ln + os.linesep for ln in reqs]) + + +def _load_aggregate_requirements(req_dir: str = "requirements", freeze_requirements: bool = False) -> None: + """Load all base requirements from all particular packages and prune duplicates.""" + requires = [ + load_requirements(d, file_name="base.txt", unfreeze=not freeze_requirements) + for d in glob.glob(os.path.join(req_dir, "*")) + if os.path.isdir(d) + ] + if not requires: + return None + # TODO: add some smarter version aggregation per each package + requires = list(chain(*requires)) + with open(os.path.join(req_dir, "base.txt"), "w") as fp: + fp.writelines([ln + os.linesep for ln in requires]) diff --git a/.github/actions/pkg-install/action.yml b/.github/actions/pkg-install/action.yml index 9f144807ce8a7..a6cf0d659b28c 100644 --- a/.github/actions/pkg-install/action.yml +++ b/.github/actions/pkg-install/action.yml @@ -28,7 +28,7 @@ runs: - name: Install | Uninstall package - archive working-directory: ./dist run: | - pip install *.tar.gz ${PKG_NAME} ${{ inputs.pip-flags }} + pip install *.tar.gz ${{ inputs.pip-flags }} pip list | grep lightning python -c "import ${PKG_NAME} ; print(${PKG_NAME}.__version__)" pip uninstall -y ${PKG_NAME} diff --git a/requirements/base.txt b/requirements/base.txt new file mode 100644 index 0000000000000..fea4ee10f4ce7 --- /dev/null +++ b/requirements/base.txt @@ -0,0 +1,2 @@ +pytorch_lightning>=1.6.5 +lightning_app>=0.5.2 diff --git a/setup.py b/setup.py index 519829acee02e..7d4084960d450 100755 --- a/setup.py +++ b/setup.py @@ -60,7 +60,9 @@ # http://blog.ionelmc.ro/2014/05/25/python-packaging/ _PATH_ROOT = os.path.dirname(__file__) _PATH_SRC = os.path.join(_PATH_ROOT, "src") +_PATH_REQUIRE = os.path.join(_PATH_ROOT, "requirements") _PATH_SETUP = os.path.join(_PATH_SRC, _REAL_PKG_NAME or "lightning", "__setup__.py") +_FREEZE_REQUIREMENTS = bool(int(os.environ.get("FREEZE_REQUIREMENTS", 0))) # Hardcode the env variable from time of package creation, otherwise it fails during installation @@ -89,11 +91,19 @@ def _load_py_module(name: str, location: str) -> ModuleType: # engineer specific practices if __name__ == "__main__": _SETUP_TOOLS = _load_py_module(name="setup_tools", location=os.path.join(".actions", "setup_tools.py")) + + if _PACKAGE_NAME == "lightning": # install just the meta package + _SETUP_TOOLS._adjust_require_versions(_PATH_SRC, _PATH_REQUIRE) + elif _PACKAGE_NAME not in _PACKAGE_MAPPING: # install everything + _SETUP_TOOLS._load_aggregate_requirements(_PATH_REQUIRE, _FREEZE_REQUIREMENTS) + if _PACKAGE_NAME not in _PACKAGE_MAPPING: 
_SETUP_TOOLS.set_version_today(os.path.join(_PATH_SRC, "lightning", "__version__.py")) + for lit_name, pkg_name in _PACKAGE_MAPPING.items(): # fixme: if we run creation of meta pkg against stable we shall pull the source _SETUP_TOOLS.create_meta_package(os.path.join(_PATH_ROOT, "src"), pkg_name, lit_name) + _SETUP_MODULE = _load_py_module(name="pkg_setup", location=_PATH_SETUP) _SETUP_MODULE._adjust_manifest(pkg_name=_REAL_PKG_NAME) setup(**_SETUP_MODULE._setup_args(pkg_name=_REAL_PKG_NAME)) diff --git a/src/lightning/__setup__.py b/src/lightning/__setup__.py index 6ab3118b3174d..e3ada2d7e93df 100644 --- a/src/lightning/__setup__.py +++ b/src/lightning/__setup__.py @@ -1,7 +1,5 @@ -import glob import os.path from importlib.util import module_from_spec, spec_from_file_location -from itertools import chain from types import ModuleType from typing import Any, Dict @@ -10,6 +8,7 @@ _PROJECT_ROOT = "." _SOURCE_ROOT = os.path.join(_PROJECT_ROOT, "src") _PACKAGE_ROOT = os.path.join(_SOURCE_ROOT, "lightning") +_PATH_REQUIREMENTS = os.path.join("requirements") _FREEZE_REQUIREMENTS = bool(int(os.environ.get("FREEZE_REQUIREMENTS", 0))) @@ -22,6 +21,9 @@ def _load_py_module(name: str, location: str) -> ModuleType: return py +_SETUP_TOOLS = _load_py_module("setup_tools", os.path.join(_PROJECT_ROOT, ".actions", "setup_tools.py")) + + def _adjust_manifest(**kwargs: Any) -> None: # todo: consider rather aggregation of particular manifest adjustments manifest_path = os.path.join(_PROJECT_ROOT, "MANIFEST.in") @@ -31,6 +33,7 @@ def _adjust_manifest(**kwargs: Any) -> None: if kwargs["pkg_name"] == "lightning": lines += [ "recursive-include src/lightning *.md", + "include requirements/base.txt", # fixme: this is strange, this shall work with setup find package - include "prune src/lightning_app", "prune src/pytorch_lightning", @@ -47,29 +50,17 @@ def _adjust_manifest(**kwargs: Any) -> None: def _setup_args(**kwargs: Any) -> Dict[str, Any]: - _path_setup_tools = os.path.join(_PROJECT_ROOT, ".actions", "setup_tools.py") - _setup_tools = _load_py_module("setup_tools", _path_setup_tools) _about = _load_py_module("about", os.path.join(_PACKAGE_ROOT, "__about__.py")) _version = _load_py_module("version", os.path.join(_PACKAGE_ROOT, "__version__.py")) - _long_description = _setup_tools.load_readme_description( + _long_description = _SETUP_TOOLS.load_readme_description( _PROJECT_ROOT, homepage=_about.__homepage__, version=_version.version ) - if kwargs["pkg_name"] == "lightning": - _include_pkgs = ["lightning", "lightning.*"] - # todo: generate this list automatically with parsing feature pkg versions - _requires = ["pytorch-lightning>=1.6.5", "lightning-app>=0.5.2"] - else: - _include_pkgs = ["*"] - _requires = [ - _setup_tools.load_requirements(d, unfreeze=not _FREEZE_REQUIREMENTS) - for d in glob.glob(os.path.join("requirements", "*")) - if os.path.isdir(d) - ] - _requires = list(chain(*_requires)) + _include_pkgs = ["lightning", "lightning.*"] if kwargs["pkg_name"] == "lightning" else ["*"] + # TODO: consider invaliding some additional arguments from packages, for example if include data or safe to zip # TODO: remove this once lightning-ui package is ready as a dependency - _setup_tools._download_frontend(_PROJECT_ROOT) + _SETUP_TOOLS._download_frontend(_PROJECT_ROOT) return dict( name="lightning", @@ -94,7 +85,7 @@ def _setup_args(**kwargs: Any) -> Dict[str, Any]: ], }, setup_requires=[], - install_requires=_requires, + install_requires=_SETUP_TOOLS.load_requirements(_PATH_REQUIREMENTS, unfreeze=True), 
extras_require={}, # todo: consider porting all other packages extras with prefix project_urls={ "Bug Tracker": "https://github.com/Lightning-AI/lightning/issues", From 5cbea78096993c7ea645b2001f1aa16e290cb734 Mon Sep 17 00:00:00 2001 From: edenlightning <66261195+edenlightning@users.noreply.github.com> Date: Mon, 1 Aug 2022 10:10:18 -0400 Subject: [PATCH 050/230] Docs update (#13959) --- docs/source-app/levels/basic/level_3.rst | 2 +- docs/source-app/levels/basic/level_7.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source-app/levels/basic/level_3.rst b/docs/source-app/levels/basic/level_3.rst index b2632509731eb..cf4c8def7bf3a 100644 --- a/docs/source-app/levels/basic/level_3.rst +++ b/docs/source-app/levels/basic/level_3.rst @@ -13,7 +13,7 @@ What is the Lightning Cloud? The Lightning Cloud is the platform that we've created to interface with the cloud providers. Today the Lightning Cloud supports AWS. -.. note:: Support for GCP and Azure is coming in the Fall of 2022! +.. note:: Support for GCP and Azure is coming soon! To use the Lightning Cloud, you buy credits that are used to pay the cloud providers. If you want to run on your own AWS credentials, please contact us (support@lightning.ai) so we can get your clusters set up for you. diff --git a/docs/source-app/levels/basic/level_7.rst b/docs/source-app/levels/basic/level_7.rst index 70f16e116c27c..63b1ae05018b8 100644 --- a/docs/source-app/levels/basic/level_7.rst +++ b/docs/source-app/levels/basic/level_7.rst @@ -13,7 +13,7 @@ What is the Lightning Cloud? The Lightning Cloud is the platform that we've created to interface with the cloud providers. Today the Lightning Cloud supports AWS. -.. note:: Support for GCP and Azure is coming in the Fall of 2022! +.. note:: Support for GCP and Azure is coming soon! To use the Lightning Cloud, you buy credits that are used to pay the cloud providers. If you want to run on your own AWS credentials, please contact us (support@lightning.ai) so we can get your clusters set up for you. 
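The `_adjust_require_versions` helper introduced in the packaging patch above (#13933) rewrites each entry of `requirements/base.txt` so that the `lightning` meta package is pinned to the local minor version of its sub-packages, e.g. `pytorch_lightning>=1.6.5, ==1.7.*`. A minimal sketch of that pinning step follows; it is not part of the patch, and the local version strings are assumed for illustration rather than parsed from `src/`:

def _pin_to_local_minor(requirement: str, local_version: str) -> str:
    # "1.7.0dev" -> "1.7.*": keep major.minor, wildcard the rest,
    # then append it as an extra specifier to the base requirement.
    minor_wildcard = ".".join(local_version.split(".")[:2] + ["*"])
    return f"{requirement}, =={minor_wildcard}"

local_versions = {"pytorch_lightning": "1.7.0dev", "lightning_app": "0.6.0dev"}  # assumed values
base_requirements = ["pytorch_lightning>=1.6.5", "lightning_app>=0.5.2"]

for req in base_requirements:
    # same package-name extraction as in `_adjust_require_versions`
    name = req[: min(req.index(c) for c in ">=" if c in req)]
    print(_pin_to_local_minor(req, local_versions[name]))
    # -> pytorch_lightning>=1.6.5, ==1.7.*
    # -> lightning_app>=0.5.2, ==0.6.*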
From e33d25fb281b4f5e71e8209f1a3f307ef24b45cb Mon Sep 17 00:00:00 2001 From: Laverne Henderson Date: Mon, 1 Aug 2022 07:25:40 -0700 Subject: [PATCH 051/230] Porting latest App docs update (#13680) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * PRs 909,910,911, and 912 moves last 4 commits to the private re;po to the OS repo * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix validation error * Fixes API links and validation issues * Update docs/source-app/examples/file_server/app.py Co-authored-by: Rohit Gupta * Fix Python validation errors * update * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Akihiro Nitta Co-authored-by: Adrian Wälchli Co-authored-by: Rohit Gupta Co-authored-by: thomas chaton Co-authored-by: Jirka Borovec --- .gitignore | 1 + docs/source-app/api_reference/core.rst | 26 +++++++ .../{api_reference => }/api_references.rst | 12 ++-- docs/source-app/examples/file_server/app.py | 19 ++++- .../examples/file_server/file_server.rst | 8 +-- .../file_server/file_server_content.rst | 52 +++++++------- .../file_server/file_server_step_1.rst | 48 +++++++++++-- .../file_server/file_server_step_2.rst | 54 +++++++++++--- .../file_server/file_server_step_3.rst | 46 ++++++++++-- .../file_server/file_server_step_4.rst | 71 +++++++++++++++---- .../examples/github_repo_runner/app.py | 28 ++++---- .../github_repo_runner/github_repo_runner.rst | 8 +-- .../github_repo_runner_content.rst | 44 ++++++------ .../github_repo_runner_step_1.rst | 18 ++--- .../github_repo_runner_step_2.rst | 18 ++--- .../github_repo_runner_step_3.rst | 22 +++--- .../github_repo_runner_step_4.rst | 39 +++++----- .../github_repo_runner_step_5.rst | 32 ++++----- docs/source-app/glossary/storage/drive.rst | 2 + docs/source-app/index.rst | 2 +- docs/source-app/installation.rst | 17 +---- docs/source-app/installation_win.rst | 34 +++++++++ src/lightning_app/components/python/popen.py | 2 +- src/lightning_app/components/python/tracer.py | 4 +- src/lightning_app/components/serve/gradio.py | 2 +- 25 files changed, 414 insertions(+), 195 deletions(-) create mode 100644 docs/source-app/api_reference/core.rst rename docs/source-app/{api_reference => }/api_references.rst (98%) create mode 100644 docs/source-app/installation_win.rst diff --git a/.gitignore b/.gitignore index 0f03c69600bed..719f291a492ca 100644 --- a/.gitignore +++ b/.gitignore @@ -164,3 +164,4 @@ src/lightning_app/ui/* hars* artifacts/* *docs/examples* +*docs/source-app/api* diff --git a/docs/source-app/api_reference/core.rst b/docs/source-app/api_reference/core.rst new file mode 100644 index 0000000000000..3f6863ca8a244 --- /dev/null +++ b/docs/source-app/api_reference/core.rst @@ -0,0 +1,26 @@ +:orphan: + +################## +lightning_app.core +################## + +.. contents:: + :depth: 1 + :local: + :backlinks: top + +.. currentmodule:: lightning_app.core + +Core APIs +___________________ + +.. autosummary:: + :toctree: api/ + :nosignatures: + :template: classtemplate.rst + + LightningApp + LightningFlow + LightningWork + +Learn more about :ref:`Lightning Core `. 
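The new `core.rst` page above documents the three core APIs: `LightningApp`, `LightningFlow`, and `LightningWork`. For orientation, here is a minimal, self-contained sketch of how they fit together, following the same patterns used by the apps in this patch (a Work with a `run` method, a root Flow that checks `has_succeeded` and calls `self._exit()`). The component names are illustrative only:

from lightning_app import LightningApp, LightningFlow, LightningWork


class WordPrinter(LightningWork):
    # A Work runs a (potentially long) job in its own process.
    def run(self, message: str):
        print(message)


class RootFlow(LightningFlow):
    # A Flow orchestrates Works and holds the app state.
    def __init__(self):
        super().__init__()
        self.printer = WordPrinter()

    def run(self):
        self.printer.run("hello from a LightningWork")
        if self.printer.has_succeeded:
            self._exit()


app = LightningApp(RootFlow())
# assuming this file is saved as app.py:  lightning run app app.py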
diff --git a/docs/source-app/api_reference/api_references.rst b/docs/source-app/api_references.rst similarity index 98% rename from docs/source-app/api_reference/api_references.rst rename to docs/source-app/api_references.rst index 42cfcb7aed3d5..55803457c4820 100644 --- a/docs/source-app/api_reference/api_references.rst +++ b/docs/source-app/api_references.rst @@ -5,12 +5,12 @@ Lightning App - API References ############################## Core ----- +____ .. currentmodule:: lightning_app.core .. autosummary:: - :toctree: api + :toctree: api/ :nosignatures: :template: classtemplate_no_index.rst @@ -32,10 +32,10 @@ ___________________ :nosignatures: :template: classtemplate_no_index.rst - ~serve.serve.ModelInferenceAPI ~python.popen.PopenPythonScript - ~serve.gradio.ServeGradio ~python.tracer.TracerPythonScript + ~serve.gradio.ServeGradio + ~serve.serve.ModelInferenceAPI ---- @@ -67,8 +67,8 @@ _______ :nosignatures: :template: classtemplate_no_index.rst - ~drive.Drive ~path.Path + ~drive.Drive ~payload.Payload Learn more about :ref:`Storage `. @@ -86,5 +86,5 @@ _______ :template: classtemplate_no_index.rst ~cloud.CloudRuntime - ~multiprocess.MultiProcessRuntime ~singleprocess.SingleProcessRuntime + ~multiprocess.MultiProcessRuntime diff --git a/docs/source-app/examples/file_server/app.py b/docs/source-app/examples/file_server/app.py index 29e67e7dd6958..b70d36870e944 100644 --- a/docs/source-app/examples/file_server/app.py +++ b/docs/source-app/examples/file_server/app.py @@ -10,7 +10,13 @@ class FileServer(L.LightningWork): - def __init__(self, drive: Drive, base_dir: str = "file_server", chunk_size=10240, **kwargs): + def __init__( + self, + drive: Drive, + base_dir: str = "file_server", + chunk_size=10240, + **kwargs + ): """This component uploads, downloads files to your application. Arguments: @@ -48,7 +54,9 @@ def upload_file(self, file): filename = file.filename uploaded_file = self.get_random_filename() meta_file = uploaded_file + ".meta" - self.uploaded_files[filename] = {"progress": (0, None), "done": False} + self.uploaded_files[filename] = { + "progress": (0, None), "done": False + } # 2: Create a stream and write bytes of # the file to the disk under `uploaded_file` path. @@ -155,6 +163,7 @@ def alive(self): class TestFileServer(LightningWork): + def __init__(self, drive: Drive): super().__init__(cache_calls=True) self.drive = drive @@ -164,7 +173,10 @@ def run(self, file_server_url: str, first=True): with open("test.txt", "w") as f: f.write("Some text.") - response = requests.post(file_server_url + "/upload_file/", files={"file": open("test.txt", "rb")}) + response = requests.post( + file_server_url + "/upload_file/", + files={'file': open("test.txt", 'rb')} + ) assert response.status_code == 200 else: response = requests.get(file_server_url) @@ -176,6 +188,7 @@ def run(self, file_server_url: str, first=True): class Flow(LightningFlow): + def __init__(self): super().__init__() # 1: Create a drive to share data between works diff --git a/docs/source-app/examples/file_server/file_server.rst b/docs/source-app/examples/file_server/file_server.rst index 430333a875533..ab854c17e450e 100644 --- a/docs/source-app/examples/file_server/file_server.rst +++ b/docs/source-app/examples/file_server/file_server.rst @@ -1,11 +1,11 @@ .. _fileserver_example: -################### -Build a File Server -################### +##################### +Develop a File Server +##################### -**Prerequisite**: Reach :ref:`level 16+ ` and read the `Drive article `_. 
+**Prerequisite**: Reach :ref:`level 16+ ` and read the :ref:`Drive article `. ---- diff --git a/docs/source-app/examples/file_server/file_server_content.rst b/docs/source-app/examples/file_server/file_server_content.rst index 26603e04f817d..fee5d5d4b0f13 100644 --- a/docs/source-app/examples/file_server/file_server_content.rst +++ b/docs/source-app/examples/file_server/file_server_content.rst @@ -1,34 +1,41 @@ + + ********* -Objective +Our Goal ********* -Create a simple application where users can upload files and list the uploaded files. +Create a simple Lightning App (App) that allows users to upload files and list the uploaded files. ---- -***************** -Final Application -***************** +************* +Completed App +************* -Here is a recording of the final application built in this example tested with pytest. +Here is a recording of the final App built in this example, tested with pytest. .. raw:: html - +
+ +
+
---- -************* -System Design -************* +********** +App Design +********** -In order to create such application, we need to build two components and an application: +In order to create this App, we need to develop two components and an App: -* A **File Server Component** that gives you the ability to download or list files shared with your application. This is particularly useful when you want to trigger an ML job but your users need to provide their own data or if the user wants to download the trained checkpoints. +* A **File Server Component** that gives you the ability to download or list files shared with your App. This is particularly useful when you want to trigger an ML job but your users need to provide their own data or if the user wants to download the trained checkpoints. * A **Test File Server** Component to interact with the file server. -* An application putting everything together and its associated pytest tests. +* An App putting everything together and the App's associated pytest tests. ---- @@ -36,41 +43,38 @@ In order to create such application, we need to build two components and an appl Tutorial ******** -Let's dive in on how to create such application and component: - .. raw:: html
.. displayitem:: - :header: 1. Implement the File Server general structure - :description: Put together the shape of the component + :header: Step 1: Implement the File Server general structure + :description: Put together the shape of the Component :col_css: col-md-4 :button_link: file_server_step_1.html :height: 180 :tag: Basic .. displayitem:: - :header: 2. Implement the File Server upload and list files methods - :description: Add the core functionalities to the component + :header: Step 2: Implement the File Server upload and list files methods + :description: Add the core functionalities to the Component :col_css: col-md-4 :button_link: file_server_step_2.html :height: 180 :tag: Basic .. displayitem:: - :header: 3. Implement a File Server Testing Component - :description: Create a component to test the file server + :header: Step 3: Implement a File Server Testing Component + :description: Create a Component to test the file server :col_css: col-md-4 :button_link: file_server_step_3.html :height: 180 :tag: Intermediate - .. displayitem:: - :header: 4. Implement tests for the File Server component with pytest - :description: Create an app to validate the upload and list files endpoints + :header: Step 4: Implement tests for the File Server component with pytest + :description: Create an App to validate the upload and list files endpoints :col_css: col-md-4 :button_link: file_server_step_4.html :height: 180 diff --git a/docs/source-app/examples/file_server/file_server_step_1.rst b/docs/source-app/examples/file_server/file_server_step_1.rst index 782e553e9fdd5..8703a1d443ef2 100644 --- a/docs/source-app/examples/file_server/file_server_step_1.rst +++ b/docs/source-app/examples/file_server/file_server_step_1.rst @@ -1,11 +1,49 @@ :orphan: -********************************************* -1. Implement the FileServer general structure -********************************************* +################################################## +Step 1: Implement the FileServer general structure +################################################## -Let's dive in on how to create such a component with the code below. +Let’s dive in on how to develop the component with the following code: .. literalinclude:: ./app.py - :lines: 1-44, 132-158 + :lines: 1-41, 132-158 :emphasize-lines: 16, 51- + +******** +Tutorial +******** + +.. raw:: html + +
+
+ +.. displayitem:: + :header: Step 2: Implement the File Server upload and list files methods + :description: Add the core functionalities to the Component + :col_css: col-md-4 + :button_link: file_server_step_2.html + :height: 180 + :tag: Basic + +.. displayitem:: + :header: Step 3: Implement a File Server Testing Component + :description: Create a Component to test the file server + :col_css: col-md-4 + :button_link: file_server_step_3.html + :height: 180 + :tag: Intermediate + +.. displayitem:: + :header: Step 4: Implement tests for the File Server component with pytest + :description: Create an App to validate the upload and list files endpoints + :col_css: col-md-4 + :button_link: file_server_step_4.html + :height: 180 + :tag: Intermediate + +.. raw:: html + +
+
diff --git a/docs/source-app/examples/file_server/file_server_step_2.rst b/docs/source-app/examples/file_server/file_server_step_2.rst index 668b01b1771d1..d3bd199ceacdb 100644 --- a/docs/source-app/examples/file_server/file_server_step_2.rst +++ b/docs/source-app/examples/file_server/file_server_step_2.rst @@ -1,10 +1,10 @@ :orphan: -********************************************************** -2. Implement the File Server upload and list_files methods -********************************************************** +################################################################ +Step 2: Implement the File Server upload and list_files methods +################################################################ -Let's dive in on how to implement such methods. +Let's dive in on how to implement these methods. *************************** Implement the upload method @@ -12,10 +12,10 @@ Implement the upload method In this method, we are creating a stream between the uploaded file and the uploaded file stored on the file server disk. -Once the file is uploaded, we are putting the file into the :class:`~lightning_app.storage.drive.Drive`, so it becomes persistent and accessible to all components. +Once the file is uploaded, we are putting the file into the :class:`~lightning_app.storage.drive.Drive`, so it becomes persistent and accessible to all Components. .. literalinclude:: ./app.py - :lines: 13, 52-100 + :lines: 12, 51-99 :emphasize-lines: 49 ******************************* @@ -25,7 +25,7 @@ Implement the fist_files method First, in this method, we get the file in the file server filesystem, if available in the Drive. Once done, we list the the files under the provided paths and return the results. .. literalinclude:: ./app.py - :lines: 13, 101-131 + :lines: 12, 100-130 :emphasize-lines: 9 @@ -34,4 +34,42 @@ Implement utilities ******************* .. literalinclude:: ./app.py - :lines: 13, 46-51 + :lines: 12, 43-49 + +******** +Tutorial +******** + +.. raw:: html + +
+
+ +.. displayitem:: + :header: Step 1: Implement the File Server general structure + :description: Put together the shape of the Component + :col_css: col-md-4 + :button_link: file_server_step_1.html + :height: 180 + :tag: Basic + +.. displayitem:: + :header: Step 3: Implement a File Server Testing Component + :description: Create a Component to test the file server + :col_css: col-md-4 + :button_link: file_server_step_3.html + :height: 180 + :tag: Intermediate + +.. displayitem:: + :header: Step 4: Implement tests for the File Server component with pytest + :description: Create an App to validate the upload and list files endpoints + :col_css: col-md-4 + :button_link: file_server_step_4.html + :height: 180 + :tag: Intermediate + +.. raw:: html + +
+
diff --git a/docs/source-app/examples/file_server/file_server_step_3.rst b/docs/source-app/examples/file_server/file_server_step_3.rst index 97b524a978ea3..4703ef0750f1e 100644 --- a/docs/source-app/examples/file_server/file_server_step_3.rst +++ b/docs/source-app/examples/file_server/file_server_step_3.rst @@ -1,8 +1,8 @@ :orphan: -******************************************** -3. Implement a File Server Testing Component -******************************************** +################################################# +Step 3: Implement a File Server Testing Component +################################################# Let's dive in on how to implement a testing component for a server. @@ -13,4 +13,42 @@ This component needs to test two things: * The **/** endpoint listing files, by validating the that previously uploaded file is present in the response. .. literalinclude:: ./app.py - :lines: 161-183 + :lines: 165-182 + +******** +Tutorial +******** + +.. raw:: html + +
+
+ +.. displayitem:: + :header: Step 1: Implement the File Server general structure + :description: Put together the shape of the Component + :col_css: col-md-4 + :button_link: file_server_step_1.html + :height: 180 + :tag: Basic + +.. displayitem:: + :header: Step 2: Implement the File Server upload and list files methods + :description: Add the core functionalities to the Component + :col_css: col-md-4 + :button_link: file_server_step_2.html + :height: 180 + :tag: Basic + +.. displayitem:: + :header: Step 4: Implement tests for the File Server component with pytest + :description: Create an App to validate the upload and list files endpoints + :col_css: col-md-4 + :button_link: file_server_step_4.html + :height: 180 + :tag: Intermediate + +.. raw:: html + +
+
diff --git a/docs/source-app/examples/file_server/file_server_step_4.rst b/docs/source-app/examples/file_server/file_server_step_4.rst index 06d9e051dc0cb..70930b64fccd6 100644 --- a/docs/source-app/examples/file_server/file_server_step_4.rst +++ b/docs/source-app/examples/file_server/file_server_step_4.rst @@ -1,26 +1,27 @@ :orphan: -************************************************************ -4. Implement tests for the File Server component with pytest -************************************************************ +################################################################# +Step 4: Implement tests for the File Server component with pytest +################################################################# -Let's create a simple Lightning App (App) with our **File Server** and the **File Server Test** components. +Let's create a simple App with our **File Server** and **File Server Test** components. Once the File Server is up and running, we'll execute the **test_file_server** LightningWork and when both calls are successful, we exit the App using ``self._exit``. .. literalinclude:: ./app.py - :lines: 186-216 + :lines: 187-218 -Simply create a ``test.py`` file with the following code and run ``pytest tests.py`` +Simply create a ``test.py`` file with the following code and run ``pytest tests.py``: .. literalinclude:: ./app.py - :lines: 218-222 + :lines: 221-226 -To test the App in the cloud, create a ``cloud_test.py`` file with the following code and run ``pytest cloud_test.py``. Under the hood, we are using the end-to-end testing `playwright `_ library so you can interact with the UI. +To test the App in the cloud, create a ``cloud_test.py`` file with the following code and run ``pytest cloud_test.py``. +Under the hood, we are using the end-to-end testing `playwright `_ library, so you can interact with the UI. .. literalinclude:: ./app.py - :lines: 224- + :lines: 229- ---- @@ -28,7 +29,7 @@ To test the App in the cloud, create a ``cloud_test.py`` file with the following Test the application ******************** -Clone the lightning repo and run the following command: +Clone the Lightning repo and run the following command: .. code-block:: bash @@ -36,6 +37,46 @@ Clone the lightning repo and run the following command: ---- +******** +Tutorial +******** + +.. raw:: html + +
+
+ +.. displayitem:: + :header: Step 1: Implement the File Server general structure + :description: Put together the shape of the Component + :col_css: col-md-4 + :button_link: file_server_step_1.html + :height: 180 + :tag: Basic + +.. displayitem:: + :header: Step 2: Implement the File Server upload and list files methods + :description: Add the core functionalities to the Component + :col_css: col-md-4 + :button_link: file_server_step_2.html + :height: 180 + :tag: Basic + +.. displayitem:: + :header: Step 3: Implement a File Server Testing Component + :description: Create a Component to test the file server + :col_css: col-md-4 + :button_link: file_server_step_3.html + :height: 180 + :tag: Intermediate + +.. raw:: html + +
+
+ +---- + ****************** Find more examples ****************** @@ -48,7 +89,7 @@ Find more examples .. Add callout items below this line .. displayitem:: - :header: Build a DAG + :header: Develop a DAG :description: Create a dag pipeline :col_css: col-md-4 :button_link: ../dag/dag.html @@ -56,7 +97,7 @@ Find more examples :tag: Intermediate .. displayitem:: - :header: Build a Github Repo Script Runner + :header: Develop a Github Repo Script Runner :description: Run any script on github in the cloud :col_css: col-md-4 :button_link: ../github_repo_runner/github_repo_runner.html @@ -65,7 +106,7 @@ Find more examples .. displayitem:: - :header: Build a HPO Sweeper + :header: Develop a HPO Sweeper :description: Train multiple models with different parameters :col_css: col-md-4 :button_link: ../hpo/hpo.html @@ -73,10 +114,10 @@ Find more examples :tag: Intermediate .. displayitem:: - :header: Build a Model Server + :header: Develop a Model Server :description: Serve multiple models with different parameters :col_css: col-md-4 - :button_link: ../model_server/model_server.html + :button_link: ../model_server_app/model_server_app.html :height: 150 :tag: Intermediate diff --git a/docs/source-app/examples/github_repo_runner/app.py b/docs/source-app/examples/github_repo_runner/app.py index 90cb1a993db06..245692edbc1fb 100644 --- a/docs/source-app/examples/github_repo_runner/app.py +++ b/docs/source-app/examples/github_repo_runner/app.py @@ -5,7 +5,7 @@ from copy import deepcopy from functools import partial from subprocess import Popen -from typing import Any, Dict, List, Optional +from typing import Dict, List, Optional from lightning import BuildConfig, CloudCompute, LightningApp, LightningFlow from lightning.app import structures @@ -24,7 +24,7 @@ def __init__( script_args: List[str], requirements: List[str], cloud_compute: Optional[CloudCompute] = None, - **kwargs: Any, + **kwargs, ): """The GithubRepoRunner Component clones a repo, runs a specific script with provided arguments and collect logs. @@ -56,7 +56,8 @@ def run(self, *args, **kwargs): # 2: Use git command line to clone the repo. repo_name = self.github_repo.split("/")[-1].replace(".git", "") cwd = os.path.dirname(__file__) - subprocess.Popen(f"git clone {self.github_repo}", cwd=cwd, shell=True).wait() + subprocess.Popen( + f"git clone {self.github_repo}", cwd=cwd, shell=True).wait() # 3: Execute the parent run method of the TracerPythonScript class. os.chdir(os.path.join(cwd, repo_name)) @@ -72,6 +73,7 @@ def configure_layout(self): class PyTorchLightningGithubRepoRunner(GithubRepoRunner): + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.best_model_path = None @@ -103,7 +105,8 @@ def trainer_pre_fn(self, *args, work=None, **kwargs): # 5. Patch the `__init__` method of the Trainer # to inject our callback with a reference to the work. 
- tracer.add_traced(Trainer, "__init__", pre_fn=partial(trainer_pre_fn, work=self)) + tracer.add_traced( + Trainer, "__init__", pre_fn=partial(trainer_pre_fn, work=self)) return tracer def on_after_run(self, end_script_globals): @@ -210,7 +213,9 @@ def page_1__create_new_run(state): script_path = st.text_input("Enter your script to run", value="train_script.py") script_args = st.text_input("Enter your base script arguments", value=default_script_args) requirements = st.text_input("Enter your requirements", value=default_requirements) - ml_framework = st.radio("Select your ML Training Frameworks", options=["PyTorch Lightning", "Keras", "Tensorflow"]) + ml_framework = st.radio( + "Select your ML Training Frameworks", options=["PyTorch Lightning", "Keras", "Tensorflow"] + ) if ml_framework not in ("PyTorch Lightning"): st.write(f"{ml_framework} isn't supported yet.") @@ -274,7 +279,8 @@ def render_fn(state: AppState): "View your Runs": partial(page_2__view_run_lists, state=state), "View the App state": partial(page_3__view_app_state, state=state), } - selected_page = st.sidebar.selectbox("Select a page", page_names_to_funcs.keys()) + selected_page = st.sidebar.selectbox( + "Select a page", page_names_to_funcs.keys()) page_names_to_funcs[selected_page]() @@ -290,12 +296,10 @@ def run(self): def configure_layout(self): # 1: Add the main StreamLit UI - selection_tab = [ - { - "name": "Run your Github Repo", - "content": self.flow, - } - ] + selection_tab = [{ + "name": "Run your Github Repo", + "content": self.flow, + }] # 2: Add a new tab whenever a new work is dynamically created run_tabs = [e.configure_layout() for e in self.flow.ws.values()] # 3: Returns the list of tabs. diff --git a/docs/source-app/examples/github_repo_runner/github_repo_runner.rst b/docs/source-app/examples/github_repo_runner/github_repo_runner.rst index affb115b74b5e..cd420f5c5db27 100644 --- a/docs/source-app/examples/github_repo_runner/github_repo_runner.rst +++ b/docs/source-app/examples/github_repo_runner/github_repo_runner.rst @@ -1,10 +1,10 @@ .. _github_repo_script_runner_example: -################################# -Build a Github Repo Script Runner -################################# +################################### +Develop a Github Repo Script Runner +################################### -**Audience:** Users that want to create interactive applications which runs Github Repo in the cloud at any scale for multiple users. +**Audience:** Users that want to develop interactive applications which runs Github Repo in the cloud at any scale for multiple users. **Prerequisite**: Reach :ref:`level 16+ ` and read the docstring of of :class:`~lightning_app.components.python.tracer.TracerPythonScript` component. diff --git a/docs/source-app/examples/github_repo_runner/github_repo_runner_content.rst b/docs/source-app/examples/github_repo_runner/github_repo_runner_content.rst index 335b21ce7f601..79c060eed9089 100644 --- a/docs/source-app/examples/github_repo_runner/github_repo_runner_content.rst +++ b/docs/source-app/examples/github_repo_runner/github_repo_runner_content.rst @@ -1,33 +1,35 @@ -********* -Objective -********* +******** +Our Goal +******** -Create a simple application where users can enter information in a UI to run a given PyTorch Lightning Script from a given Github Repo with optionally some extra python requirements and arguments. 
+Create a simple Lightning App (App) where users can enter information in a UI to run a given PyTorch Lightning Script from a given Github Repo with some optional extra Python requirements and arguments. -Furthermore, the users should be able to monitor their training progress in real-time, view the logs, and get the best-monitored metric and associated checkpoint for their models. +Users should be able to monitor their training progress in real-time, view the logs, and get the best monitored metric and associated checkpoint for their models. ---- -***************** -Final Application -***************** +Completed App +^^^^^^^^^^^^^ Here is a recording of the final application built in this example. The example is around 200 lines in total and should give you a great foundation to build your own Lightning App. .. raw:: html - +
+ +
+
---- -************* -System Design -************* +********** +App Design +********** -In order to create such application, we need to build several components: +In order to develop the App, we need to build several components: * A GithubRepoRunner Component that clones a repo, runs a specific script with provided arguments and collect logs. @@ -37,8 +39,6 @@ In order to create such application, we need to build several components: * A Flow to dynamically create GithubRepoRunner once a user submits information from the UI. -Let's dive in on how to create such a component. - ---- ******** @@ -51,7 +51,7 @@ Tutorial
.. displayitem:: - :header: 1. Implement the GithubRepoRunner Component + :header: Step 1: Implement the GithubRepoRunner Component :description: Clone and execute script from a GitHub Repo. :col_css: col-md-4 :button_link: github_repo_runner_step_1.html @@ -59,7 +59,7 @@ Tutorial :tag: Intermediate .. displayitem:: - :header: 2. Implement the PyTorch Lightning GithubRepoRunner Component + :header: Step 2: Implement the PyTorch Lightning GithubRepoRunner Component :description: Automate PyTorch Lightning execution :col_css: col-md-4 :button_link: github_repo_runner_step_2.html @@ -67,7 +67,7 @@ Tutorial :tag: Advanced .. displayitem:: - :header: 3. Implement the Flow to manage user requests + :header: Step 3: Implement the Flow to manage user requests :description: Dynamically create GithubRepoRunner :col_css: col-md-4 :button_link: github_repo_runner_step_3.html @@ -76,7 +76,7 @@ Tutorial .. displayitem:: - :header: 4. Implement the UI with StreamLit + :header: Step 4: Implement the UI with StreamLit :description: Several pages application :col_css: col-md-4 :button_link: github_repo_runner_step_4.html @@ -85,7 +85,7 @@ Tutorial .. displayitem:: - :header: 5. Putting everything together + :header: Step 5: Put it all together :description: :col_css: col-md-4 :button_link: github_repo_runner_step_5.html diff --git a/docs/source-app/examples/github_repo_runner/github_repo_runner_step_1.rst b/docs/source-app/examples/github_repo_runner/github_repo_runner_step_1.rst index 3a683501fa3da..e85ecc9da6b95 100644 --- a/docs/source-app/examples/github_repo_runner/github_repo_runner_step_1.rst +++ b/docs/source-app/examples/github_repo_runner/github_repo_runner_step_1.rst @@ -1,12 +1,12 @@ :orphan: -******************************************* -1. Implement the GithubRepoRunner Component -******************************************* +************************************************ +Step 1: Implement the GithubRepoRunner Component +************************************************ -The GithubRepoRunner Component clones a repo, runs a specific script with provided arguments and collect logs. +The GithubRepoRunner Component clones a repo, runs a specific script with provided arguments, and collect logs. -Let's dive in on how to create such a component with the code below. +Let's dive in on how to develop the component with the following code: .. literalinclude:: ./app.py :lines: -72 @@ -23,7 +23,7 @@ Tutorial
.. displayitem:: - :header: 2. Implement the PyTorch Lightning GithubRepoRunner Component + :header: Step 2: Implement the PyTorch Lightning GithubRepoRunner Component :description: Automate PyTorch Lightning execution :col_css: col-md-4 :button_link: github_repo_runner_step_2.html @@ -31,7 +31,7 @@ Tutorial :tag: Advanced .. displayitem:: - :header: 3. Implement the Flow to manage user requests + :header: Step 3: Implement the Flow to manage user requests :description: Dynamically create GithubRepoRunner :col_css: col-md-4 :button_link: github_repo_runner_step_3.html @@ -40,7 +40,7 @@ Tutorial .. displayitem:: - :header: 4. Implement the UI with StreamLit + :header: Step 4: Implement the UI with StreamLit :description: Several pages application :col_css: col-md-4 :button_link: github_repo_runner_step_4.html @@ -49,7 +49,7 @@ Tutorial .. displayitem:: - :header: 5. Putting everything together + :header: Step 5: Put it all together :description: :col_css: col-md-4 :button_link: github_repo_runner_step_5.html diff --git a/docs/source-app/examples/github_repo_runner/github_repo_runner_step_2.rst b/docs/source-app/examples/github_repo_runner/github_repo_runner_step_2.rst index c0825fa8eafa4..387709bdf13c9 100644 --- a/docs/source-app/examples/github_repo_runner/github_repo_runner_step_2.rst +++ b/docs/source-app/examples/github_repo_runner/github_repo_runner_step_2.rst @@ -1,8 +1,8 @@ :orphan: -************************************************************* -2. Implement the PyTorch Lightning GithubRepoRunner Component -************************************************************* +****************************************************************** +Step 2: Implement the PyTorch Lightning GithubRepoRunner Component +****************************************************************** The PyTorch Lightning GithubRepoRunner Component subclasses the GithubRepoRunner but tailors the execution experience to PyTorch Lightning. @@ -10,9 +10,9 @@ As a matter of fact, this component adds two primary tailored features for PyTor * It injects dynamically a custom callback ``TensorboardServerLauncher`` in the PyTorch Lightning Trainer to start a tensorboard server so it can be exposed in Lightning App UI. -* Once the script has run, the ``on_after_run`` hook of the :class:`~lightning_app.components.python.tracer.TracerPythonScript` is invoked with the script globals, meaning we can collect anything we need. In particular, we are reloading the best model, torch scripting it, and storing its path in the state alongside the best metric score. +* Once the script has run, the ``on_after_run`` hook of the :class:`~lightning_app.components.python.tracer.TracerPythonScript` is invoked with the script globals, meaning we can collect anything we need. In particular, we are reloading the best model, torch scripting it, and storing its path in the state along side the best metric score. -Let's dive in on how to create such a component with the code below. +Let's dive in on how to develop the component with the following code: .. literalinclude:: ./app.py :lines: 75-136 @@ -29,7 +29,7 @@ Tutorial
.. displayitem:: - :header: 1. Implement the GithubRepoRunner Component + :header: Step 1: Implement the GithubRepoRunner Component :description: Clone and execute script from a GitHub Repo. :col_css: col-md-4 :button_link: github_repo_runner_step_1.html @@ -37,7 +37,7 @@ Tutorial :tag: Intermediate .. displayitem:: - :header: 3. Implement the Flow to manage user requests + :header: Step 3: Implement the Flow to manage user requests :description: Dynamically create GithubRepoRunner :col_css: col-md-4 :button_link: github_repo_runner_step_3.html @@ -46,7 +46,7 @@ Tutorial .. displayitem:: - :header: 4. Implement the UI with StreamLit + :header: Step 4: Implement the UI with StreamLit :description: Several pages application :col_css: col-md-4 :button_link: github_repo_runner_step_4.html @@ -55,7 +55,7 @@ Tutorial .. displayitem:: - :header: 5. Putting everything together + :header: Step 5: Put it all together :description: :col_css: col-md-4 :button_link: github_repo_runner_step_5.html diff --git a/docs/source-app/examples/github_repo_runner/github_repo_runner_step_3.rst b/docs/source-app/examples/github_repo_runner/github_repo_runner_step_3.rst index fc1b3116beb8d..44cf7dd5a6523 100644 --- a/docs/source-app/examples/github_repo_runner/github_repo_runner_step_3.rst +++ b/docs/source-app/examples/github_repo_runner/github_repo_runner_step_3.rst @@ -1,16 +1,16 @@ :orphan: -********************************************* -3. Implement the Flow to manage user requests -********************************************* +************************************************** +Step 3: Implement the Flow to manage user requests +************************************************** -In step 1 and 2, we have implemented ``GithubRepoRunner`` and ``PyTorchLightningGithubRepoRunner`` components. +In step 1 and 2, we have implemented the ``GithubRepoRunner`` and ``PyTorchLightningGithubRepoRunner`` components. -Now, we are going to create a component to dynamically handle user requests. -Let's dive in on how to create such a component with the code below. +Now, we are going to develop a component to dynamically handle user requests. +Let's dive in on how to develop the component with the following code: .. literalinclude:: ./app.py - :lines: 138-187 + :lines: 142-190 ---- @@ -24,7 +24,7 @@ Tutorial
.. displayitem:: - :header: 1. Implement the GithubRepoRunner Component + :header: Step 1: Implement the GithubRepoRunner Component :description: Clone and execute script from a GitHub Repo. :col_css: col-md-4 :button_link: github_repo_runner_step_1.html @@ -32,7 +32,7 @@ Tutorial :tag: Intermediate .. displayitem:: - :header: 2. Implement the PyTorch Lightning GithubRepoRunner Component + :header: Step 2: Implement the PyTorch Lightning GithubRepoRunner Component :description: Automate PyTorch Lightning execution :col_css: col-md-4 :button_link: github_repo_runner_step_2.html @@ -40,7 +40,7 @@ Tutorial :tag: Advanced .. displayitem:: - :header: 4. Implement the UI with StreamLit + :header: Step 4: Implement the UI with StreamLit :description: Several pages application :col_css: col-md-4 :button_link: github_repo_runner_step_4.html @@ -49,7 +49,7 @@ Tutorial .. displayitem:: - :header: 5. Putting everything together + :header: Step 5: Put it all together :description: :col_css: col-md-4 :button_link: github_repo_runner_step_5.html diff --git a/docs/source-app/examples/github_repo_runner/github_repo_runner_step_4.rst b/docs/source-app/examples/github_repo_runner/github_repo_runner_step_4.rst index 2716adbaf8328..16893aafee183 100644 --- a/docs/source-app/examples/github_repo_runner/github_repo_runner_step_4.rst +++ b/docs/source-app/examples/github_repo_runner/github_repo_runner_step_4.rst @@ -1,48 +1,41 @@ :orphan: -********************************** -4. Implement the UI with StreamLit -********************************** +*************************************** +Step 4: Implement the UI with StreamLit +*************************************** -In step 3, we have implemented a flow that dynamically creates a Work when a new request is added to the requests list. +In step 3, we have implemented a Flow which dynamically creates a Work when a new request is added to the requests list. From the UI, we create 3 pages with `StreamLit `_: -* **Page 1**: Create a form to add a new request to the flow state **requests**. +* **Page 1**: Create a form with add a new request to the Flow state **requests**. -* **Page 2**: Iterate through all the requests and display associated information. +* **Page 2**: Iterate through all the requests and display the associated information. * **Page 3**: Display the entire App State. -**************** + Render All Pages -**************** +^^^^^^^^^^^^^^^^ .. literalinclude:: ./app.py - :lines: 263-274 - + :lines: 274-284 -****** -Page 1 -****** +**Page 1** .. literalinclude:: ./app.py - :lines: 189-231 + :lines: 193-241 :emphasize-lines: 43 -****** -Page 2 -****** +**Page 2** .. literalinclude:: ./app.py - :lines: 233-255 + :lines: 244-264 -****** -Page 3 -****** +**Page 3** .. literalinclude:: ./app.py - :lines: 257-261 + :lines: 267-271 ---- @@ -80,7 +73,7 @@ Tutorial :tag: Intermediate .. displayitem:: - :header: 5. Putting everything together + :header: Step 5: Put it all together :description: :col_css: col-md-4 :button_link: github_repo_runner_step_5.html diff --git a/docs/source-app/examples/github_repo_runner/github_repo_runner_step_5.rst b/docs/source-app/examples/github_repo_runner/github_repo_runner_step_5.rst index bdad9523323d9..a57ee40475200 100644 --- a/docs/source-app/examples/github_repo_runner/github_repo_runner_step_5.rst +++ b/docs/source-app/examples/github_repo_runner/github_repo_runner_step_5.rst @@ -1,26 +1,24 @@ :orphan: -****************************** -5. 
Putting everything together -****************************** +*************************** +Step 5: Put it all together +*************************** -Let's dive in on how to create such a component with the code below. +Let's dive in on how to develop the component with the following code: .. literalinclude:: ./app.py - :lines: 277- + :lines: 287- - -******************* Run the application -******************* +^^^^^^^^^^^^^^^^^^^ -Clone the lightning repo and run the following command: +Clone the Lightning repo and run the following command: .. code-block:: bash lightning run app docs/source-app/examples/github_repo_runner/app.py -Add **--cloud** to run this application in the cloud. +Add ``--cloud`` to run this application in the cloud. .. code-block:: bash @@ -28,9 +26,9 @@ Add **--cloud** to run this application in the cloud. ---- -****************** -Find more examples -****************** +********************** +More hands-on examples +********************** .. raw:: html @@ -40,7 +38,7 @@ Find more examples .. Add callout items below this line .. displayitem:: - :header: Build a DAG + :header: Develop a DAG :description: Create a dag pipeline :col_css: col-md-4 :button_link: ../dag/dag.html @@ -48,7 +46,7 @@ Find more examples :tag: Intermediate .. displayitem:: - :header: Build a File Server + :header: Develop a File Server :description: Train multiple models with different parameters :col_css: col-md-4 :button_link: ../file_server/file_server.html @@ -56,7 +54,7 @@ Find more examples :tag: Intermediate .. displayitem:: - :header: Build a HPO Sweeper + :header: Develop a HPO Sweeper :description: Train multiple models with different parameters :col_css: col-md-4 :button_link: ../hpo/hpo.html @@ -64,7 +62,7 @@ Find more examples :tag: Intermediate .. displayitem:: - :header: Build a Model Server + :header: Develop a Model Server :description: Serve multiple models with different parameters :col_css: col-md-4 :button_link: ../model_server/model_server.html diff --git a/docs/source-app/glossary/storage/drive.rst b/docs/source-app/glossary/storage/drive.rst index e500e087cea8f..6bb5b5033a83e 100644 --- a/docs/source-app/glossary/storage/drive.rst +++ b/docs/source-app/glossary/storage/drive.rst @@ -1,5 +1,7 @@ :orphan: +.. _drive_storage: + ############# Drive Storage ############# diff --git a/docs/source-app/index.rst b/docs/source-app/index.rst index 29e03b1327ad9..364f4034e5ae1 100644 --- a/docs/source-app/index.rst +++ b/docs/source-app/index.rst @@ -143,7 +143,7 @@ Keep Learning :header: API Reference :description: Detailed description of each API package :col_css: col-md-6 - :button_link: api_reference/api_references.html + :button_link: api_references.html :height: 180 .. displayitem:: diff --git a/docs/source-app/installation.rst b/docs/source-app/installation.rst index 7f243e170e188..1828e30cb49c4 100644 --- a/docs/source-app/installation.rst +++ b/docs/source-app/installation.rst @@ -1,6 +1,7 @@ .. _install: + ############ Installation ############ @@ -11,12 +12,8 @@ Don't know what this is? Follow our `beginner guide here ` **Requirements** * Python 3.8.x or later (3.8.x, 3.9.x, 3.10.x) -* Pip (the latest versions of Python will already have this) -* Git -* PyTorch - https://pytorch.org/get-started/locally/ -* Setup an alias for Python: python=python3 -* Add the root folder of Lightning to the Environment Variables to PATH -* Install Z shell (zsh) (This is required for Windows to install the quickstart app) + +Or read the `Windows installation article `_. 
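As a side note to the installation docs being patched here: they recommend creating and activating a virtual environment but do not show a command for it. The following is only a sketch of one common way to do that with Python's built-in ``venv`` module (the tool and the environment name ``lightning-env`` are assumptions, not part of the official docs); the install command itself is the one the docs use.

.. code-block:: bash

    # create and activate a virtual environment (any tool works; venv is assumed here)
    python -m venv lightning-env
    source lightning-env/bin/activate   # on Windows: lightning-env\Scripts\activate

    # install the package as shown in the installation docs
    python -m pip install -U lightning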
---- @@ -26,16 +23,8 @@ Install with pip 0. Activate your virtual environment. - .. raw:: html - -
 - 1. Install the ``lightning`` package - .. raw:: html - -
- .. code:: bash python -m pip install -U lightning diff --git a/docs/source-app/installation_win.rst b/docs/source-app/installation_win.rst new file mode 100644 index 0000000000000..ff08cc1945da3 --- /dev/null +++ b/docs/source-app/installation_win.rst @@ -0,0 +1,34 @@ +:orphan: + +####################### +Installation on Windows +####################### + +We strongly recommend to create a virtual environment first. +Don't know what this is? Follow our `beginner guide here `_. + +Windows environments might need a little more tweaking before you install. + +**Requirements** + +* Python 3.8.x or later (3.8.x, 3.9.x, 3.10.x) +* Pip (the latest versions of Python will already have this) +* Git +* PyTorch - https://pytorch.org/get-started/locally/ +* Setup an alias for Python: python=python3 +* Add the root folder of Lightning to the Environment Variables to PATH +* Install Z shell (zsh) (This is required for Windows to install the quickstart app) + +---- + +**************** +Install with pip +**************** + +0. Activate your virtual environment. + +1. Install the ``lightning`` package + + .. code:: bash + + python -m pip install -U lightning diff --git a/src/lightning_app/components/python/popen.py b/src/lightning_app/components/python/popen.py index 553795d9617de..7efc2b6d83c61 100644 --- a/src/lightning_app/components/python/popen.py +++ b/src/lightning_app/components/python/popen.py @@ -56,7 +56,7 @@ def __init__( In this example, the script will be launch with the :class:`~subprocess.Popen`. - .. literalinclude:: ../../../../examples/app_components/python/component_popen.py + .. literalinclude:: ../../../examples/app_components/python/component_popen.py :language: python """ super().__init__(**kwargs) diff --git a/src/lightning_app/components/python/tracer.py b/src/lightning_app/components/python/tracer.py index b98c782e138e4..3d5bde7fc718b 100644 --- a/src/lightning_app/components/python/tracer.py +++ b/src/lightning_app/components/python/tracer.py @@ -91,14 +91,14 @@ def __init__( Even more interesting, this component works for ANY PyTorch Lightning script and its state can be used in real time in a UI. - .. literalinclude:: ../../../../examples/app_components/python/component_tracer.py + .. literalinclude:: ../../../examples/app_components/python/component_tracer.py :language: python Once implemented, this component can easily be integrated within a larger app to execute a specific python script. - .. literalinclude:: ../../../../examples/app_components/python/app.py + .. literalinclude:: ../../../examples/app_components/python/app.py :language: python """ super().__init__(**kwargs) diff --git a/src/lightning_app/components/serve/gradio.py b/src/lightning_app/components/serve/gradio.py index ecbd583020354..70044ce43dfa8 100644 --- a/src/lightning_app/components/serve/gradio.py +++ b/src/lightning_app/components/serve/gradio.py @@ -18,7 +18,7 @@ class ServeGradio(LightningWork, abc.ABC): In the example below, the ``ServeGradio`` is subclassed to deploy ``AnimeGANv2``. - .. literalinclude:: ../../../../examples/app_components/serve/gradio/app.py + .. 
literalinclude:: ../../../examples/app_components/serve/gradio/app.py :language: python The result would be the following: From e195c85407c7d414a9e8f03796c81cde6f5ef705 Mon Sep 17 00:00:00 2001 From: Ross Johnstone Date: Mon, 1 Aug 2022 23:30:18 +0900 Subject: [PATCH 052/230] Fix some tiny typos in docs (#13939) --- src/lightning_app/components/python/tracer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/lightning_app/components/python/tracer.py b/src/lightning_app/components/python/tracer.py index 3d5bde7fc718b..abc4609e044ef 100644 --- a/src/lightning_app/components/python/tracer.py +++ b/src/lightning_app/components/python/tracer.py @@ -45,9 +45,9 @@ def __init__( """The TracerPythonScript class enables to easily run a python script. When subclassing this class, you can configure your own :class:`~lightning_app.utilities.tracer.Tracer` - by :meth:`~lightning_app.components.python.tracer.TracerPythonScript.configure_tracer` method + by :meth:`~lightning_app.components.python.tracer.TracerPythonScript.configure_tracer` method. - The tracer is quite a magical class. It enables you to inject core into a script execution without changing it. + The tracer is quite a magical class. It enables you to inject code into a script execution without changing it. Arguments: script_path: Path of the python script to run. @@ -59,12 +59,12 @@ def __init__( Raises: FileNotFoundError: If the provided `script_path` doesn't exists. - **How does it works ?** + **How does it work?** It works by executing the python script with python built-in `runpy `_ run_path method. This method takes any python globals before executing the script, - e.g you can modify classes or function from the script. + e.g., you can modify classes or function from the script. .. doctest:: From 98f7326683096ba880822519fddf69b868783ff1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 1 Aug 2022 16:59:48 +0000 Subject: [PATCH 053/230] Bump actions/setup-python from 2 to 4 (#13952) Bumps [actions/setup-python](https://github.com/actions/setup-python) from 2 to 4. - [Release notes](https://github.com/actions/setup-python/releases) - [Commits](https://github.com/actions/setup-python/compare/v2...v4) --- updated-dependencies: - dependency-name: actions/setup-python dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/ci-app_cloud_e2e_test.yml | 4 ++-- .github/workflows/ci-app_examples.yml | 2 +- .github/workflows/ci-app_tests.yml | 2 +- .github/workflows/ci-pytorch_test-full.yml | 2 +- .github/workflows/ci-pytorch_test-slow.yml | 2 +- .github/workflows/ci_pkg-install.yml | 6 +++--- .github/workflows/code-checks.yml | 2 +- .github/workflows/docs-checks.yml | 4 ++-- .github/workflows/docs-deploy.yml | 2 +- .github/workflows/events-nightly.yml | 2 +- .github/workflows/legacy-checkpoints.yml | 2 +- .github/workflows/release-pypi.yml | 10 +++++----- 12 files changed, 20 insertions(+), 20 deletions(-) diff --git a/.github/workflows/ci-app_cloud_e2e_test.yml b/.github/workflows/ci-app_cloud_e2e_test.yml index cb0fbdf40a9e0..6ecfaa4e212f5 100644 --- a/.github/workflows/ci-app_cloud_e2e_test.yml +++ b/.github/workflows/ci-app_cloud_e2e_test.yml @@ -25,7 +25,7 @@ jobs: steps: - uses: actions/checkout@v2 - name: Set up Python 3.8 - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: "3.8" @@ -59,7 +59,7 @@ jobs: steps: - uses: actions/checkout@v2 - name: Set up Python 3.8 - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: "3.8" diff --git a/.github/workflows/ci-app_examples.yml b/.github/workflows/ci-app_examples.yml index 0e3559f3551e2..ec8becd5f70d1 100644 --- a/.github/workflows/ci-app_examples.yml +++ b/.github/workflows/ci-app_examples.yml @@ -27,7 +27,7 @@ jobs: steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/ci-app_tests.yml b/.github/workflows/ci-app_tests.yml index 1dd6315811673..1678dab257301 100644 --- a/.github/workflows/ci-app_tests.yml +++ b/.github/workflows/ci-app_tests.yml @@ -31,7 +31,7 @@ jobs: steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/ci-pytorch_test-full.yml b/.github/workflows/ci-pytorch_test-full.yml index 972753f3d8f12..3e96dd22fe702 100644 --- a/.github/workflows/ci-pytorch_test-full.yml +++ b/.github/workflows/ci-pytorch_test-full.yml @@ -55,7 +55,7 @@ jobs: - name: Set up Python ${{ matrix.python-version }} if: ${{ (steps.skip.outputs.continue == '1') }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/ci-pytorch_test-slow.yml b/.github/workflows/ci-pytorch_test-slow.yml index 47bf8f502573c..9de5687bba829 100644 --- a/.github/workflows/ci-pytorch_test-slow.yml +++ b/.github/workflows/ci-pytorch_test-slow.yml @@ -48,7 +48,7 @@ jobs: echo "::set-output name=continue::1" fi - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v4 if: ${{ (steps.skip.outputs.continue == '1') }} with: python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/ci_pkg-install.yml b/.github/workflows/ci_pkg-install.yml index 2ecd9a920c6a7..342e027b07cfe 100644 --- a/.github/workflows/ci_pkg-install.yml +++ b/.github/workflows/ci_pkg-install.yml @@ -39,7 +39,7 @@ jobs: steps: - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} @@ 
-73,7 +73,7 @@ jobs: steps: - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} @@ -105,7 +105,7 @@ jobs: steps: - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 5a8d7164f12c8..ed9cd46adbe44 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -16,7 +16,7 @@ jobs: steps: - uses: actions/checkout@v3 - - uses: actions/setup-python@v3 + - uses: actions/setup-python@v4 with: python-version: 3.9 diff --git a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml index 9d9ff3a0da8e0..977118b644ef3 100644 --- a/.github/workflows/docs-checks.yml +++ b/.github/workflows/docs-checks.yml @@ -22,7 +22,7 @@ jobs: - uses: actions/checkout@v2 with: submodules: true - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v4 with: python-version: 3.9 @@ -74,7 +74,7 @@ jobs: with: submodules: true # lfs: true - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v4 with: python-version: 3.9 diff --git a/.github/workflows/docs-deploy.yml b/.github/workflows/docs-deploy.yml index cb6e090e5a0e3..a408f882ac558 100644 --- a/.github/workflows/docs-deploy.yml +++ b/.github/workflows/docs-deploy.yml @@ -13,7 +13,7 @@ jobs: # If you're using actions/checkout@v2 you must set persist-credentials to false in most cases for the deployment to work correctly. with: persist-credentials: false - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v4 with: python-version: 3.8 diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml index af04ec45d34a4..3e955db3738d3 100644 --- a/.github/workflows/events-nightly.yml +++ b/.github/workflows/events-nightly.yml @@ -16,7 +16,7 @@ jobs: steps: # does nightly releases from feature branch - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v4 with: python-version: 3.9 diff --git a/.github/workflows/legacy-checkpoints.yml b/.github/workflows/legacy-checkpoints.yml index ffe65663690f3..0856cfd3229a2 100644 --- a/.github/workflows/legacy-checkpoints.yml +++ b/.github/workflows/legacy-checkpoints.yml @@ -10,7 +10,7 @@ jobs: steps: - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v4 with: python-version: 3.9 diff --git a/.github/workflows/release-pypi.yml b/.github/workflows/release-pypi.yml index eae9da931c88f..7876f05cfe1e6 100644 --- a/.github/workflows/release-pypi.yml +++ b/.github/workflows/release-pypi.yml @@ -22,7 +22,7 @@ jobs: pull-pkgs: ${{ steps.download.outputs.pkgs }} steps: - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v4 with: python-version: 3.9 - run: | @@ -65,7 +65,7 @@ jobs: with: name: dist-packages-${{ github.sha }} path: dist - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v4 with: python-version: 3.9 @@ -99,7 +99,7 @@ jobs: with: name: pypi-packages-${{ github.sha }} path: pypi - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v4 with: python-version: 3.9 @@ -129,7 +129,7 @@ jobs: name: pypi-packages-${{ github.sha }} path: pypi - run: ls -lh pypi/ - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v4 with: python-version: 3.9 @@ -222,7 +222,7 @@ jobs: steps: - uses: actions/checkout@v2 - - uses: 
actions/setup-python@v2 + - uses: actions/setup-python@v4 with: python-version: 3.9 From 91bdacf8d2265fd6a6c06c98f0e332badb393840 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 1 Aug 2022 17:45:13 +0000 Subject: [PATCH 054/230] Bump actions/cache from 2 to 3 (#13955) Bumps [actions/cache](https://github.com/actions/cache) from 2 to 3. - [Release notes](https://github.com/actions/cache/releases) - [Changelog](https://github.com/actions/cache/blob/main/RELEASES.md) - [Commits](https://github.com/actions/cache/compare/v2...v3) --- updated-dependencies: - dependency-name: actions/cache dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/ci-app_cloud_e2e_test.yml | 4 ++-- .github/workflows/docs-deploy.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci-app_cloud_e2e_test.yml b/.github/workflows/ci-app_cloud_e2e_test.yml index 6ecfaa4e212f5..3ad455650a117 100644 --- a/.github/workflows/ci-app_cloud_e2e_test.yml +++ b/.github/workflows/ci-app_cloud_e2e_test.yml @@ -75,7 +75,7 @@ jobs: # TODO: Enable cache # - name: Cache virtualenv # id: cache-venv -# uses: actions/cache@v2 +# uses: actions/cache@v3 # with: # path: ./.venv/ # key: ${{ runner.os }}-pip-${{ matrix.app_name }}-${{ hashFiles('requirements/app/base.txt', 'requirements/app/*.txt', 'src/lightning_app/__version__.py') }} @@ -90,7 +90,7 @@ jobs: - name: Cache Playwright dependencies id: playwright-cache - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: ~/.cache/ms-playwright key: ${{ runner.os }}-playwright-${{ matrix.app_name }}-${{ hashFiles('requirements/app/base.txt', 'requirements/app/*.txt', 'src/lightning_app/__version__.py') }} diff --git a/.github/workflows/docs-deploy.yml b/.github/workflows/docs-deploy.yml index a408f882ac558..6fdc1f069652c 100644 --- a/.github/workflows/docs-deploy.yml +++ b/.github/workflows/docs-deploy.yml @@ -31,7 +31,7 @@ jobs: # Note: This uses an internal pip API and may not always work # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow - name: Cache pip - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: ~/.cache/pip key: ${{ runner.os }}-deploy-docs-pip-${{ hashFiles('requirements/app/*.txt') }} From eb233ea12dacf010edda49470396e85707d8c00e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 2 Aug 2022 00:21:46 +0200 Subject: [PATCH 055/230] Snapshot selected globals and restore them in spawned process (#13921) Co-authored-by: Jirka Borovec --- src/pytorch_lightning/CHANGELOG.md | 3 + src/pytorch_lightning/strategies/ddp_spawn.py | 2 - .../strategies/launchers/multiprocessing.py | 65 ++++++++++++++++++- .../strategies/launchers/xla.py | 8 ++- src/pytorch_lightning/strategies/tpu_spawn.py | 2 - .../launchers/test_multiprocessing.py | 43 +++++++++++- 6 files changed, 113 insertions(+), 10 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index ea649a9b65236..1516b74453842 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -396,6 +396,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed an issue that caused the learning rate finder to set the model's learning rate to None when no suggestion was possible ([#13845](https://github.com/Lightning-AI/lightning/pull/13845)) +- Fixed an issue causing deterministic algorighms and other globals to get reset in spawned processes ([#13921](https://github.com/Lightning-AI/lightning/pull/13921)) + + - Fixed default `amp_level` for `DeepSpeedPrecisionPlugin` to `O2` ([#13897](https://github.com/PyTorchLightning/pytorch-lightning/pull/13897)) diff --git a/src/pytorch_lightning/strategies/ddp_spawn.py b/src/pytorch_lightning/strategies/ddp_spawn.py index 6a3460febbf07..30bcef457c44a 100644 --- a/src/pytorch_lightning/strategies/ddp_spawn.py +++ b/src/pytorch_lightning/strategies/ddp_spawn.py @@ -50,7 +50,6 @@ from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_11 from pytorch_lightning.utilities.optimizer import optimizers_to_device from pytorch_lightning.utilities.rank_zero import rank_zero_info, rank_zero_only -from pytorch_lightning.utilities.seed import reset_seed from pytorch_lightning.utilities.types import PredictStep, STEP_OUTPUT, TestStep, ValidationStep log = logging.getLogger(__name__) @@ -175,7 +174,6 @@ def set_world_ranks(self, process_idx: int = 0) -> None: rank_zero_only.rank = self.cluster_environment.global_rank() def _worker_setup(self, process_idx: int) -> None: - reset_seed() self.set_world_ranks(process_idx) rank_zero_only.rank = self.global_rank self._process_group_backend = self._get_process_group_backend() diff --git a/src/pytorch_lightning/strategies/launchers/multiprocessing.py b/src/pytorch_lightning/strategies/launchers/multiprocessing.py index 37e6c8d893150..91fa92b555ae0 100644 --- a/src/pytorch_lightning/strategies/launchers/multiprocessing.py +++ b/src/pytorch_lightning/strategies/launchers/multiprocessing.py @@ -13,11 +13,13 @@ # limitations under the License. 
import os from collections import UserList +from dataclasses import dataclass from multiprocessing.queues import SimpleQueue -from typing import Any, Callable, NamedTuple, Optional +from typing import Any, Callable, Dict, NamedTuple, Optional import numpy as np import torch +import torch.backends.cudnn import torch.multiprocessing as mp from torch import Tensor from typing_extensions import Literal @@ -27,7 +29,9 @@ from pytorch_lightning.strategies.strategy import Strategy from pytorch_lightning.trainer.states import TrainerFn, TrainerState from pytorch_lightning.utilities.apply_func import apply_to_collection, move_data_to_device +from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_11 from pytorch_lightning.utilities.rank_zero import rank_zero_debug +from pytorch_lightning.utilities.seed import _collect_rng_states, _set_rng_states from pytorch_lightning.utilities.types import _PATH @@ -89,9 +93,16 @@ def launch(self, function: Callable, *args: Any, trainer: Optional["pl.Trainer"] os.environ["MASTER_PORT"] = str(self._strategy.cluster_environment.main_port) context = mp.get_context(self._start_method) return_queue = context.SimpleQueue() + + if self._start_method == "spawn": + global_states = _GlobalStateSnapshot.capture() + process_args = [trainer, function, args, kwargs, return_queue, global_states] + else: + process_args = [trainer, function, args, kwargs, return_queue] + mp.start_processes( self._wrapping_function, - args=(trainer, function, args, kwargs, return_queue), + args=process_args, nprocs=self._strategy.num_processes, start_method=self._start_method, ) @@ -110,7 +121,10 @@ def _wrapping_function( args: Any, kwargs: Any, return_queue: SimpleQueue, + global_states: Optional["_GlobalStateSnapshot"] = None, ) -> None: + if global_states: + global_states.restore() self._strategy._worker_setup(process_idx) results = function(*args, **kwargs) @@ -209,3 +223,50 @@ class _WorkerOutput(NamedTuple): trainer_state: TrainerState trainer_results: Any extra: _FakeQueue + + +@dataclass +class _GlobalStateSnapshot: + """Captures a hand-selected set of (global) variables in modules and provides a way to restore them. + + It facilitates and encapsulates the transfer of globals like PyTorch's deterministic flags or random generator state + across process boundaries when launching processes with :func:`torch.multiprocessing.spawn`. + + Example: + + .. 
code-block:: python + + # in main process + snapshot = _GlobalStateSnapshot.capture() + + # in worker process + snapshot.restore() + """ + + use_deterministic_algorithms: bool + use_deterministic_algorithms_warn_only: bool + cudnn_benchmark: bool + rng_states: Dict[str, Any] + + @classmethod + def capture(cls) -> "_GlobalStateSnapshot": + """Capture a few global states from torch, numpy, etc., that we want to restore in a spawned worker + process.""" + warn_only = torch.is_deterministic_algorithms_warn_only_enabled() if _TORCH_GREATER_EQUAL_1_11 else False + return cls( + use_deterministic_algorithms=torch.are_deterministic_algorithms_enabled(), + use_deterministic_algorithms_warn_only=warn_only, + cudnn_benchmark=torch.backends.cudnn.benchmark, + rng_states=_collect_rng_states(), + ) + + def restore(self) -> None: + """Restores all globals to the values captured in the :meth:`capture` method.""" + if _TORCH_GREATER_EQUAL_1_11: + torch.use_deterministic_algorithms( + self.use_deterministic_algorithms, warn_only=self.use_deterministic_algorithms_warn_only + ) + else: + torch.use_deterministic_algorithms(self.use_deterministic_algorithms) + torch.backends.cudnn.benchmark = self.cudnn_benchmark + _set_rng_states(self.rng_states) diff --git a/src/pytorch_lightning/strategies/launchers/xla.py b/src/pytorch_lightning/strategies/launchers/xla.py index 037ec027bfd7d..064d952f71a8f 100644 --- a/src/pytorch_lightning/strategies/launchers/xla.py +++ b/src/pytorch_lightning/strategies/launchers/xla.py @@ -21,7 +21,12 @@ from torch.multiprocessing import ProcessContext import pytorch_lightning as pl -from pytorch_lightning.strategies.launchers.multiprocessing import _FakeQueue, _MultiProcessingLauncher, _WorkerOutput +from pytorch_lightning.strategies.launchers.multiprocessing import ( + _FakeQueue, + _GlobalStateSnapshot, + _MultiProcessingLauncher, + _WorkerOutput, +) from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities import _TPU_AVAILABLE from pytorch_lightning.utilities.apply_func import move_data_to_device @@ -96,6 +101,7 @@ def _wrapping_function( args: Any, kwargs: Any, return_queue: SimpleQueue, + global_states: Optional[_GlobalStateSnapshot] = None, ) -> None: self._strategy._worker_setup(process_idx) results = function(*args, **kwargs) diff --git a/src/pytorch_lightning/strategies/tpu_spawn.py b/src/pytorch_lightning/strategies/tpu_spawn.py index 2d474fafe51b1..4d20e784e0d29 100644 --- a/src/pytorch_lightning/strategies/tpu_spawn.py +++ b/src/pytorch_lightning/strategies/tpu_spawn.py @@ -37,7 +37,6 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.optimizer import optimizers_to_device from pytorch_lightning.utilities.rank_zero import rank_zero_only -from pytorch_lightning.utilities.seed import reset_seed from pytorch_lightning.utilities.types import _PATH, STEP_OUTPUT if _TPU_AVAILABLE: @@ -206,7 +205,6 @@ def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ def _worker_setup(self, process_idx: int): self._launched = True - reset_seed() self.set_world_ranks(process_idx) rank_zero_only.rank = self.global_rank diff --git a/tests/tests_pytorch/strategies/launchers/test_multiprocessing.py b/tests/tests_pytorch/strategies/launchers/test_multiprocessing.py index 2a5fe82928a67..ad3e891ad607f 100644 --- a/tests/tests_pytorch/strategies/launchers/test_multiprocessing.py +++ b/tests/tests_pytorch/strategies/launchers/test_multiprocessing.py @@ -15,19 +15,20 @@ from 
unittest.mock import ANY, Mock import pytest +import torch -from pytorch_lightning.strategies.launchers.multiprocessing import _MultiProcessingLauncher +from pytorch_lightning.strategies.launchers.multiprocessing import _GlobalStateSnapshot, _MultiProcessingLauncher @mock.patch("pytorch_lightning.strategies.launchers.multiprocessing.mp.get_all_start_methods", return_value=[]) -def test_spawn_launcher_forking_on_unsupported_platform(_): +def test_multiprocessing_launcher_forking_on_unsupported_platform(_): with pytest.raises(ValueError, match="The start method 'fork' is not available on this platform"): _MultiProcessingLauncher(strategy=Mock(), start_method="fork") @pytest.mark.parametrize("start_method", ["spawn", "fork"]) @mock.patch("pytorch_lightning.strategies.launchers.multiprocessing.mp") -def test_spawn_launcher_start_method(mp_mock, start_method): +def test_multiprocessing_launcher_start_method(mp_mock, start_method): mp_mock.get_all_start_methods.return_value = [start_method] launcher = _MultiProcessingLauncher(strategy=Mock(), start_method=start_method) launcher.launch(function=Mock()) @@ -38,3 +39,39 @@ def test_spawn_launcher_start_method(mp_mock, start_method): nprocs=ANY, start_method=start_method, ) + + +@pytest.mark.parametrize("start_method", ["spawn", "fork"]) +@mock.patch("pytorch_lightning.strategies.launchers.multiprocessing.mp") +def test_multiprocessing_launcher_restore_globals(mp_mock, start_method): + """Test that we pass the global state snapshot to the worker function only if we are starting with 'spawn'.""" + mp_mock.get_all_start_methods.return_value = [start_method] + launcher = _MultiProcessingLauncher(strategy=Mock(), start_method=start_method) + launcher.launch(function=Mock()) + function_args = mp_mock.start_processes.call_args[1]["args"] + if start_method == "spawn": + assert len(function_args) == 6 + assert isinstance(function_args[5], _GlobalStateSnapshot) + else: + assert len(function_args) == 5 + + +def test_global_state_snapshot(): + """Test the capture() and restore() methods for the global state snapshot.""" + torch.use_deterministic_algorithms(True) + torch.backends.cudnn.benchmark = False + torch.manual_seed(123) + + # capture the state of globals + snapshot = _GlobalStateSnapshot.capture() + + # simulate there is a process boundary and flags get reset here + torch.use_deterministic_algorithms(False) + torch.backends.cudnn.benchmark = True + torch.manual_seed(321) + + # restore the state of globals + snapshot.restore() + assert torch.are_deterministic_algorithms_enabled() + assert not torch.backends.cudnn.benchmark + assert torch.initial_seed() == 123 From b3203d93d046db42daed95b885cbad44316773c6 Mon Sep 17 00:00:00 2001 From: Jerome Anand <88475913+jerome-habana@users.noreply.github.com> Date: Tue, 2 Aug 2022 13:31:31 +0530 Subject: [PATCH 056/230] Added support for HPU device stats monitor (#13819) * Added support for HPU device stats monitor Signed-off-by: Jerome * Update changelog Signed-off-by: Jerome * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Apply suggestions from code review Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> * Update reference Signed-off-by: Jerome * Apply suggestions from code review Co-authored-by: Rohit Gupta * fix alignment * add descriptions * Update hpu_intermediate.rst Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> 
Co-authored-by: Rohit Gupta --- .../source-pytorch/accelerators/hpu_basic.rst | 1 - .../accelerators/hpu_intermediate.rst | 31 +++++++++++++++++++ src/pytorch_lightning/CHANGELOG.md | 3 ++ src/pytorch_lightning/accelerators/hpu.py | 21 +++++++++++-- tests/tests_pytorch/accelerators/test_hpu.py | 20 ++++++++++++ 5 files changed, 72 insertions(+), 4 deletions(-) diff --git a/docs/source-pytorch/accelerators/hpu_basic.rst b/docs/source-pytorch/accelerators/hpu_basic.rst index b222782dfc6f5..a6c20414a7a02 100644 --- a/docs/source-pytorch/accelerators/hpu_basic.rst +++ b/docs/source-pytorch/accelerators/hpu_basic.rst @@ -79,5 +79,4 @@ Known limitations ----------------- * `Habana dataloader `__ is not supported. -* :class:`~pytorch_lightning.callbacks.device_stats_monitor.DeviceStatsMonitor` is not supported. * :func:`torch.inference_mode` is not supported diff --git a/docs/source-pytorch/accelerators/hpu_intermediate.rst b/docs/source-pytorch/accelerators/hpu_intermediate.rst index 0e08683211431..3b1c0e6b43707 100644 --- a/docs/source-pytorch/accelerators/hpu_intermediate.rst +++ b/docs/source-pytorch/accelerators/hpu_intermediate.rst @@ -66,3 +66,34 @@ This enables advanced users to provide their own BF16 and FP32 operator list ins trainer.fit(model, datamodule=dm) For more details, please refer to `PyTorch Mixed Precision Training on Gaudi `__. + +---- + +Enabling DeviceStatsMonitor with HPUs +---------------------------------------- + +:class:`~pytorch_lightning.callbacks.device_stats_monitor.DeviceStatsMonitor` is a callback that automatically monitors and logs device stats during the training stage. +This callback can be passed for training with HPUs. It returns a map of the following metrics with their values in bytes of type uint64: + +- **Limit**: amount of total memory on HPU device. +- **InUse**: amount of allocated memory at any instance. +- **MaxInUse**: amount of total active memory allocated. +- **NumAllocs**: number of allocations. +- **NumFrees**: number of freed chunks. +- **ActiveAllocs**: number of active allocations. +- **MaxAllocSize**: maximum allocated size. +- **TotalSystemAllocs**: total number of system allocations. +- **TotalSystemFrees**: total number of system frees. +- **TotalActiveAllocs**: total number of active allocations. + +The below snippet shows how DeviceStatsMonitor can be enabled. + +.. code-block:: python + + from pytorch_lightning import Trainer + from pytorch_lightning.callbacks import DeviceStatsMonitor + + device_stats = DeviceStatsMonitor() + trainer = Trainer(accelerator="hpu", callbacks=[device_stats]) + +For more details, please refer to `Memory Stats APIs `__. diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 1516b74453842..719ee0363eaf1 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -111,6 +111,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Added support for async checkpointing ([#13658](https://github.com/Lightning-AI/lightning/pull/13658)) +- Added support for HPU Device stats monitor ([#13819](https://github.com/Lightning-AI/lightning/pull/13819)) + + ### Changed - `accelerator="gpu"` now automatically selects an available GPU backend (CUDA and MPS currently) ([#13642](https://github.com/Lightning-AI/lightning/pull/13642)) diff --git a/src/pytorch_lightning/accelerators/hpu.py b/src/pytorch_lightning/accelerators/hpu.py index 686bf6bb9452d..8fc242fa55f20 100644 --- a/src/pytorch_lightning/accelerators/hpu.py +++ b/src/pytorch_lightning/accelerators/hpu.py @@ -39,9 +39,24 @@ def setup_environment(self, root_device: torch.device) -> None: raise MisconfigurationException(f"Device should be HPU, got {root_device} instead.") def get_device_stats(self, device: Union[str, torch.device]) -> Dict[str, Any]: - """HPU device stats aren't supported yet.""" - rank_zero_debug("HPU device stats aren't supported yet.") - return {} + """Returns a map of the following metrics with their values: + + - Limit: amount of total memory on HPU device. + - InUse: amount of allocated memory at any instance. + - MaxInUse: amount of total active memory allocated. + - NumAllocs: number of allocations. + - NumFrees: number of freed chunks. + - ActiveAllocs: number of active allocations. + - MaxAllocSize: maximum allocated size. + - TotalSystemAllocs: total number of system allocations. + - TotalSystemFrees: total number of system frees. + - TotalActiveAllocs: total number of active allocations. + """ + try: + return torch_hpu.hpu.memory_stats(device) + except (AttributeError, NameError): + rank_zero_debug("HPU `get_device_stats` failed") + return {} @staticmethod def parse_devices(devices: Union[int, str, List[int]]) -> Optional[int]: diff --git a/tests/tests_pytorch/accelerators/test_hpu.py b/tests/tests_pytorch/accelerators/test_hpu.py index 0ef63de417907..4947000b47162 100644 --- a/tests/tests_pytorch/accelerators/test_hpu.py +++ b/tests/tests_pytorch/accelerators/test_hpu.py @@ -303,3 +303,23 @@ def training_epoch_end(self, outputs) -> None: trainer.fit(model) assert all(model.optims) + + +@RunIf(hpu=True) +def test_hpu_device_stats_monitor(tmpdir): + + hpu_stats = HPUAccelerator().get_device_stats("hpu") + fields = [ + "Limit", + "InUse", + "MaxInUse", + "NumAllocs", + "NumFrees", + "ActiveAllocs", + "MaxAllocSize", + "TotalSystemAllocs", + "TotalSystemFrees", + "TotalActiveAllocs", + ] + for f in fields: + assert any(f in h for h in hpu_stats.keys()) From 2919dcf7eea181a2706bbcb11201f2f8515cad36 Mon Sep 17 00:00:00 2001 From: Raphael Randschau Date: Tue, 2 Aug 2022 01:31:09 -0700 Subject: [PATCH 057/230] [CLI] add support for cluster management (#13835) --- src/lightning_app/CHANGELOG.md | 1 + src/lightning_app/cli/cmd_clusters.py | 206 ++++++++++++++++++ src/lightning_app/cli/core.py | 13 ++ src/lightning_app/cli/lightning_cli.py | 16 +- src/lightning_app/cli/lightning_cli_create.py | 86 ++++++++ src/lightning_app/cli/lightning_cli_delete.py | 49 +++++ src/lightning_app/cli/lightning_cli_list.py | 16 ++ src/lightning_app/testing/testing.py | 24 ++ src/lightning_app/utilities/openapi.py | 61 ++++++ tests/tests_app/cli/test_cli.py | 58 ++++- tests/tests_app/cli/test_cmd_clusters.py | 135 ++++++++++++ tests/tests_clusters/__init__.py | 0 .../tests_clusters/test_cluster_lifecycle.py | 53 +++++ 13 files changed, 707 insertions(+), 11 deletions(-) create mode 100644 src/lightning_app/cli/cmd_clusters.py create mode 100644 
src/lightning_app/cli/core.py create mode 100644 src/lightning_app/cli/lightning_cli_create.py create mode 100644 src/lightning_app/cli/lightning_cli_delete.py create mode 100644 src/lightning_app/cli/lightning_cli_list.py create mode 100644 src/lightning_app/utilities/openapi.py create mode 100644 tests/tests_app/cli/test_cmd_clusters.py create mode 100644 tests/tests_clusters/__init__.py create mode 100644 tests/tests_clusters/test_cluster_lifecycle.py diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 89fcd615430aa..34fdb9665f5aa 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -9,6 +9,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added - Add support for `Lightning App Commands` through the `configure_commands` hook on the Lightning Flow and the `ClientCommand` ([#13602](https://github.com/Lightning-AI/lightning/pull/13602)) +- Add support for Lightning AI BYOC cluster management ([#13835](https://github.com/Lightning-AI/lightning/pull/13835)) - Adds `LightningTrainingComponent`. `LightningTrainingComponent` orchestrates multi-node training in the cloud ([#13830](https://github.com/Lightning-AI/lightning/pull/13830)) diff --git a/src/lightning_app/cli/cmd_clusters.py b/src/lightning_app/cli/cmd_clusters.py new file mode 100644 index 0000000000000..7acdc9b63022d --- /dev/null +++ b/src/lightning_app/cli/cmd_clusters.py @@ -0,0 +1,206 @@ +import json +import re +import time +from datetime import datetime + +import click +from lightning_cloud.openapi import ( + V1AWSClusterDriverSpec, + V1ClusterDriver, + V1ClusterPerformanceProfile, + V1ClusterSpec, + V1CreateClusterRequest, + V1InstanceSpec, + V1KubernetesClusterDriver, +) +from lightning_cloud.openapi.models import Externalv1Cluster, V1ClusterState, V1ClusterType +from rich.console import Console +from rich.table import Table +from rich.text import Text + +from lightning_app.cli.core import Formatable +from lightning_app.utilities.network import LightningClient +from lightning_app.utilities.openapi import create_openapi_object, string2dict + +CLUSTER_STATE_CHECKING_TIMEOUT = 60 +MAX_CLUSTER_WAIT_TIME = 5400 + + +class AWSClusterManager: + """AWSClusterManager implements API calls specific to Lightning AI BYOC compute clusters when the AWS provider + is selected as the backend compute.""" + + def __init__(self): + self.api_client = LightningClient() + + def create( + self, + cost_savings: bool = False, + cluster_name: str = None, + role_arn: str = None, + region: str = "us-east-1", + external_id: str = None, + instance_types: [str] = [], + edit_before_creation: bool = False, + wait: bool = False, + ): + """request Lightning AI BYOC compute cluster creation. + + Args: + cost_savings: Specifies if the cluster uses cost savings mode + cluster_name: The name of the cluster to be created + role_arn: AWS IAM Role ARN used to provision resources + region: AWS region containing compute resources + external_id: AWS IAM Role external ID + instance_types: AWS instance types supported by the cluster + edit_before_creation: Enables interactive editing of requests before submitting it to Lightning AI. + wait: Waits for the cluster to be in a RUNNING state. Only use this for debugging. 
+ """ + performance_profile = V1ClusterPerformanceProfile.DEFAULT + if cost_savings: + """In cost saving mode the number of compute nodes is reduced to one, reducing the cost for clusters + with low utilization.""" + performance_profile = V1ClusterPerformanceProfile.COST_SAVING + + body = V1CreateClusterRequest( + name=cluster_name, + spec=V1ClusterSpec( + cluster_type=V1ClusterType.BYOC, + performance_profile=performance_profile, + driver=V1ClusterDriver( + kubernetes=V1KubernetesClusterDriver( + aws=V1AWSClusterDriverSpec( + region=region, + role_arn=role_arn, + external_id=external_id, + instance_types=[V1InstanceSpec(name=x) for x in instance_types], + ) + ) + ), + ), + ) + new_body = body + if edit_before_creation: + after = click.edit(json.dumps(body.to_dict(), indent=4)) + if after is not None: + new_body = create_openapi_object(string2dict(after), body) + if new_body == body: + click.echo("cluster unchanged") + + resp = self.api_client.cluster_service_create_cluster(body=new_body) + if wait: + _wait_for_cluster_state(self.api_client, resp.id, V1ClusterState.RUNNING) + + click.echo(f"${resp.id} cluster is ${resp.status.phase}") + + def list(self): + resp = self.api_client.cluster_service_list_clusters(phase_not_in=[V1ClusterState.DELETED]) + console = Console() + console.print(ClusterList(resp.clusters).as_table()) + + def delete(self, cluster_id: str = None, force: bool = False, wait: bool = False): + if force: + click.echo( + """ + Deletes a BYOC cluster. Lightning AI removes cluster artifacts and any resources running on the cluster.\n + WARNING: Deleting a cluster does not clean up any resources managed by Lightning AI.\n + Check your cloud provider to verify that existing cloud resources are deleted. + """ + ) + click.confirm("Do you want to continue?", abort=True) + + self.api_client.cluster_service_delete_cluster(id=cluster_id, force=force) + click.echo("Cluster deletion triggered successfully") + + if wait: + _wait_for_cluster_state(self.api_client, cluster_id, V1ClusterState.DELETED) + + +class ClusterList(Formatable): + def __init__(self, clusters: [Externalv1Cluster]): + self.clusters = clusters + + def as_json(self) -> str: + return json.dumps(self.clusters) + + def as_table(self) -> Table: + table = Table("id", "name", "type", "status", "created", show_header=True, header_style="bold green") + phases = { + V1ClusterState.QUEUED: Text("queued", style="bold yellow"), + V1ClusterState.PENDING: Text("pending", style="bold yellow"), + V1ClusterState.RUNNING: Text("running", style="bold green"), + V1ClusterState.FAILED: Text("failed", style="bold red"), + V1ClusterState.DELETED: Text("deleted", style="bold red"), + } + + cluster_type_lookup = { + V1ClusterType.BYOC: Text("byoc", style="bold yellow"), + V1ClusterType.GLOBAL: Text("lightning-cloud", style="bold green"), + } + for cluster in self.clusters: + cluster: Externalv1Cluster + status = phases[cluster.status.phase] + if cluster.spec.desired_state == V1ClusterState.DELETED and cluster.status.phase != V1ClusterState.DELETED: + status = Text("terminating", style="bold red") + + # this guard is necessary only until 0.3.93 releases which includes the `created_at` + # field to the external API + created_at = datetime.now() + if hasattr(cluster, "created_at"): + created_at = cluster.created_at + + table.add_row( + cluster.id, + cluster.name, + cluster_type_lookup.get(cluster.spec.cluster_type, Text("unknown", style="red")), + status, + created_at.strftime("%Y-%m-%d") if created_at else "", + ) + return table + + +def 
_wait_for_cluster_state( + api_client: LightningClient, + cluster_id: str, + target_state: V1ClusterState, + max_wait_time: int = MAX_CLUSTER_WAIT_TIME, + check_timeout: int = CLUSTER_STATE_CHECKING_TIMEOUT, +): + """_wait_for_cluster_state waits until the provided cluster has reached a desired state, or failed. + + Args: + api_client: LightningClient used for polling + cluster_id: Specifies the cluster to wait for + target_state: Specifies the desired state the target cluster needs to meet + max_wait_time: Maximum duration to wait (in seconds) + check_timeout: duration between polling for the cluster state (in seconds) + """ + start = time.time() + elapsed = 0 + while elapsed < max_wait_time: + cluster_resp = api_client.cluster_service_list_clusters() + new_cluster = None + for clust in cluster_resp.clusters: + if clust.id == cluster_id: + new_cluster = clust + break + if new_cluster is not None: + if new_cluster.status.phase == target_state: + break + elif new_cluster.status.phase == V1ClusterState.FAILED: + raise click.ClickException(f"Cluster {cluster_id} is in failed state.") + time.sleep(check_timeout) + elapsed = time.time() - start + else: + raise click.ClickException("Max wait time elapsed") + + +def _check_cluster_name_is_valid(_ctx, _param, value): + pattern = r"^(?!-)[a-z0-9-]{1,63}(? Table: + pass + + @abc.abstractmethod + def as_json(self) -> str: + pass diff --git a/src/lightning_app/cli/lightning_cli.py b/src/lightning_app/cli/lightning_cli.py index 74b2d1c4926e1..bb81b4eda133f 100644 --- a/src/lightning_app/cli/lightning_cli.py +++ b/src/lightning_app/cli/lightning_cli.py @@ -12,6 +12,9 @@ from lightning_app import __version__ as ver from lightning_app.cli import cmd_init, cmd_install, cmd_pl_init, cmd_react_ui_init +from lightning_app.cli.lightning_cli_create import create +from lightning_app.cli.lightning_cli_delete import delete +from lightning_app.cli.lightning_cli_list import get_list from lightning_app.core.constants import get_lightning_cloud_url, LOCAL_LAUNCH_ADMIN_VIEW from lightning_app.runners.runtime import dispatch from lightning_app.runners.runtime_type import RuntimeType @@ -206,16 +209,9 @@ def stop(): pass -@_main.group(hidden=True) -def delete(): - """Delete an application.""" - pass - - -@_main.group(name="list", hidden=True) -def get_list(): - """List your applications.""" - pass +_main.add_command(get_list) +_main.add_command(delete) +_main.add_command(create) @_main.group() diff --git a/src/lightning_app/cli/lightning_cli_create.py b/src/lightning_app/cli/lightning_cli_create.py new file mode 100644 index 0000000000000..7e45fe7e7c078 --- /dev/null +++ b/src/lightning_app/cli/lightning_cli_create.py @@ -0,0 +1,86 @@ +import click + +from lightning_app.cli.cmd_clusters import _check_cluster_name_is_valid, AWSClusterManager + + +@click.group("create") +def create(): + """Create Lightning AI BYOC managed resources.""" + pass + + +@create.command("cluster") +@click.argument("cluster_name", callback=_check_cluster_name_is_valid) +@click.option("--provider", "provider", type=str, default="aws", help="cloud provider to be used for your cluster") +@click.option("--external-id", "external_id", type=str, required=True) +@click.option( + "--role-arn", "role_arn", type=str, required=True, help="AWS role ARN attached to the associated resources." 
+) +@click.option( + "--region", + "region", + type=str, + required=False, + default="us-east-1", + help="AWS region that is used to host the associated resources.", +) +@click.option( + "--instance-types", + "instance_types", + type=str, + required=False, + default=None, + help="Instance types that you want to support, for computer jobs within the cluster.", +) +@click.option( + "--cost-savings", + "cost_savings", + type=bool, + required=False, + default=False, + is_flag=True, + help=""""Use this flag to ensure that the cluster is created with a profile that is optimized for cost savings. + This makes runs cheaper but start-up times may increase.""", +) +@click.option( + "--edit-before-creation", + default=False, + is_flag=True, + help="Edit the cluster specs before submitting them to the API server.", +) +@click.option( + "--wait", + "wait", + type=bool, + required=False, + default=False, + is_flag=True, + help="Enabling this flag makes the CLI wait until the cluster is running.", +) +def create_cluster( + cluster_name: str, + region: str, + role_arn: str, + external_id: str, + provider: str, + instance_types: str, + edit_before_creation: bool, + cost_savings: bool, + wait: bool, + **kwargs, +): + """Create a Lightning AI BYOC compute cluster with your cloud provider credentials.""" + if provider != "aws": + click.echo("Only AWS is supported for now. But support for more providers is coming soon.") + return + cluster_manager = AWSClusterManager() + cluster_manager.create( + cluster_name=cluster_name, + region=region, + role_arn=role_arn, + external_id=external_id, + instance_types=instance_types.split(","), + edit_before_creation=edit_before_creation, + cost_savings=cost_savings, + wait=wait, + ) diff --git a/src/lightning_app/cli/lightning_cli_delete.py b/src/lightning_app/cli/lightning_cli_delete.py new file mode 100644 index 0000000000000..c304b130bdf5d --- /dev/null +++ b/src/lightning_app/cli/lightning_cli_delete.py @@ -0,0 +1,49 @@ +import click + +from lightning_app.cli.cmd_clusters import AWSClusterManager + + +@click.group("delete") +def delete(): + """Delete Lightning AI BYOC managed resources.""" + pass + + +@delete.command("cluster") +@click.argument("cluster", type=str) +@click.option( + "--force", + "force", + type=bool, + required=False, + default=False, + is_flag=True, + help="""Delete a BYOC cluster from Lightning AI. This does NOT delete any resources created by the cluster, + it just removes the entry from Lightning AI. + + WARNING: You should NOT use this under normal circumstances.""", +) +@click.option( + "--wait", + "wait", + type=bool, + required=False, + default=False, + is_flag=True, + help="Enabling this flag makes the CLI wait until the cluster is deleted.", +) +def delete_cluster(cluster: str, force: bool = False, wait: bool = False): + """Delete a Lightning AI BYOC compute cluster and all associated cloud provider resources. + + Deleting a run also deletes all Runs and Experiments that were started on the cluster. + Deletion permanently removes not only the record of all runs on a cluster, but all associated experiments, + artifacts, metrics, logs, etc. + + WARNING: This process may take a few minutes to complete, but once started it CANNOT be rolled back. + Deletion permanently removes not only the BYOC cluster from being managed by Lightning AI, but tears down + every BYOC resource Lightning AI managed (for that cluster id) in the host cloud. + + All object stores, container registries, logs, compute nodes, volumes, etc. 
are deleted and cannot be recovered. + """ + cluster_manager = AWSClusterManager() + cluster_manager.delete(cluster_id=cluster, force=force, wait=wait) diff --git a/src/lightning_app/cli/lightning_cli_list.py b/src/lightning_app/cli/lightning_cli_list.py new file mode 100644 index 0000000000000..31f46537e8c5f --- /dev/null +++ b/src/lightning_app/cli/lightning_cli_list.py @@ -0,0 +1,16 @@ +import click + +from lightning_app.cli.cmd_clusters import AWSClusterManager + + +@click.group(name="list") +def get_list(): + """List your Lightning AI BYOC managed resources.""" + pass + + +@get_list.command("clusters") +def list_clusters(**kwargs): + """List your Lightning AI BYOC compute clusters.""" + cluster_manager = AWSClusterManager() + cluster_manager.list() diff --git a/src/lightning_app/testing/testing.py b/src/lightning_app/testing/testing.py index cc03f5badec2b..10abdac4aad5d 100644 --- a/src/lightning_app/testing/testing.py +++ b/src/lightning_app/testing/testing.py @@ -2,6 +2,7 @@ import json import os import shutil +import subprocess import sys import tempfile import time @@ -130,6 +131,29 @@ def browser_context_args(browser_context_args: Dict) -> Dict: } +@contextmanager +def run_cli(args) -> Generator: + """This utility is used to automate end-to-end testing of the Lightning AI CLI.""" + cmd = [ + sys.executable, + "-m", + "lightning", + ] + args + + with tempfile.TemporaryDirectory() as tmpdir: + env_copy = os.environ.copy() + process = Popen( + cmd, + cwd=tmpdir, + env=env_copy, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + process.wait() + + yield process.stdout.read().decode("UTF-8"), process.stderr.read().decode("UTF-8") + + @requires("playwright") @contextmanager def run_app_in_cloud(app_folder: str, app_name: str = "app.py") -> Generator: diff --git a/src/lightning_app/utilities/openapi.py b/src/lightning_app/utilities/openapi.py new file mode 100644 index 0000000000000..f533b1f8de3dc --- /dev/null +++ b/src/lightning_app/utilities/openapi.py @@ -0,0 +1,61 @@ +import json +from typing import Any, Dict + + +def _duplicate_checker(js): + """_duplicate_checker verifies that your JSON object doesn't contain duplicate keys.""" + result = {} + for name, value in js: + if name in result: + raise ValueError( + f"Unable to load JSON. A duplicate key {name} was detected. JSON objects must have unique keys." + ) + result[name] = value + return result + + +def string2dict(text): + """string2dict parses a JSON string into a dictionary, ensuring no keys are duplicated by accident.""" + if not isinstance(text, str): + text = text.decode("utf-8") + try: + js = json.loads(text, object_pairs_hook=_duplicate_checker) + return js + except ValueError as e: + raise ValueError(f"Unable to load JSON: {str(e)}.") + + +def is_openapi(obj): + """is_openopi checks if an object was generated by OpenAPI.""" + return hasattr(obj, "swagger_types") + + +def create_openapi_object(json_obj: Dict, target: Any): + """Create the OpenAPI object from the given JSON dict and based on the target object. + + Lightning AI uses the target object to make new objects from the given JSON spec so the target must be a valid + object. 
+ """ + if not isinstance(json_obj, dict): + raise TypeError("json_obj must be a dictionary") + if not is_openapi(target): + raise TypeError("target must be an openapi object") + + target_attribs = {} + for key, value in json_obj.items(): + try: + # user provided key is not a valid key on openapi object + sub_target = getattr(target, key) + except AttributeError: + raise ValueError(f"Field {key} not found in the target object") + + if is_openapi(sub_target): # it's an openapi object + target_attribs[key] = create_openapi_object(value, sub_target) + else: + target_attribs[key] = value + + # TODO(sherin) - specifically process list and dict and do the validation. Also do the + # verification for enum types + + new_target = target.__class__(**target_attribs) + return new_target diff --git a/tests/tests_app/cli/test_cli.py b/tests/tests_app/cli/test_cli.py index 39d8d6b7890b6..3e003293692a8 100644 --- a/tests/tests_app/cli/test_cli.py +++ b/tests/tests_app/cli/test_cli.py @@ -1,11 +1,15 @@ import os from unittest import mock +from unittest.mock import MagicMock import pytest from click.testing import CliRunner from lightning_cloud.openapi import Externalv1LightningappInstance from lightning_app.cli.lightning_cli import _main, get_app_url, login, logout, run +from lightning_app.cli.lightning_cli_create import create, create_cluster +from lightning_app.cli.lightning_cli_delete import delete, delete_cluster +from lightning_app.cli.lightning_cli_list import get_list, list_clusters from lightning_app.runners.runtime_type import RuntimeType @@ -37,7 +41,7 @@ def test_start_target_url(runtime_type, extra_args, lightning_cloud_url, expecte assert get_app_url(runtime_type, *extra_args) == expected_url -@pytest.mark.parametrize("command", [_main, run]) +@pytest.mark.parametrize("command", [_main, run, get_list, create, delete]) def test_commands(command): runner = CliRunner() result = runner.invoke(command) @@ -50,6 +54,9 @@ def test_main_lightning_cli_help(): assert "login " in res assert "logout " in res assert "run " in res + assert "list " in res + assert "delete " in res + assert "create " in res res = os.popen("python -m lightning run --help").read() assert "app " in res @@ -61,6 +68,55 @@ def test_main_lightning_cli_help(): assert "frontend" not in res +@mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) +@mock.patch("lightning_app.cli.cmd_clusters.AWSClusterManager.create") +def test_create_cluster(create: mock.MagicMock): + runner = CliRunner() + runner.invoke( + create_cluster, + [ + "test-7", + "--provider", + "aws", + "--external-id", + "dummy", + "--role-arn", + "arn:aws:iam::1234567890:role/lai-byoc", + "--instance-types", + "t2.small", + ], + ) + + create.assert_called_once_with( + cluster_name="test-7", + region="us-east-1", + role_arn="arn:aws:iam::1234567890:role/lai-byoc", + external_id="dummy", + instance_types=["t2.small"], + edit_before_creation=False, + cost_savings=False, + wait=False, + ) + + +@mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) +@mock.patch("lightning_app.cli.cmd_clusters.AWSClusterManager.list") +def test_list_clusters(list: mock.MagicMock): + runner = CliRunner() + runner.invoke(list_clusters) + + list.assert_called_once_with() + + +@mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) +@mock.patch("lightning_app.cli.cmd_clusters.AWSClusterManager.delete") +def test_delete_cluster(delete: mock.MagicMock): + runner = CliRunner() + runner.invoke(delete_cluster, ["test-7"]) + + 
delete.assert_called_once_with(cluster_id="test-7", force=False, wait=False) + + @mock.patch("lightning_app.utilities.login.Auth._run_server") @mock.patch("lightning_app.utilities.login.Auth.clear") def test_cli_login(clear: mock.MagicMock, run_server: mock.MagicMock): diff --git a/tests/tests_app/cli/test_cmd_clusters.py b/tests/tests_app/cli/test_cmd_clusters.py new file mode 100644 index 0000000000000..e835643fd94fa --- /dev/null +++ b/tests/tests_app/cli/test_cmd_clusters.py @@ -0,0 +1,135 @@ +from unittest import mock +from unittest.mock import MagicMock + +import click +import pytest +from lightning_cloud.openapi import ( + V1AWSClusterDriverSpec, + V1ClusterDriver, + V1ClusterPerformanceProfile, + V1ClusterSpec, + V1ClusterType, + V1CreateClusterRequest, + V1InstanceSpec, + V1KubernetesClusterDriver, +) +from lightning_cloud.openapi.models import Externalv1Cluster, V1ClusterState, V1ClusterStatus, V1ListClustersResponse + +from lightning_app.cli import cmd_clusters +from lightning_app.cli.cmd_clusters import AWSClusterManager + + +class FakeLightningClient: + def __init__(self, list_responses=[], consume=True): + self.list_responses = list_responses + self.list_call_count = 0 + self.consume = consume + + def cluster_service_list_clusters(self, phase_not_in=None): + self.list_call_count = self.list_call_count + 1 + if self.consume: + return self.list_responses.pop() + return self.list_responses[0] + + +@mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) +@mock.patch("lightning_app.utilities.network.LightningClient.cluster_service_create_cluster") +def test_create_cluster(api: mock.MagicMock): + cluster_manager = AWSClusterManager() + cluster_manager.create( + cluster_name="test-7", + external_id="dummy", + role_arn="arn:aws:iam::1234567890:role/lai-byoc", + instance_types=["t2.small"], + region="us-west-2", + ) + + api.assert_called_once_with( + body=V1CreateClusterRequest( + name="test-7", + spec=V1ClusterSpec( + cluster_type=V1ClusterType.BYOC, + performance_profile=V1ClusterPerformanceProfile.DEFAULT, + driver=V1ClusterDriver( + kubernetes=V1KubernetesClusterDriver( + aws=V1AWSClusterDriverSpec( + region="us-west-2", + role_arn="arn:aws:iam::1234567890:role/lai-byoc", + external_id="dummy", + instance_types=[V1InstanceSpec(name="t2.small")], + ) + ) + ), + ), + ) + ) + + +@mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) +@mock.patch("lightning_app.utilities.network.LightningClient.cluster_service_list_clusters") +def test_list_clusters(api: mock.MagicMock): + cluster_manager = AWSClusterManager() + cluster_manager.list() + + api.assert_called_once_with(phase_not_in=[V1ClusterState.DELETED]) + + +@mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) +@mock.patch("lightning_app.utilities.network.LightningClient.cluster_service_delete_cluster") +def test_delete_cluster(api: mock.MagicMock): + cluster_manager = AWSClusterManager() + cluster_manager.delete(cluster_id="test-7") + + api.assert_called_once_with(id="test-7", force=False) + + +class Test_check_cluster_name_is_valid: + @pytest.mark.parametrize("name", ["test-7", "0wildgoat"]) + def test_valid(self, name): + assert cmd_clusters._check_cluster_name_is_valid(None, None, name) + + @pytest.mark.parametrize( + "name", ["(&%)!@#", "1234567890123456789012345678901234567890123456789012345678901234567890"] + ) + def test_invalid(self, name): + with pytest.raises(click.ClickException) as e: + cmd_clusters._check_cluster_name_is_valid(None, None, name) + assert "cluster name doesn't match 
regex pattern" in str(e.value) + + +class Test_wait_for_cluster_state: + # TODO(rra) add tests for pagination + + @pytest.mark.parametrize("target_state", [V1ClusterState.RUNNING, V1ClusterState.DELETED]) + @pytest.mark.parametrize( + "previous_state", [V1ClusterState.QUEUED, V1ClusterState.PENDING, V1ClusterState.UNSPECIFIED] + ) + def test_happy_path(self, target_state, previous_state): + client = FakeLightningClient( + list_responses=[ + V1ListClustersResponse( + clusters=[Externalv1Cluster(id="test-cluster", status=V1ClusterStatus(phase=state))] + ) + for state in [previous_state, target_state] + ] + ) + cmd_clusters._wait_for_cluster_state(client, "test-cluster", target_state, check_timeout=0.1) + assert client.list_call_count == 1 + + @pytest.mark.parametrize("target_state", [V1ClusterState.RUNNING, V1ClusterState.DELETED]) + def test_times_out(self, target_state): + client = FakeLightningClient( + list_responses=[ + V1ListClustersResponse( + clusters=[ + Externalv1Cluster(id="test-cluster", status=V1ClusterStatus(phase=V1ClusterState.UNSPECIFIED)) + ] + ) + ], + consume=False, + ) + with pytest.raises(click.ClickException) as e: + cmd_clusters._wait_for_cluster_state( + client, "test-cluster", target_state, max_wait_time=0.4, check_timeout=0.2 + ) + assert "Max wait time elapsed" in str(e.value) diff --git a/tests/tests_clusters/__init__.py b/tests/tests_clusters/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/tests_clusters/test_cluster_lifecycle.py b/tests/tests_clusters/test_cluster_lifecycle.py new file mode 100644 index 0000000000000..cd48761f5fc99 --- /dev/null +++ b/tests/tests_clusters/test_cluster_lifecycle.py @@ -0,0 +1,53 @@ +import os +import uuid + +import pytest + +from src.lightning_app.testing.testing import run_cli + + +@pytest.mark.cloud +@pytest.mark.skipif( + os.environ.get("LIGHTNING_BYOC_ROLE_ARN") is None, reason="missing LIGHTNING_BYOC_ROLE_ARN environment variable" +) +@pytest.mark.skipif( + os.environ.get("LIGHTNING_BYOC_EXTERNAL_ID") is None, + reason="missing LIGHTNING_BYOC_EXTERNAL_ID environment variable", +) +def test_cluster_lifecycle() -> None: + role_arn = os.environ.get("LIGHTNING_BYOC_ROLE_ARN", None) + external_id = os.environ.get("LIGHTNING_BYOC_EXTERNAL_ID", None) + region = "us-west-2" + instance_types = "t2.small,t3.small" + cluster_name = "byoc-%s" % (uuid.uuid4()) + with run_cli( + [ + "create", + "cluster", + cluster_name, + "--provider", + "aws", + "--role-arn", + role_arn, + "--external-id", + external_id, + "--region", + region, + "--instance-types", + instance_types, + "--wait", + ] + ) as (stdout, stderr): + assert "success" in stdout, f"stdout: {stdout}\nstderr: {stderr}" + + with run_cli(["list", "clusters"]) as (stdout, stderr): + assert cluster_name in stdout, f"stdout: {stdout}\nstderr: {stderr}" + + with run_cli(["delete", "cluster", "--force", cluster_name]) as (stdout, stderr): + assert "success" in stdout, f"stdout: {stdout}\nstderr: {stderr}" + + +@pytest.mark.cloud +def test_cluster_list() -> None: + with run_cli(["list", "clusters"]) as (stdout, stderr): + assert "lightning-cloud" in stdout, f"stdout: {stdout}\nstderr: {stderr}" From 0fbfbf9e818d22bbfa5b15bec7e294ad51bc73d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 2 Aug 2022 10:55:07 +0200 Subject: [PATCH 058/230] Make tbptt imports Python 3.10 compatible (#13973) * Make tbptt imports Python 3.10 compatible * add chlog --- src/pytorch_lightning/CHANGELOG.md | 3 +++ 
src/pytorch_lightning/core/module.py | 8 ++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 719ee0363eaf1..b4a236b846dab 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -405,6 +405,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed default `amp_level` for `DeepSpeedPrecisionPlugin` to `O2` ([#13897](https://github.com/PyTorchLightning/pytorch-lightning/pull/13897)) +- Fixed Python 3.10 compatibility for truncated back-propagation through time (TBPTT) ([#13973](https://github.com/Lightning-AI/lightning/pull/13973)) + + ## [1.6.5] - 2022-07-13 diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py index a66c7679b3ee0..b8cc1d91cde18 100644 --- a/src/pytorch_lightning/core/module.py +++ b/src/pytorch_lightning/core/module.py @@ -13,7 +13,7 @@ # limitations under the License. """The LightningModule - an nn.Module with many additional features.""" -import collections +import collections.abc import inspect import logging import numbers @@ -1712,7 +1712,7 @@ def tbptt_split_batch(self, batch, split_size): for i, x in enumerate(batch): if isinstance(x, torch.Tensor): split_x = x[:, t:t + split_size] - elif isinstance(x, collections.Sequence): + elif isinstance(x, collections.abc.Sequence): split_x = [None] * len(x) for batch_idx in range(len(x)): split_x[batch_idx] = x[batch_idx][t:t + split_size] @@ -1726,7 +1726,7 @@ def tbptt_split_batch(self, batch, split_size): if :paramref:`~pytorch_lightning.core.module.LightningModule.truncated_bptt_steps` > 0. Each returned batch split is passed separately to :meth:`training_step`. """ - time_dims = [len(x[0]) for x in batch if isinstance(x, (Tensor, collections.Sequence))] + time_dims = [len(x[0]) for x in batch if isinstance(x, (Tensor, collections.abc.Sequence))] assert len(time_dims) >= 1, "Unable to determine batch time dimension" assert all(x == time_dims[0] for x in time_dims), "Batch time dimension length is ambiguous" @@ -1736,7 +1736,7 @@ def tbptt_split_batch(self, batch, split_size): for i, x in enumerate(batch): if isinstance(x, Tensor): split_x = x[:, t : t + split_size] - elif isinstance(x, collections.Sequence): + elif isinstance(x, collections.abc.Sequence): split_x = [None] * len(x) for batch_idx in range(len(x)): split_x[batch_idx] = x[batch_idx][t : t + split_size] From d8e5e7f889646e2ae3f480941b4c9e18434e994d Mon Sep 17 00:00:00 2001 From: Lee Jungwon <33821003+BongYang@users.noreply.github.com> Date: Tue, 2 Aug 2022 18:20:47 +0900 Subject: [PATCH 059/230] Fix mypy typing errors in pytorch_lightning/strategies/tpu_spawn.py (#13813) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: awaelchli Co-authored-by: Carlos Mocholí Co-authored-by: otaj --- pyproject.toml | 1 - src/pytorch_lightning/strategies/tpu_spawn.py | 48 ++++++++++++------- .../trainer/connectors/data_connector.py | 2 +- src/pytorch_lightning/utilities/apply_func.py | 2 +- 4 files changed, 32 insertions(+), 21 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 05eba62c50402..15f0293bb1c8a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -66,7 +66,6 @@ module = [ "pytorch_lightning.strategies.ipu", "pytorch_lightning.strategies.sharded", "pytorch_lightning.strategies.sharded_spawn", - "pytorch_lightning.strategies.tpu_spawn", "pytorch_lightning.trainer.callback_hook", 
"pytorch_lightning.trainer.connectors.callback_connector", "pytorch_lightning.trainer.connectors.data_connector", diff --git a/src/pytorch_lightning/strategies/tpu_spawn.py b/src/pytorch_lightning/strategies/tpu_spawn.py index 4d20e784e0d29..62bb1c308480b 100644 --- a/src/pytorch_lightning/strategies/tpu_spawn.py +++ b/src/pytorch_lightning/strategies/tpu_spawn.py @@ -13,7 +13,7 @@ # limitations under the License. import io import os -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Mapping, Optional, Sequence, Union import torch from torch import Tensor @@ -29,15 +29,17 @@ from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy from pytorch_lightning.strategies.launchers.xla import _XLALauncher +from pytorch_lightning.strategies.strategy import TBroadcast from pytorch_lightning.trainer.connectors.data_connector import DataConnector from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities import _TPU_AVAILABLE, find_shared_parameters, set_shared_parameters +from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.data import has_len from pytorch_lightning.utilities.distributed import ReduceOp from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.optimizer import optimizers_to_device from pytorch_lightning.utilities.rank_zero import rank_zero_only -from pytorch_lightning.utilities.types import _PATH, STEP_OUTPUT +from pytorch_lightning.utilities.types import _PATH, EVAL_DATALOADERS, STEP_OUTPUT, TRAIN_DATALOADERS if _TPU_AVAILABLE: import torch_xla.core.xla_env_vars as xenv @@ -58,7 +60,7 @@ class TPUSpawnStrategy(DDPSpawnStrategy): def __init__( self, accelerator: Optional["pl.accelerators.accelerator.Accelerator"] = None, - parallel_devices: Optional[List[int]] = None, + parallel_devices: Optional[List[torch.device]] = None, checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None, debug: bool = False, @@ -72,6 +74,7 @@ def __init__( precision_plugin=precision_plugin, start_method="fork", ) + self._checkpoint_io: Optional[CheckpointIO] self.debug = debug self._launched = False @@ -95,17 +98,16 @@ def root_device(self) -> torch.device: return xm.xla_device() @staticmethod - def _validate_dataloader(dataloaders: Union[List[DataLoader], DataLoader]) -> None: - if not isinstance(dataloaders, list): - dataloaders = [dataloaders] - - for dataloader in dataloaders: + def _validate_dataloader(dataloaders: Union[TRAIN_DATALOADERS, EVAL_DATALOADERS]) -> None: + def check_has_len(dataloader: DataLoader) -> None: if not has_len(dataloader): raise MisconfigurationException( "TPUs do not currently support IterableDataset objects, the dataset must implement `__len__`." " HINT: You can mock the length on your dataset to bypass this MisconfigurationException." 
) + apply_to_collection(dataloaders, dtype=object, wrong_dtype=(Sequence, Mapping), function=check_has_len) + @staticmethod def _validate_patched_dataloaders(model: "pl.LightningModule") -> None: """Validate and fail fast if the dataloaders were passed directly to fit.""" @@ -118,24 +120,29 @@ def _validate_patched_dataloaders(model: "pl.LightningModule") -> None: ) for source in sources: if not source.is_module(): + assert source.instance is not None + assert not isinstance(source.instance, (pl.LightningModule, pl.LightningDataModule)) TPUSpawnStrategy._validate_dataloader(source.instance) - def connect(self, model: "pl.LightningModule") -> None: + def connect(self, model: "pl.LightningModule") -> None: # type: ignore TPUSpawnStrategy._validate_patched_dataloaders(model) self.wrapped_model = xmp.MpModelWrapper(LightningDistributedModule(model)) return super().connect(model) - def _configure_launcher(self): + def _configure_launcher(self) -> None: self._launcher = _XLALauncher(self) def setup(self, trainer: "pl.Trainer") -> None: + assert self.accelerator self.accelerator.setup(trainer) if self.debug: os.environ["PT_XLA_DEBUG"] = "1" + assert self.model shared_params = find_shared_parameters(self.model) self.model_to_device() + assert isinstance(self.model.module, Module) set_shared_parameters(self.model.module, shared_params) self.setup_precision_plugin() @@ -143,7 +150,7 @@ def setup(self, trainer: "pl.Trainer") -> None: self.setup_optimizers(trainer) optimizers_to_device(self.optimizers, self.root_device) - def _setup_model(self, model: Module) -> Module: + def _setup_model(self, model: Module) -> Module: # type: ignore return model @property @@ -168,11 +175,11 @@ def configure_ddp(self) -> None: def model_to_device(self) -> None: self.model = self.wrapped_model.to(self.root_device) - def barrier(self, name: Optional[str] = None) -> None: + def barrier(self, name: Optional[str] = None, *args: Any, **kwargs: Any) -> None: if self.is_distributed: rendezvous(name) - def broadcast(self, obj: object, src: int = 0) -> object: + def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast: if not self.is_distributed: return obj buffer = io.BytesIO() @@ -184,7 +191,9 @@ def broadcast(self, obj: object, src: int = 0) -> object: obj = torch.load(buffer) return obj - def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None): + def reduce( + self, output: Union[Tensor, Any], group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = None + ) -> Tensor: if not isinstance(output, Tensor): output = torch.tensor(output, device=self.root_device) @@ -203,20 +212,23 @@ def reduce(self, output, group: Optional[Any] = None, reduce_op: Optional[Union[ return output - def _worker_setup(self, process_idx: int): + def _worker_setup(self, process_idx: int) -> None: self._launched = True self.set_world_ranks(process_idx) rank_zero_only.rank = self.global_rank - def validation_step(self, *args, **kwargs) -> Optional[STEP_OUTPUT]: + def validation_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: + assert self.model is not None with self.precision_plugin.val_step_context(): return self.model(*args, **kwargs) - def test_step(self, *args, **kwargs) -> Optional[STEP_OUTPUT]: + def test_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: + assert self.model is not None with self.precision_plugin.test_step_context(): return self.model(*args, **kwargs) - def predict_step(self, *args, **kwargs) -> STEP_OUTPUT: + def 
predict_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: + assert self.model is not None with self.precision_plugin.predict_step_context(): return self.model(*args, **kwargs) diff --git a/src/pytorch_lightning/trainer/connectors/data_connector.py b/src/pytorch_lightning/trainer/connectors/data_connector.py index 7831316a98ae1..e1aca404722db 100644 --- a/src/pytorch_lightning/trainer/connectors/data_connector.py +++ b/src/pytorch_lightning/trainer/connectors/data_connector.py @@ -516,7 +516,7 @@ def is_defined(self) -> bool: return not self.is_module() or is_overridden(self.name, self.instance) def is_module(self) -> bool: - """Returns whether the the DataLoader source is a LightningModule or a LightningDataModule. + """Returns whether the DataLoader source is a LightningModule or a LightningDataModule. It does not check whether ``*_dataloader`` methods are actually overridden. """ diff --git a/src/pytorch_lightning/utilities/apply_func.py b/src/pytorch_lightning/utilities/apply_func.py index cfeb48c423332..8729520ee9d96 100644 --- a/src/pytorch_lightning/utilities/apply_func.py +++ b/src/pytorch_lightning/utilities/apply_func.py @@ -76,7 +76,7 @@ def apply_to_collection( dtype: Union[type, Any, Tuple[Union[type, Any]]], function: Callable, *args: Any, - wrong_dtype: Optional[Union[type, Tuple[type]]] = None, + wrong_dtype: Optional[Union[type, Tuple[type, ...]]] = None, include_none: bool = True, **kwargs: Any, ) -> Any: From af07e75ab9364636bb559facd469e5407c922cf6 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 2 Aug 2022 19:06:32 +0900 Subject: [PATCH 060/230] Fix MPS availability check (#13947) Co-authored-by: Rohit Gupta --- src/pytorch_lightning/accelerators/mps.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/accelerators/mps.py b/src/pytorch_lightning/accelerators/mps.py index 3a7178f0623c2..20a2e609fa54b 100644 --- a/src/pytorch_lightning/accelerators/mps.py +++ b/src/pytorch_lightning/accelerators/mps.py @@ -24,7 +24,7 @@ # For using the `MPSAccelerator`, user's machine should have `torch>=1.12`, Metal programming framework and # the ARM-based Apple Silicon processors. -_MPS_AVAILABLE = _TORCH_GREATER_EQUAL_1_12 and torch.backends.mps.is_available() and platform.platform() == "arm" +_MPS_AVAILABLE = _TORCH_GREATER_EQUAL_1_12 and torch.backends.mps.is_available() and platform.processor() == "arm" class MPSAccelerator(Accelerator): From ff2e3296a00426f95a05d37b93fdd8175bbb7190 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 2 Aug 2022 13:28:49 +0200 Subject: [PATCH 061/230] Update version to 1.7.0 for release (#13977) --- src/pytorch_lightning/__about__.py | 2 +- src/pytorch_lightning/__version__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pytorch_lightning/__about__.py b/src/pytorch_lightning/__about__.py index 2cd55565de9ba..6d09c5264e1ab 100644 --- a/src/pytorch_lightning/__about__.py +++ b/src/pytorch_lightning/__about__.py @@ -13,7 +13,7 @@ # limitations under the License. import time -# __version__ = "1.7.0dev" +# __version__ = "1.7.0" __author__ = "Lightning AI et al." 
__author_email__ = "pytorch@lightning.ai" __license__ = "Apache-2.0" diff --git a/src/pytorch_lightning/__version__.py b/src/pytorch_lightning/__version__.py index 748f8f4eaea0c..a55413d1549b4 100644 --- a/src/pytorch_lightning/__version__.py +++ b/src/pytorch_lightning/__version__.py @@ -1 +1 @@ -version = "1.7.0rc1" +version = "1.7.0" From f576ed3bbda95a5045edacc49146a3f1cdcd892a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 2 Aug 2022 13:34:12 +0200 Subject: [PATCH 062/230] Fix resuming the tqdm progress bar (#13962) --- src/pytorch_lightning/CHANGELOG.md | 5 ++++- .../callbacks/progress/tqdm_progress.py | 22 ++++++++++--------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index b4a236b846dab..c8428acd3d06b 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -408,6 +408,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed Python 3.10 compatibility for truncated back-propagation through time (TBPTT) ([#13973](https://github.com/Lightning-AI/lightning/pull/13973)) +- Fixed `TQDMProgressBar` reset and update to show correct time estimation (2/2) ([#13962](https://github.com/Lightning-AI/lightning/pull/13962)) + + ## [1.6.5] - 2022-07-13 @@ -463,7 +466,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed `fuse_modules` to be qat-aware for `torch>=1.11` ([#12891](https://github.com/Lightning-AI/lightning/pull/12891)) - Enforced eval shuffle warning only for default samplers in DataLoader ([#12653](https://github.com/Lightning-AI/lightning/pull/12653)) - Enable mixed precision in `DDPFullyShardedStrategy` when `precision=16` ([#12965](https://github.com/Lightning-AI/lightning/pull/12965)) -- Fixed `TQDMProgressBar` reset and update to show correct time estimation ([#12889](https://github.com/Lightning-AI/lightning/pull/12889)) +- Fixed `TQDMProgressBar` reset and update to show correct time estimation (1/2) ([#12889](https://github.com/Lightning-AI/lightning/pull/12889)) - Fixed fit loop restart logic to enable resume using the checkpoint ([#12821](https://github.com/Lightning-AI/lightning/pull/12821)) diff --git a/src/pytorch_lightning/callbacks/progress/tqdm_progress.py b/src/pytorch_lightning/callbacks/progress/tqdm_progress.py index ff203c666216f..4911c4a4a697f 100644 --- a/src/pytorch_lightning/callbacks/progress/tqdm_progress.py +++ b/src/pytorch_lightning/callbacks/progress/tqdm_progress.py @@ -254,12 +254,13 @@ def on_train_start(self, *_: Any) -> None: def on_train_epoch_start(self, trainer: "pl.Trainer", *_: Any) -> None: total_batches = self.total_batches_current_epoch self.main_progress_bar.reset(convert_inf(total_batches)) + self.main_progress_bar.initial = 0 self.main_progress_bar.set_description(f"Epoch {trainer.current_epoch}") def on_train_batch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule", *_: Any) -> None: current = self.train_batch_idx + self._val_processed if self._should_update(current, self.main_progress_bar.total): - _update_n(self.main_progress_bar, current, self.refresh_rate) + _update_n(self.main_progress_bar, current) self.main_progress_bar.set_postfix(self.get_metrics(trainer, pl_module)) def on_train_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: @@ -280,16 +281,17 @@ def on_validation_batch_start( return 
self.val_progress_bar.reset(convert_inf(self.total_val_batches_current_dataloader)) + self.val_progress_bar.initial = 0 desc = self.sanity_check_description if trainer.sanity_checking else self.validation_description self.val_progress_bar.set_description(f"{desc} DataLoader {dataloader_idx}") def on_validation_batch_end(self, trainer: "pl.Trainer", *_: Any) -> None: if self._should_update(self.val_batch_idx, self.val_progress_bar.total): - _update_n(self.val_progress_bar, self.val_batch_idx, self.refresh_rate) + _update_n(self.val_progress_bar, self.val_batch_idx) current = self.train_batch_idx + self._val_processed if trainer.state.fn == "fit" and self._should_update(current, self.main_progress_bar.total): - _update_n(self.main_progress_bar, current, self.refresh_rate) + _update_n(self.main_progress_bar, current) def on_validation_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: if self._main_progress_bar is not None and trainer.state.fn == "fit": @@ -307,11 +309,12 @@ def on_test_batch_start( return self.test_progress_bar.reset(convert_inf(self.total_test_batches_current_dataloader)) + self.test_progress_bar.initial = 0 self.test_progress_bar.set_description(f"{self.test_description} DataLoader {dataloader_idx}") def on_test_batch_end(self, *_: Any) -> None: if self._should_update(self.test_batch_idx, self.test_progress_bar.total): - _update_n(self.test_progress_bar, self.test_batch_idx, self.refresh_rate) + _update_n(self.test_progress_bar, self.test_batch_idx) def on_test_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: self.test_progress_bar.close() @@ -327,11 +330,12 @@ def on_predict_batch_start( return self.predict_progress_bar.reset(convert_inf(self.total_predict_batches_current_dataloader)) + self.predict_progress_bar.initial = 0 self.predict_progress_bar.set_description(f"{self.predict_description} DataLoader {dataloader_idx}") def on_predict_batch_end(self, *_: Any) -> None: if self._should_update(self.predict_batch_idx, self.predict_progress_bar.total): - _update_n(self.predict_progress_bar, self.predict_batch_idx, self.refresh_rate) + _update_n(self.predict_progress_bar, self.predict_batch_idx) def on_predict_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: self.predict_progress_bar.close() @@ -375,9 +379,7 @@ def convert_inf(x: Optional[Union[int, float]]) -> Optional[Union[int, float]]: return x -def _update_n(bar: _tqdm, current: int, refresh_rate: int) -> None: +def _update_n(bar: _tqdm, value: int) -> None: if not bar.disable: - total = bar.total - leftover = current % refresh_rate - advance = leftover if (current == total and leftover != 0) else refresh_rate - bar.update(advance) + bar.n = value + bar.refresh() From d2c086b04ad14ec5d6cf63e9487381a96eee1733 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 2 Aug 2022 15:06:29 +0200 Subject: [PATCH 063/230] Prepare changelog for 1.7 release (#13979) * Prepare changelog for 1.7.0 release * update links * fix conflicts Co-authored-by: Justus Schock <12886177+justusschock@users.noreply.github.com> --- src/pytorch_lightning/CHANGELOG.md | 263 +---------------------------- 1 file changed, 4 insertions(+), 259 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index c8428acd3d06b..89fa726922a40 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -5,413 +5,158 @@ All notable changes to this project will be documented in this file. 
The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). -## [1.7.0] - 2022-MM-DD +## [1.7.0] - 2022-08-02 ### Added - Added ``ServableModule`` and its associated callback called ``ServableModuleValidator`` to ensure the model can served ([#13614](https://github.com/Lightning-AI/lightning/pull/13614)) - - Converted validation loop config warnings to `PossibleUserWarning` ([#13377](https://github.com/Lightning-AI/lightning/pull/13377)) - - - Added a flag named `log_rank_zero_only` to `EarlyStopping` to disable logging to non-zero rank processes ([#13233](https://github.com/Lightning-AI/lightning/pull/13233)) - - - Added support for reloading the last checkpoint saved by passing `ckpt_path="last"` ([#12816](https://github.com/Lightning-AI/lightning/pull/12816)) - - - Added `LightningDataModule.load_from_checkpoint` to support loading datamodules directly from checkpoint ([#12550](https://github.com/Lightning-AI/lightning/pull/12550)) - - - Added a friendly error message when attempting to call `Trainer.save_checkpoint()` without a model attached ([#12772](https://github.com/Lightning-AI/lightning/pull/12772)) - - - Added a friendly error message when attempting to use `DeepSpeedStrategy` on unsupported accelerators ([#12699](https://github.com/Lightning-AI/lightning/pull/12699)) - - - Enabled `torch.inference_mode` for evaluation and prediction ([#12715](https://github.com/Lightning-AI/lightning/pull/12715)) - - - Added support for setting `val_check_interval` to a value higher than the amount of training batches when `check_val_every_n_epoch=None` ([#11993](https://github.com/Lightning-AI/lightning/pull/11993)) - - - Include the `pytorch_lightning` version as a header in the CLI config files ([#12532](https://github.com/Lightning-AI/lightning/pull/12532)) - - - Added support for `Callback` registration through entry points ([#12739](https://github.com/Lightning-AI/lightning/pull/12739)) - - - Added support for `Trainer(deterministic="warn")` to warn instead of fail when a non-deterministic operation is encountered ([#12588](https://github.com/Lightning-AI/lightning/pull/12588)) - - - Added profiling to the loops' dataloader `__next__` calls ([#12124](https://github.com/Lightning-AI/lightning/pull/12124)) - - Hivemind Strategy * Added `CollaborativeStrategy` ([#12842](https://github.com/Lightning-AI/lightning/pull/12842)) * Renamed `CollaborativeStrategy` to `HivemindStrategy` ([#13388](https://github.com/Lightning-AI/lightning/pull/13388)) * Removed unnecessary endpoint logic, renamed `collaborative` to `hivemind` ([#13392](https://github.com/Lightning-AI/lightning/pull/13392)) - - Include a version suffix for new "last" checkpoints of later runs in the same directory ([#12902](https://github.com/Lightning-AI/lightning/pull/12902)) - - - Show a better error message when a Metric that does not return a Tensor is logged ([#13164](https://github.com/Lightning-AI/lightning/pull/13164)) - - - Added missing `predict_dataset` argument in `LightningDataModule.from_datasets` to create predict dataloaders ([#12942](https://github.com/Lightning-AI/lightning/pull/12942)) - - - Added class name prefix to metrics logged by `DeviceStatsMonitor` ([#12228](https://github.com/Lightning-AI/lightning/pull/12228)) - - - Automatically wrap custom samplers under a distributed environment by using `DistributedSamplerWrapper` ([#12959](https://github.com/Lightning-AI/lightning/pull/12959)) - - - Added profiling of `LightningDataModule` hooks 
([#12971](https://github.com/Lightning-AI/lightning/pull/12971)) - - - Added Native FSDP Strategy ([#12447](https://github.com/Lightning-AI/lightning/pull/12447)) - - - Added breaking of lazy graph across training, validation, test and predict steps when training with habana accelerators to ensure better performance ([#12938](https://github.com/Lightning-AI/lightning/pull/12938)) - - - Added `Checkpoint` class to inherit from ([#13024](https://github.com/Lightning-AI/lightning/pull/13024)) - - - Added CPU metric tracking to `DeviceStatsMonitor` ([#11795](https://github.com/Lightning-AI/lightning/pull/11795)) - - - Added `teardown()` method to `Accelerator` ([#11935](https://github.com/Lightning-AI/lightning/pull/11935)) - - - Added support for using custom Trainers that don't include callbacks using the CLI ([#13138](https://github.com/Lightning-AI/lightning/pull/13138)) - - - Added a `timeout` argument to `DDPStrategy` and `DDPSpawnStrategy`. ([#13244](https://github.com/Lightning-AI/lightning/pull/13244), [#13383](https://github.com/Lightning-AI/lightning/pull/13383)) - - - Added `XLAEnvironment` cluster environment plugin ([#11330](https://github.com/Lightning-AI/lightning/pull/11330)) - - - Added logging messages to notify when `FitLoop` stopping conditions are met ([#9749](https://github.com/Lightning-AI/lightning/pull/9749)) - - - Added support for calling unknown methods with `DummyLogger` ([#13224](https://github.com/Lightning-AI/lightning/pull/13224) - - - Added support for recursively setting the `Trainer` reference for ensembles of `LightningModule`s ([#13638](https://github.com/Lightning-AI/lightning/pull/13638) - - - Added Apple Silicon Support via `MPSAccelerator` ([#13123](https://github.com/Lightning-AI/lightning/pull/13123)) - - - Added support for DDP Fork ([#13405](https://github.com/Lightning-AI/lightning/pull/13405)) - - - Added support for async checkpointing ([#13658](https://github.com/Lightning-AI/lightning/pull/13658)) - - - Added support for HPU Device stats monitor ([#13819](https://github.com/Lightning-AI/lightning/pull/13819)) - ### Changed - `accelerator="gpu"` now automatically selects an available GPU backend (CUDA and MPS currently) ([#13642](https://github.com/Lightning-AI/lightning/pull/13642)) - - - Enable validation during overfitting ([#12527](https://github.com/Lightning-AI/lightning/pull/12527)) - - - Added dataclass support to `extract_batch_size` ([#12573](https://github.com/Lightning-AI/lightning/pull/12573)) - - - Changed checkpoints save path in the case of one logger and user-provided weights_save_path from `weights_save_path/name/version/checkpoints` to `weights_save_path/checkpoints` ([#12372](https://github.com/Lightning-AI/lightning/pull/12372)) - - - Changed checkpoints save path in the case of multiple loggers and user-provided weights_save_path from `weights_save_path/name1_name2/version1_version2/checkpoints` to `weights_save_path/checkpoints` ([#12372](https://github.com/Lightning-AI/lightning/pull/12372)) - - - Marked `swa_lrs` argument in `StochasticWeightAveraging` callback as required ([#12556](https://github.com/Lightning-AI/lightning/pull/12556)) - - - `LightningCLI`'s shorthand notation changed to use jsonargparse native feature ([#12614](https://github.com/Lightning-AI/lightning/pull/12614)) - - - `LightningCLI` changed to use jsonargparse native support for list append ([#13129](https://github.com/Lightning-AI/lightning/pull/13129)) - - - Changed `seed_everything_default` argument in the `LightningCLI` to type `Union[bool, 
int]`. If set to `True` a seed is automatically generated for the parser argument `--seed_everything`. ([#12822](https://github.com/Lightning-AI/lightning/pull/12822), [#13110](https://github.com/Lightning-AI/lightning/pull/13110)) - - - Make positional arguments required for classes passed into the `add_argparse_args` function. ([#12504](https://github.com/Lightning-AI/lightning/pull/12504)) - - - Raise an error if there are insufficient training batches when using a float value of `limit_train_batches` ([#12885](https://github.com/Lightning-AI/lightning/pull/12885)) - - - `DataLoader` instantiated inside a `*_dataloader` hook will not set the passed arguments as attributes anymore ([#12981](https://github.com/Lightning-AI/lightning/pull/12981)) - - - When a multi-element tensor is logged, an error is now raised instead of silently taking the mean of all elements ([#13164](https://github.com/Lightning-AI/lightning/pull/13164)) - - - The `WandbLogger` will now use the run name in the logs folder if it is provided, and otherwise the project name ([#12604](https://github.com/Lightning-AI/lightning/pull/12604)) - - - Enabled using any Sampler in distributed environment in Lite ([#13646](https://github.com/Lightning-AI/lightning/pull/13646)) - - - Raised a warning instead of forcing `sync_dist=True` on epoch end ([13364](https://github.com/Lightning-AI/lightning/pull/13364)) - - - Updated `val_check_interval`(int) to consider total train batches processed instead of `_batches_that_stepped` for validation check during training ([#12832](https://github.com/Lightning-AI/lightning/pull/12832) - - - Updated Habana Accelerator's `auto_device_count`, `is_available` & `get_device_name` methods based on the latest torch habana package ([#13423](https://github.com/Lightning-AI/lightning/pull/13423)) - - - Disallowed using `BatchSampler` when running on multiple IPUs ([#13854](https://github.com/Lightning-AI/lightning/pull/13854)) - ### Deprecated - Deprecated `pytorch_lightning.accelerators.gpu.GPUAccelerator` in favor of `pytorch_lightning.accelerators.cuda.CUDAAccelerator` ([#13636](https://github.com/Lightning-AI/lightning/pull/13636)) - - - Deprecated `pytorch_lightning.loggers.base.LightningLoggerBase` in favor of `pytorch_lightning.loggers.logger.Logger`, and deprecated `pytorch_lightning.loggers.base` in favor of `pytorch_lightning.loggers.logger` ([#120148](https://github.com/Lightning-AI/lightning/pull/12014)) - - - Deprecated `pytorch_lightning.callbacks.base.Callback` in favor of `pytorch_lightning.callbacks.callback.Callback` ([#13031](https://github.com/Lightning-AI/lightning/pull/13031)) - - - Deprecated `num_processes`, `gpus`, `tpu_cores,` and `ipus` from the `Trainer` constructor in favor of using the `accelerator` and `devices` arguments ([#11040](https://github.com/Lightning-AI/lightning/pull/11040)) - - - Deprecated setting `LightningCLI(seed_everything_default=None)` in favor of `False` ([#12804](https://github.com/Lightning-AI/lightning/issues/12804)). 
- - - Deprecated `pytorch_lightning.core.lightning.LightningModule` in favor of `pytorch_lightning.core.module.LightningModule` ([#12740](https://github.com/Lightning-AI/lightning/pull/12740)) - - - Deprecated `pytorch_lightning.loops.base.Loop` in favor of `pytorch_lightning.loops.loop.Loop` ([#13043](https://github.com/Lightning-AI/lightning/pull/13043)) - - - Deprecated `Trainer.reset_train_val_dataloaders()` in favor of `Trainer.reset_{train,val}_dataloader` ([#12184](https://github.com/Lightning-AI/lightning/pull/12184)) - - - Deprecated LightningCLI's registries in favor of importing the respective package ([#13221](https://github.com/Lightning-AI/lightning/pull/13221)) - - - Deprecated public utilities in `pytorch_lightning.utilities.cli.LightningCLI` in favor of equivalent copies in `pytorch_lightning.cli.LightningCLI` ([#13767](https://github.com/Lightning-AI/lightning/pull/13767)) - - - Deprecated `pytorch_lightning.profiler` in favor of `pytorch_lightning.profilers` ([#12308](https://github.com/Lightning-AI/lightning/pull/12308)) - ### Removed - Removed deprecated `IndexBatchSamplerWrapper.batch_indices` ([#13565](https://github.com/Lightning-AI/lightning/pull/13565)) - - - Removed the deprecated `LightningModule.add_to_queue` and `LightningModule.get_from_queue` method ([#13600](https://github.com/Lightning-AI/lightning/pull/13600)) - - - Removed deprecated `pytorch_lightning.core.decorators.parameter_validation` from `decorators` ([#13514](https://github.com/Lightning-AI/lightning/pull/13514)) - - - Removed the deprecated `Logger.close` method ([#13149](https://github.com/Lightning-AI/lightning/pull/13149)) - - - Removed the deprecated `weights_summary` argument from the `Trainer` constructor ([#13070](https://github.com/Lightning-AI/lightning/pull/13070)) - - - Removed the deprecated `flush_logs_every_n_steps` argument from the `Trainer` constructor ([#13074](https://github.com/Lightning-AI/lightning/pull/13074)) - - - Removed the deprecated `process_position` argument from the `Trainer` constructor ([13071](https://github.com/Lightning-AI/lightning/pull/13071)) - - - Removed the deprecated `checkpoint_callback` argument from the `Trainer` constructor ([#13027](https://github.com/Lightning-AI/lightning/pull/13027)) - - - Removed the deprecated `on_{train,val,test,predict}_dataloader` hooks from the `LightningModule` and `LightningDataModule` ([#13033](https://github.com/Lightning-AI/lightning/pull/13033)) - - - Removed the deprecated `TestTubeLogger` ([#12859](https://github.com/Lightning-AI/lightning/pull/12859)) - - - Removed the deprecated `pytorch_lightning.core.memory.LayerSummary` and `pytorch_lightning.core.memory.ModelSummary` ([#12593](https://github.com/Lightning-AI/lightning/pull/12593)) - - - Removed the deprecated `summarize` method from the `LightningModule` ([#12559](https://github.com/Lightning-AI/lightning/pull/12559)) - - - Removed the deprecated `model_size` property from the `LightningModule` class ([#12641](https://github.com/Lightning-AI/lightning/pull/12641)) - - - Removed the deprecated `stochastic_weight_avg` argument from the `Trainer` constructor ([#12535](https://github.com/Lightning-AI/lightning/pull/12535)) - - - Removed the deprecated `progress_bar_refresh_rate` argument from the `Trainer` constructor ([#12514](https://github.com/Lightning-AI/lightning/pull/12514)) - - - Removed the deprecated `prepare_data_per_node` argument from the `Trainer` constructor ([#12536](https://github.com/Lightning-AI/lightning/pull/12536)) - - - Removed the 
deprecated `pytorch_lightning.core.memory.{get_gpu_memory_map,get_memory_profile}` ([#12659](https://github.com/Lightning-AI/lightning/pull/12659)) - - - Removed the deprecated `terminate_on_nan` argument from the `Trainer` constructor ([#12553](https://github.com/Lightning-AI/lightning/pull/12553)) - - - Removed the deprecated `XLAStatsMonitor` callback ([#12688](https://github.com/Lightning-AI/lightning/pull/12688)) - - - Remove deprecated `pytorch_lightning.callbacks.progress.progress` ([#12658](https://github.com/Lightning-AI/lightning/pull/12658)) - - - Removed the deprecated `dim` and `size` arguments from the `LightningDataModule` constructor([#12780](https://github.com/Lightning-AI/lightning/pull/12780)) - - - Removed the deprecated `train_transforms` argument from the `LightningDataModule` constructor([#12662](https://github.com/Lightning-AI/lightning/pull/12662)) - - - Removed the deprecated `log_gpu_memory` argument from the `Trainer` constructor ([#12657](https://github.com/Lightning-AI/lightning/pull/12657)) - - - Removed the deprecated automatic logging of GPU stats by the logger connector ([#12657](https://github.com/Lightning-AI/lightning/pull/12657)) - - - Removed deprecated `GPUStatsMonitor` callback ([#12554](https://github.com/Lightning-AI/lightning/pull/12554)) - - - Removed support for passing strategy names or strategy instances to the accelerator Trainer argument ([#12696](https://github.com/Lightning-AI/lightning/pull/12696)) - - - Removed support for passing strategy names or strategy instances to the plugins Trainer argument ([#12700](https://github.com/Lightning-AI/lightning/pull/12700)) - - - Removed the deprecated `val_transforms` argument from the `LightningDataModule` constructor ([#12763](https://github.com/Lightning-AI/lightning/pull/12763)) - - - Removed the deprecated `test_transforms` argument from the `LightningDataModule` constructor ([#12773](https://github.com/Lightning-AI/lightning/pull/12773)) - - - Removed deprecated `Trainer(max_steps=None)` ([#13591](https://github.com/Lightning-AI/lightning/pull/13591)) - - - Removed deprecated `dataloader_idx` argument from `on_train_batch_start/end` hooks `Callback` and `LightningModule` ([#12769](https://github.com/Lightning-AI/lightning/pull/12769), [#12977](https://github.com/Lightning-AI/lightning/pull/12977)) - - - Removed deprecated `get_progress_bar_dict` property from `LightningModule` ([#12839](https://github.com/Lightning-AI/lightning/pull/12839)) - - - Removed sanity check for multi-optimizer support with habana backends ([#13217](https://github.com/Lightning-AI/lightning/pull/13217)) - - - Removed the need to explicitly load habana module ([#13338](https://github.com/Lightning-AI/lightning/pull/13338)) - - - Removed the deprecated `Strategy.post_dispatch()` hook ([#13461](https://github.com/Lightning-AI/lightning/pull/13461)) - - - Removed deprecated `pytorch_lightning.callbacks.lr_monitor.LearningRateMonitor.lr_sch_names` ([#13353](https://github.com/Lightning-AI/lightning/pull/13353)) - - - Removed deprecated `Trainer.slurm_job_id` in favor of `SLURMEnvironment.job_id` ([#13459](https://github.com/Lightning-AI/lightning/pull/13459)) - - - Removed support for the `DDP2Strategy` ([#12705](https://github.com/Lightning-AI/lightning/pull/12705)) - - - Removed deprecated `LightningDistributed` ([#13549](https://github.com/Lightning-AI/lightning/pull/13549)) - - - Removed deprecated ClusterEnvironment properties `master_address` and `master_port` in favor of `main_address` and `main_port` 
([#13458](https://github.com/Lightning-AI/lightning/pull/13458)) - - - Removed deprecated ClusterEnvironment methods `KubeflowEnvironment.is_using_kubelfow()`, `LSFEnvironment.is_using_lsf()` and `TorchElasticEnvironment.is_using_torchelastic()` in favor of the `detect()` method ([#13458](https://github.com/Lightning-AI/lightning/pull/13458)) - - - Removed deprecated `Callback.on_keyboard_interrupt` ([#13438](https://github.com/Lightning-AI/lightning/pull/13438)) - - - Removed deprecated `LightningModule.on_post_move_to_device` ([#13548](https://github.com/Lightning-AI/lightning/pull/13548)) - - - Removed `TPUSpawnStrategy.{tpu_local_core_rank,tpu_global_core_rank}` attributes in favor of `TPUSpawnStrategy.{local_rank,global_rank}` ([#11163](https://github.com/Lightning-AI/lightning/pull/11163)) - - - Removed `SingleTPUStrategy.{tpu_local_core_rank,tpu_global_core_rank}` attributes in favor of `SingleTPUStrategy.{local_rank,global_rank}`([#11163](https://github.com/Lightning-AI/lightning/pull/11163)) - - ### Fixed - - Improved support for custom `DataLoader`s when instantiated in `*_dataloader` hook ([#12981](https://github.com/Lightning-AI/lightning/pull/12981)) - - Allowed custom `BatchSampler`s when instantiated in `*_dataloader` hook [#13640](https://github.com/Lightning-AI/lightning/pull/13640)) - - - Fixed an issue with unsupported torch.inference_mode() on hpu backends by making it use no_grad ([#13014](https://github.com/Lightning-AI/lightning/pull/13014)) - - - The model wrapper returned by `LightningLite.setup()` now properly supports pass-through when looking up attributes ([#12597](https://github.com/Lightning-AI/lightning/pull/12597)) - - - Fixed issue where the CLI fails with certain torch objects ([#13153](https://github.com/Lightning-AI/lightning/pull/13153)) - - - Fixed ``LightningCLI`` signature parameter resolving for some lightning classes ([#13283](https://github.com/Lightning-AI/lightning/pull/13283)) - - - Fixed Model Summary when using DeepSpeed Stage 3 ([#13427](https://github.com/Lightning-AI/lightning/pull/13427)) - - - Fixed `pytorch_lightning.utilities.distributed.gather_all_tensors` to handle tensors of different dimensions ([#12630](https://github.com/Lightning-AI/lightning/pull/12630)) - - - Fixed the input validation for the accelerator Trainer argument when passed as a string ([#13417](https://github.com/Lightning-AI/lightning/pull/13417)) - - - Fixed `Trainer.predict(return_predictions=False)` to track prediction's batch_indices ([#13629](https://github.com/Lightning-AI/lightning/pull/13629)) - - - Fixed and issue that prevented setting a custom `CheckpointIO` plugin with strategies ([#13785](https://github.com/Lightning-AI/lightning/pull/13785)) - - - Fixed main progress bar counter when `val_check_interval=int` and `check_val_every_n_epoch=None` ([#12832](https://github.com/Lightning-AI/lightning/pull/12832) - - - Improved support for custom `ReduceLROnPlateau` scheduler if `reduce_on_plateau` is set by the user in scheduler config ([#13838](https://github.com/Lightning-AI/lightning/pull/13838)) - - - Used `global_step` while restoring logging step for old checkpoints ([#13645](https://github.com/Lightning-AI/lightning/pull/13645)) - - - When training with `precision=16` on IPU, the cast has been moved off the IPU onto the host, making the copies from host to IPU cheaper ([#13880](https://github.com/Lightning-AI/lightning/pull/13880)) - - - Fixed error handling in learning rate finder when not enough data points are available to give a good suggestion 
([#13845](https://github.com/Lightning-AI/lightning/pull/13845)) - - - Fixed an issue that caused the learning rate finder to set the model's learning rate to None when no suggestion was possible ([#13845](https://github.com/Lightning-AI/lightning/pull/13845)) - - - Fixed an issue causing deterministic algorighms and other globals to get reset in spawned processes ([#13921](https://github.com/Lightning-AI/lightning/pull/13921)) - - -- Fixed default `amp_level` for `DeepSpeedPrecisionPlugin` to `O2` ([#13897](https://github.com/PyTorchLightning/pytorch-lightning/pull/13897)) - - +- Fixed default `amp_level` for `DeepSpeedPrecisionPlugin` to `O2` ([#13897](https://github.com/Lightning-AI/lightning/pull/13897)) - Fixed Python 3.10 compatibility for truncated back-propagation through time (TBPTT) ([#13973](https://github.com/Lightning-AI/lightning/pull/13973)) - - - Fixed `TQDMProgressBar` reset and update to show correct time estimation (2/2) ([#13962](https://github.com/Lightning-AI/lightning/pull/13962)) - ## [1.6.5] - 2022-07-13 ### Fixed @@ -972,7 +717,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed `CombinedLoader` and `max_size_cycle` didn't receive a `DistributedSampler` ([#10374](https://github.com/Lightning-AI/lightning/pull/10374)) - Fixed an issue where class or init-only variables of dataclasses were passed to the dataclass constructor in `utilities.apply_to_collection` ([#9702](https://github.com/Lightning-AI/lightning/pull/9702)) -- Fixed `isinstance` not working with `init_meta_context`, materialized model not being moved to the device ([#10493](https://github.com/PyTorchLightning/metrics/pull/10493)) +- Fixed `isinstance` not working with `init_meta_context`, materialized model not being moved to the device ([#10493](https://github.com/Lightning-AI/lightning/pull/10493)) - Fixed an issue that prevented the Trainer to shutdown workers when execution is interrupted due to failure([#10463](https://github.com/Lightning-AI/lightning/pull/10463)) - Squeeze the early stopping monitor to remove empty tensor dimensions ([#10461](https://github.com/Lightning-AI/lightning/pull/10461)) - Fixed sampler replacement logic with `overfit_batches` to only replace the sample when `SequentialSampler` is not used ([#10486](https://github.com/Lightning-AI/lightning/pull/10486)) @@ -2991,7 +2736,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Using `.comet.config` file for `CometLogger` ([#1913](https://github.com/Lightning-AI/lightning/pull/1913)) - Updated hooks arguments - breaking for `setup` and `teardown` ([#2850](https://github.com/Lightning-AI/lightning/pull/2850)) - Using `gfile` to support remote directories ([#2164](https://github.com/Lightning-AI/lightning/pull/2164)) -- Moved optimizer creation after device placement for DDP backends ([#2904](https://github.com/PyTorchLightning/pytorch-lighting/pull/2904)) +- Moved optimizer creation after device placement for DDP backends ([#2904](https://github.com/Lightning-AI/lighting/pull/2904)) - Support `**DictConfig` for `hparam` serialization ([#2519](https://github.com/Lightning-AI/lightning/pull/2519)) - Removed callback metrics from test results obj ([#2994](https://github.com/Lightning-AI/lightning/pull/2994)) - Re-enabled naming metrics in ckpt name ([#3060](https://github.com/Lightning-AI/lightning/pull/3060)) From beb14563f4ba915d9cef06f7d727cc5437c6e9dd Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 2 Aug 2022 18:26:38 +0000 Subject: [PATCH 064/230] Bump pypa/gh-action-pypi-publish from 1.5.0 to 1.5.1 (#13954) Bumps [pypa/gh-action-pypi-publish](https://github.com/pypa/gh-action-pypi-publish) from 1.5.0 to 1.5.1. - [Release notes](https://github.com/pypa/gh-action-pypi-publish/releases) - [Commits](https://github.com/pypa/gh-action-pypi-publish/compare/v1.5.0...v1.5.1) --- updated-dependencies: - dependency-name: pypa/gh-action-pypi-publish dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Rohit Gupta Co-authored-by: Akihiro Nitta --- .github/workflows/events-nightly.yml | 2 +- .github/workflows/release-pypi.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/events-nightly.yml b/.github/workflows/events-nightly.yml index 3e955db3738d3..36907bf955999 100644 --- a/.github/workflows/events-nightly.yml +++ b/.github/workflows/events-nightly.yml @@ -38,7 +38,7 @@ jobs: # We do this, since failures on test.pypi aren't that bad - name: Publish to Test PyPI - uses: pypa/gh-action-pypi-publish@v1.5.0 + uses: pypa/gh-action-pypi-publish@v1.5.1 with: user: __token__ password: ${{ secrets.test_pypi_password }} diff --git a/.github/workflows/release-pypi.yml b/.github/workflows/release-pypi.yml index 7876f05cfe1e6..97c3b8eca77d1 100644 --- a/.github/workflows/release-pypi.yml +++ b/.github/workflows/release-pypi.yml @@ -202,7 +202,7 @@ jobs: # We do this, since failures on test.pypi aren't that bad - name: Publish to Test PyPI - uses: pypa/gh-action-pypi-publish@v1.5.0 + uses: pypa/gh-action-pypi-publish@v1.5.1 with: user: __token__ password: ${{ secrets.test_pypi_password }} @@ -210,7 +210,7 @@ jobs: verbose: true - name: Publish distribution 📦 to PyPI - uses: pypa/gh-action-pypi-publish@v1.5.0 + uses: pypa/gh-action-pypi-publish@v1.5.1 with: user: __token__ password: ${{ secrets.pypi_password }} From 2415834aaf4f215ddb0d618bf415405d670f9114 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 2 Aug 2022 14:30:41 -0400 Subject: [PATCH 065/230] Bump tj-actions/changed-files from 23 to 24 (#13956) Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 23 to 24. 
- [Release notes](https://github.com/tj-actions/changed-files/releases) - [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md) - [Commits](https://github.com/tj-actions/changed-files/compare/v23...v24) --- updated-dependencies: - dependency-name: tj-actions/changed-files dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/ci-app_block.yml | 2 +- .github/workflows/ci-pytorch_test-conda.yml | 2 +- .github/workflows/ci-pytorch_test-full.yml | 2 +- .github/workflows/ci-pytorch_test-slow.yml | 2 +- .github/workflows/ci_pr-gatekeeper.yml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci-app_block.yml b/.github/workflows/ci-app_block.yml index fa582e99acd8d..75fa50794b03b 100644 --- a/.github/workflows/ci-app_block.yml +++ b/.github/workflows/ci-app_block.yml @@ -12,7 +12,7 @@ jobs: - name: Get changed files using defaults id: changed-files - uses: tj-actions/changed-files@v23 + uses: tj-actions/changed-files@v24 - name: List all added files run: | diff --git a/.github/workflows/ci-pytorch_test-conda.yml b/.github/workflows/ci-pytorch_test-conda.yml index 65bee898a6345..777ec2af759a0 100644 --- a/.github/workflows/ci-pytorch_test-conda.yml +++ b/.github/workflows/ci-pytorch_test-conda.yml @@ -39,7 +39,7 @@ jobs: - name: Get changed files id: changed-files - uses: tj-actions/changed-files@v23.1 + uses: tj-actions/changed-files@v24 - name: Decide if the test should be skipped id: skip diff --git a/.github/workflows/ci-pytorch_test-full.yml b/.github/workflows/ci-pytorch_test-full.yml index 3e96dd22fe702..fb6916d1414fe 100644 --- a/.github/workflows/ci-pytorch_test-full.yml +++ b/.github/workflows/ci-pytorch_test-full.yml @@ -35,7 +35,7 @@ jobs: - name: Get changed files id: changed-files - uses: tj-actions/changed-files@v23.1 + uses: tj-actions/changed-files@v24 - name: Decide if the test should be skipped id: skip diff --git a/.github/workflows/ci-pytorch_test-slow.yml b/.github/workflows/ci-pytorch_test-slow.yml index 9de5687bba829..905f60aa85699 100644 --- a/.github/workflows/ci-pytorch_test-slow.yml +++ b/.github/workflows/ci-pytorch_test-slow.yml @@ -30,7 +30,7 @@ jobs: - name: Get changed files id: changed-files - uses: tj-actions/changed-files@v23.1 + uses: tj-actions/changed-files@v24 - name: Decide if the test should be skipped id: skip diff --git a/.github/workflows/ci_pr-gatekeeper.yml b/.github/workflows/ci_pr-gatekeeper.yml index 92215edd3c107..1e808c3397128 100644 --- a/.github/workflows/ci_pr-gatekeeper.yml +++ b/.github/workflows/ci_pr-gatekeeper.yml @@ -20,7 +20,7 @@ jobs: fetch-depth: "2" # To retrieve the preceding commit. 
- name: Get changed files using defaults id: changed-files - uses: tj-actions/changed-files@v23 + uses: tj-actions/changed-files@v24 - name: Determine changes id: touched run: | From 9651e420c8cbed025dd0629c9f347b6e7b9c0220 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Wed, 3 Aug 2022 13:48:22 +0530 Subject: [PATCH 066/230] Update changelog after v1.7.0 release (#13982) --- src/pytorch_lightning/CHANGELOG.md | 41 ++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 89fa726922a40..a95a88a420840 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -4,6 +4,47 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). +## [1.8.0] - 2022-MM-DD + +### Added + +- + + +- + + +### Changed + +- + + +- + + +### Deprecated + +- + + +- + + +### Removed + +- + + +- + + +### Fixed + +- + + +- + ## [1.7.0] - 2022-08-02 From ce025bf954f7293ea263f94e83cea67658b8549b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 3 Aug 2022 10:27:16 +0200 Subject: [PATCH 067/230] Lazy import check for hydra dependency (#13812) --- src/pytorch_lightning/loggers/neptune.py | 2 +- .../strategies/launchers/subprocess_script.py | 22 ++++++++++++------- src/pytorch_lightning/utilities/__init__.py | 2 -- src/pytorch_lightning/utilities/imports.py | 2 -- tests/tests_pytorch/models/test_hparams.py | 9 ++++---- 5 files changed, 19 insertions(+), 18 deletions(-) diff --git a/src/pytorch_lightning/loggers/neptune.py b/src/pytorch_lightning/loggers/neptune.py index 55b7cc46eb761..be61705289d49 100644 --- a/src/pytorch_lightning/loggers/neptune.py +++ b/src/pytorch_lightning/loggers/neptune.py @@ -39,7 +39,7 @@ from pytorch_lightning.utilities.rank_zero import rank_zero_only _NEPTUNE_AVAILABLE = _RequirementAvailable("neptune") -_NEPTUNE_GREATER_EQUAL_0_9 = _RequirementAvailable("neptune>=0.9.0") +_NEPTUNE_GREATER_EQUAL_0_9 = _RequirementAvailable("neptune-client>=0.9.0") if _NEPTUNE_AVAILABLE and _NEPTUNE_GREATER_EQUAL_0_9: diff --git a/src/pytorch_lightning/strategies/launchers/subprocess_script.py b/src/pytorch_lightning/strategies/launchers/subprocess_script.py index 5a8632fb87306..a51a109917f6e 100644 --- a/src/pytorch_lightning/strategies/launchers/subprocess_script.py +++ b/src/pytorch_lightning/strategies/launchers/subprocess_script.py @@ -23,11 +23,9 @@ import pytorch_lightning as pl from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.strategies.launchers.base import _Launcher -from pytorch_lightning.utilities import _HYDRA_AVAILABLE +from pytorch_lightning.utilities.imports import _RequirementAvailable -if _HYDRA_AVAILABLE: - from hydra.core.hydra_config import HydraConfig - from hydra.utils import get_original_cwd, to_absolute_path +_HYDRA_AVAILABLE = _RequirementAvailable("hydra") class _SubprocessScriptLauncher(_Launcher): @@ -108,13 +106,18 @@ def _call_children_scripts(self) -> None: # See https://docs.python.org/3/reference/import.html#main-spec if __main__.__spec__ is None: # pragma: no-cover # Script called as `python a/b/c.py` - # when user is using hydra find the absolute path - path_lib = os.path.abspath if not _HYDRA_AVAILABLE else to_absolute_path + if _HYDRA_AVAILABLE: + # when user is using hydra find the absolute path + from hydra.utils import to_absolute_path + + to_abs_path = 
to_absolute_path + else: + to_abs_path = os.path.abspath - # pull out the commands used to run the script and resolve the abs file path + # pull out the commands used to run the script and resolve the absolute file path command = sys.argv try: - full_path = path_lib(command[0]) + full_path = to_abs_path(command[0]) except Exception: full_path = os.path.abspath(command[0]) @@ -138,6 +141,9 @@ def _call_children_scripts(self) -> None: # if hydra is available and initialized, make sure to set the cwd correctly cwd: Optional[str] = None if _HYDRA_AVAILABLE: + from hydra.core.hydra_config import HydraConfig + from hydra.utils import get_original_cwd + if HydraConfig.initialized(): cwd = get_original_cwd() os_cwd = f'"{os.getcwd()}"' diff --git a/src/pytorch_lightning/utilities/__init__.py b/src/pytorch_lightning/utilities/__init__.py index ea0227eefb8c1..df5084dd85490 100644 --- a/src/pytorch_lightning/utilities/__init__.py +++ b/src/pytorch_lightning/utilities/__init__.py @@ -34,8 +34,6 @@ _HIVEMIND_AVAILABLE, _HOROVOD_AVAILABLE, _HPU_AVAILABLE, - _HYDRA_AVAILABLE, - _HYDRA_EXPERIMENTAL_AVAILABLE, _IPU_AVAILABLE, _IS_INTERACTIVE, _IS_WINDOWS, diff --git a/src/pytorch_lightning/utilities/imports.py b/src/pytorch_lightning/utilities/imports.py index 7784741ca87c1..67bf75be3c4d3 100644 --- a/src/pytorch_lightning/utilities/imports.py +++ b/src/pytorch_lightning/utilities/imports.py @@ -140,8 +140,6 @@ def __repr__(self) -> str: _HABANA_FRAMEWORK_AVAILABLE = _package_available("habana_frameworks") _HIVEMIND_AVAILABLE = _package_available("hivemind") _HOROVOD_AVAILABLE = _module_available("horovod.torch") -_HYDRA_AVAILABLE = _package_available("hydra") -_HYDRA_EXPERIMENTAL_AVAILABLE = _module_available("hydra.experimental") _KINETO_AVAILABLE = torch.profiler.kineto_available() _OMEGACONF_AVAILABLE = _package_available("omegaconf") _POPTORCH_AVAILABLE = _package_available("poptorch") diff --git a/tests/tests_pytorch/models/test_hparams.py b/tests/tests_pytorch/models/test_hparams.py index c064d0f8c055e..20bdfda5dc224 100644 --- a/tests/tests_pytorch/models/test_hparams.py +++ b/tests/tests_pytorch/models/test_hparams.py @@ -31,14 +31,12 @@ from pytorch_lightning.core.datamodule import LightningDataModule from pytorch_lightning.core.saving import load_hparams_from_yaml, save_hparams_to_yaml from pytorch_lightning.demos.boring_classes import BoringDataModule, BoringModel, RandomDataset -from pytorch_lightning.utilities import _HYDRA_EXPERIMENTAL_AVAILABLE, _OMEGACONF_AVAILABLE, AttributeDict, is_picklable +from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE, AttributeDict, is_picklable from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.imports import _RequirementAvailable from tests_pytorch.helpers.runif import RunIf from tests_pytorch.helpers.utils import no_warning_call -if _HYDRA_EXPERIMENTAL_AVAILABLE: - from hydra.experimental import compose, initialize - if _OMEGACONF_AVAILABLE: from omegaconf import Container, OmegaConf from omegaconf.dictconfig import DictConfig @@ -649,9 +647,10 @@ def test_model_with_fsspec_as_parameter(tmpdir): trainer.test() -@pytest.mark.skipif(not _HYDRA_EXPERIMENTAL_AVAILABLE, reason="Hydra experimental is not available") +@pytest.mark.skipif(_RequirementAvailable("hydra-core<1.1"), reason="Requires Hydra's Compose API") def test_model_save_hyper_parameters_interpolation_with_hydra(tmpdir): """This test relies on configuration saved under tests/models/conf/config.yaml.""" + from hydra import compose, 
initialize class TestHydraModel(BoringModel): def __init__(self, args_0, args_1, args_2, kwarg_1=None): From 8af85eeaafc4fe4ef11098a27f795416fe608c6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 3 Aug 2022 12:22:47 +0200 Subject: [PATCH 068/230] Ignore _notebooks when running flake8 (#13990) --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index 055af361c4bcb..1f2e17557310e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -67,6 +67,7 @@ exclude = *.egg build temp + _notebooks select = E,W,F doctests = True From dcb4dd55d9dedb2bd25f0cecf40d54b5fbc58b0b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 3 Aug 2022 15:06:07 +0200 Subject: [PATCH 069/230] Update docstrings for backward methods (#13886) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Rohit Gupta Co-authored-by: thomas chaton --- src/pytorch_lightning/plugins/precision/apex_amp.py | 1 + src/pytorch_lightning/plugins/precision/deepspeed.py | 10 ++++++++++ .../plugins/precision/precision_plugin.py | 6 +++++- src/pytorch_lightning/strategies/strategy.py | 7 ++++++- 4 files changed, 22 insertions(+), 2 deletions(-) diff --git a/src/pytorch_lightning/plugins/precision/apex_amp.py b/src/pytorch_lightning/plugins/precision/apex_amp.py index 2077f2072ab95..cfc13630768db 100644 --- a/src/pytorch_lightning/plugins/precision/apex_amp.py +++ b/src/pytorch_lightning/plugins/precision/apex_amp.py @@ -69,6 +69,7 @@ def backward( model: the model to be optimized closure_loss: the loss value obtained from the closure optimizer: current optimizer being used. ``None`` if using manual optimization + optimizer_idx: the index of the current optimizer. ``None`` if using manual optimization """ opt = optimizer or model.trainer.optimizers with amp.scale_loss(closure_loss, opt) as closure_loss: diff --git a/src/pytorch_lightning/plugins/precision/deepspeed.py b/src/pytorch_lightning/plugins/precision/deepspeed.py index 791a08a87d107..01d3017760b0e 100644 --- a/src/pytorch_lightning/plugins/precision/deepspeed.py +++ b/src/pytorch_lightning/plugins/precision/deepspeed.py @@ -81,6 +81,16 @@ def backward( *args: Any, **kwargs: Any, ) -> None: + r"""Performs back-propagation using DeepSpeed's engine. + + Args: + model: the model to be optimized + closure_loss: the loss tensor + optimizer: ignored for DeepSpeed + optimizer_idx: ignored for DeepSpeed + \*args: additional positional arguments for the :meth:`deepspeed.DeepSpeedEngine.backward` call + \**kwargs: additional keyword arguments for the :meth:`deepspeed.DeepSpeedEngine.backward` call + """ if is_overridden("backward", model): warning_cache.warn( "You have overridden the `LightningModule.backward` hook but it will be ignored since DeepSpeed handles" diff --git a/src/pytorch_lightning/plugins/precision/precision_plugin.py b/src/pytorch_lightning/plugins/precision/precision_plugin.py index 02d343a0876b4..60dfb1ab6c92f 100644 --- a/src/pytorch_lightning/plugins/precision/precision_plugin.py +++ b/src/pytorch_lightning/plugins/precision/precision_plugin.py @@ -68,12 +68,16 @@ def backward( *args: Any, **kwargs: Any, ) -> None: - """Performs the actual backpropagation. + r"""Performs the actual backpropagation. Args: model: the model to be optimized closure_loss: the loss value obtained from the closure optimizer: current optimizer being used. 
``None`` if using manual optimization + optimizer_idx: the index of the current optimizer. ``None`` if using manual optimization + \*args: Positional arguments intended for the actual function that performs the backward, like + :meth:`~torch.Tensor.backward`. + \**kwargs: Keyword arguments for the same purpose as ``*args``. """ # do backward pass if model is not None and isinstance(model, pl.LightningModule): diff --git a/src/pytorch_lightning/strategies/strategy.py b/src/pytorch_lightning/strategies/strategy.py index 0de904ccbd283..59f1e37095e60 100644 --- a/src/pytorch_lightning/strategies/strategy.py +++ b/src/pytorch_lightning/strategies/strategy.py @@ -179,10 +179,15 @@ def backward( *args: Any, **kwargs: Any, ) -> Tensor: - """Forwards backward-calls to the precision plugin. + r"""Forwards backward-calls to the precision plugin. Args: closure_loss: a tensor holding the loss value to backpropagate + optimizer: An optional optimizer that gets passed down to the precision plugin's backward + optimizer_idx: An optional optimizer index that gets passed down to the precision plugin's backward + \*args: Positional arguments that get passed down to the precision plugin's backward, intended as arguments + for the actual function that performs the backward, like :meth:`~torch.Tensor.backward`. + \**kwargs: Keyword arguments for the same purpose as ``*args``. """ self.pre_backward(closure_loss) assert self.lightning_module is not None From 4ce97f37a2c8b163e093bf9864d11d2304942ae3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 3 Aug 2022 15:38:42 +0200 Subject: [PATCH 070/230] Validate the model input of trainer methods (#13892) Co-authored-by: Rohit Gupta --- src/pytorch_lightning/CHANGELOG.md | 2 +- src/pytorch_lightning/trainer/trainer.py | 11 +++++++++++ tests/tests_pytorch/trainer/test_trainer.py | 17 +++++++++++++++++ 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index a95a88a420840..05c7cda5d7242 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -16,7 +16,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Changed -- +- The `Trainer.{fit,validate,test,predict,tune}` methods now raise a useful error message if the input is not a `LightningModule` ([#13892](https://github.com/Lightning-AI/lightning/pull/13892)) - diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index 561fe799f1010..01d13849f8126 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -696,6 +696,8 @@ def fit( datamodule: An instance of :class:`~pytorch_lightning.core.datamodule.LightningDataModule`. """ + if not isinstance(model, pl.LightningModule): + raise TypeError(f"`Trainer.fit()` requires a `LightningModule`, got: {model.__class__.__qualname__}") self.strategy.model = model self._call_and_handle_interrupt( self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path @@ -776,6 +778,8 @@ def validate( :meth:`~pytorch_lightning.core.module.LightningModule.validation_epoch_end`, etc. The length of the list corresponds to the number of validation dataloaders used. 
""" + if model is not None and not isinstance(model, pl.LightningModule): + raise TypeError(f"`Trainer.validate()` requires a `LightningModule`, got: {model.__class__.__qualname__}") self.strategy.model = model or self.lightning_module return self._call_and_handle_interrupt(self._validate_impl, model, dataloaders, ckpt_path, verbose, datamodule) @@ -864,6 +868,8 @@ def test( :meth:`~pytorch_lightning.core.module.LightningModule.test_epoch_end`, etc. The length of the list corresponds to the number of test dataloaders used. """ + if model is not None and not isinstance(model, pl.LightningModule): + raise TypeError(f"`Trainer.test()` requires a `LightningModule`, got: {model.__class__.__qualname__}") self.strategy.model = model or self.lightning_module return self._call_and_handle_interrupt(self._test_impl, model, dataloaders, ckpt_path, verbose, datamodule) @@ -951,6 +957,8 @@ def predict( Returns: Returns a list of dictionaries, one for each provided dataloader containing their respective predictions. """ + if model is not None and not isinstance(model, pl.LightningModule): + raise TypeError(f"`Trainer.predict()` requires a `LightningModule`, got: {model.__class__.__qualname__}") self.strategy.model = model or self.lightning_module return self._call_and_handle_interrupt( self._predict_impl, model, dataloaders, datamodule, return_predictions, ckpt_path @@ -1033,6 +1041,9 @@ def tune( lr_find_kwargs: Arguments for :func:`~pytorch_lightning.tuner.lr_finder.lr_find` """ + if not isinstance(model, pl.LightningModule): + raise TypeError(f"`Trainer.tune()` requires a `LightningModule`, got: {model.__class__.__qualname__}") + Trainer._log_api_event("tune") self.state.fn = TrainerFn.TUNING diff --git a/tests/tests_pytorch/trainer/test_trainer.py b/tests/tests_pytorch/trainer/test_trainer.py index ecc0ad724e879..f868dcc353e72 100644 --- a/tests/tests_pytorch/trainer/test_trainer.py +++ b/tests/tests_pytorch/trainer/test_trainer.py @@ -20,12 +20,14 @@ from contextlib import nullcontext from copy import deepcopy from pathlib import Path +from re import escape from unittest import mock from unittest.mock import ANY, call, patch import cloudpickle import pytest import torch +import torch.nn as nn from torch.multiprocessing import ProcessRaisedException from torch.nn.parallel.distributed import DistributedDataParallel from torch.optim import SGD @@ -71,6 +73,21 @@ torch_test_assert_close = torch.testing.assert_allclose +def test_trainer_error_when_input_not_lightning_module(): + """Test that a useful error gets raised when the Trainer methods receive something other than a + LightningModule.""" + trainer = Trainer() + + for method in ("fit", "validate", "test", "predict"): + with pytest.raises(TypeError, match=escape(f"`Trainer.{method}()` requires a `LightningModule`, got: Linear")): + run_method = getattr(trainer, method) + run_method(nn.Linear(2, 2)) + + trainer = Trainer(auto_lr_find=True, auto_scale_batch_size=True) + with pytest.raises(TypeError, match=escape("`Trainer.tune()` requires a `LightningModule`, got: Linear")): + trainer.tune(nn.Linear(2, 2)) + + @pytest.mark.parametrize("url_ckpt", [True, False]) def test_no_val_module(monkeypatch, tmpdir, tmpdir_server, url_ckpt): """Tests use case where trainer saves the model, and user loads it from tags independently.""" From 5479c60b2226d3c911afb807b86acd522d0c5964 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Wed, 3 Aug 2022 15:47:16 +0200 Subject: [PATCH 071/230] Reduce state size (#13970) --- requirements/app/test.txt | 1 + 
src/lightning_app/CHANGELOG.md | 2 + src/lightning_app/core/app.py | 22 ++--- src/lightning_app/core/work.py | 91 ++++++++++++++---- src/lightning_app/runners/runtime.py | 11 ++- src/lightning_app/testing/testing.py | 15 ++- src/lightning_app/utilities/enum.py | 13 ++- src/lightning_app/utilities/network.py | 2 +- .../utilities/packaging/build_config.py | 4 +- src/lightning_app/utilities/proxies.py | 93 +++++++++++-------- .../components/python/test_python.py | 3 +- tests/tests_app/core/test_lightning_app.py | 59 +++++++++--- tests/tests_app/core/test_lightning_flow.py | 15 ++- tests/tests_app/core/test_lightning_work.py | 37 ++++---- tests/tests_app/storage/test_payload.py | 5 + tests/tests_app/structures/test_structures.py | 13 ++- tests/tests_app/utilities/test_login.py | 2 +- tests/tests_app/utilities/test_proxies.py | 18 ++-- .../collect_failures/app.py | 2 +- 19 files changed, 266 insertions(+), 142 deletions(-) diff --git a/requirements/app/test.txt b/requirements/app/test.txt index d93aae4eaf143..9d2ed0af910ca 100644 --- a/requirements/app/test.txt +++ b/requirements/app/test.txt @@ -12,3 +12,4 @@ isort>=5.0 mypy>=0.720 httpx trio +pympler diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 34fdb9665f5aa..0f9838b1efe2e 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -24,3 +24,5 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Deprecated ### Fixed + +- Resolved a bug where the work statuses will grow quickly and be duplicated ([#13970](https://github.com/Lightning-AI/lightning/pull/13970)) diff --git a/src/lightning_app/core/app.py b/src/lightning_app/core/app.py index 6599b53efcb95..ab41fb256ffe6 100644 --- a/src/lightning_app/core/app.py +++ b/src/lightning_app/core/app.py @@ -18,7 +18,7 @@ from lightning_app.utilities.app_helpers import _delta_to_appstate_delta, _LightningAppRef from lightning_app.utilities.commands.base import _populate_commands_endpoint, _process_command_requests from lightning_app.utilities.component import _convert_paths_after_init -from lightning_app.utilities.enum import AppStage +from lightning_app.utilities.enum import AppStage, CacheCallsKeys from lightning_app.utilities.exceptions import CacheMissException, ExitAppException from lightning_app.utilities.layout import _collect_layout from lightning_app.utilities.proxies import ComponentDelta @@ -399,8 +399,8 @@ def _run(self) -> bool: if self.should_publish_changes_to_api and self.api_publish_state_queue: logger.debug("Publishing the state with changes") # Push two states to optimize start in the cloud. 
- self.api_publish_state_queue.put(self.state) - self.api_publish_state_queue.put(self.state) + self.api_publish_state_queue.put(self.state_vars) + self.api_publish_state_queue.put(self.state_vars) self._reset_run_time_monitor() @@ -412,7 +412,7 @@ def _run(self) -> bool: self._update_run_time_monitor() if self._has_updated and self.should_publish_changes_to_api and self.api_publish_state_queue: - self.api_publish_state_queue.put(self.state) + self.api_publish_state_queue.put(self.state_vars) return True @@ -430,16 +430,12 @@ def _apply_restarting(self) -> bool: self.stage = AppStage.BLOCKING return False - def _collect_work_finish_status(self) -> dict: - work_finished_status = {} - for work in self.works: - work_finished_status[work.name] = False - for key in work._calls: - if key == "latest_call_hash": - continue - fn_metadata = work._calls[key] - work_finished_status[work.name] = fn_metadata["name"] == "run" and "ret" in fn_metadata + def _has_work_finished(self, work): + latest_call_hash = work._calls[CacheCallsKeys.LATEST_CALL_HASH] + return "ret" in work._calls[latest_call_hash] + def _collect_work_finish_status(self) -> dict: + work_finished_status = {work.name: self._has_work_finished(work) for work in self.works} assert len(work_finished_status) == len(self.works) return work_finished_status diff --git a/src/lightning_app/core/work.py b/src/lightning_app/core/work.py index 53c9e07e80020..e7c800c0d15fa 100644 --- a/src/lightning_app/core/work.py +++ b/src/lightning_app/core/work.py @@ -12,8 +12,15 @@ from lightning_app.storage.drive import _maybe_create_drive, Drive from lightning_app.storage.payload import Payload from lightning_app.utilities.app_helpers import _is_json_serializable, _LightningAppRef -from lightning_app.utilities.component import _sanitize_state -from lightning_app.utilities.enum import make_status, WorkFailureReasons, WorkStageStatus, WorkStatus, WorkStopReasons +from lightning_app.utilities.component import _is_flow_context, _sanitize_state +from lightning_app.utilities.enum import ( + CacheCallsKeys, + make_status, + WorkFailureReasons, + WorkStageStatus, + WorkStatus, + WorkStopReasons, +) from lightning_app.utilities.exceptions import LightningWorkException from lightning_app.utilities.introspection import _is_init_context from lightning_app.utilities.network import find_free_network_port @@ -107,7 +114,21 @@ def __init__( # setattr_replacement is used by the multiprocessing runtime to send the latest changes to the main coordinator self._setattr_replacement: Optional[Callable[[str, Any], None]] = None self._name = "" - self._calls = {"latest_call_hash": None} + # The ``self._calls`` is used to track whether the run + # method with a given set of input arguments has already been called. + # Example of its usage: + # { + # 'latest_call_hash': '167fe2e', + # '167fe2e': { + # 'statuses': [ + # {'stage': 'pending', 'timestamp': 1659433519.851271}, + # {'stage': 'running', 'timestamp': 1659433519.956482}, + # {'stage': 'stopped', 'timestamp': 1659433520.055768}]} + # ] + # }, + # ... + # } + self._calls = {CacheCallsKeys.LATEST_CALL_HASH: None} self._changes = {} self._raise_exception = raise_exception self._paths = {} @@ -215,13 +236,13 @@ def status(self) -> WorkStatus: All statuses are stored in the state. """ - call_hash = self._calls["latest_call_hash"] - if call_hash: + call_hash = self._calls[CacheCallsKeys.LATEST_CALL_HASH] + if call_hash in self._calls: statuses = self._calls[call_hash]["statuses"] # deltas aren't necessarily coming in the expected order. 
statuses = sorted(statuses, key=lambda x: x["timestamp"]) latest_status = statuses[-1] - if latest_status["reason"] == WorkFailureReasons.TIMEOUT: + if latest_status.get("reason") == WorkFailureReasons.TIMEOUT: return self._aggregate_status_timeout(statuses) return WorkStatus(**latest_status) return WorkStatus(stage=WorkStageStatus.NOT_STARTED, timestamp=time.time()) @@ -229,8 +250,8 @@ def status(self) -> WorkStatus: @property def statuses(self) -> List[WorkStatus]: """Return all the status of the work.""" - call_hash = self._calls["latest_call_hash"] - if call_hash: + call_hash = self._calls[CacheCallsKeys.LATEST_CALL_HASH] + if call_hash in self._calls: statuses = self._calls[call_hash]["statuses"] # deltas aren't necessarily coming in the expected order. statuses = sorted(statuses, key=lambda x: x["timestamp"]) @@ -398,10 +419,13 @@ def __getattr__(self, item): return path return self.__getattribute__(item) - def _call_hash(self, fn, args, kwargs): + def _call_hash(self, fn, args, kwargs) -> str: hash_args = args[1:] if len(args) > 0 and args[0] == self else args call_obj = {"args": hash_args, "kwargs": kwargs} - return f"{fn.__name__}:{DeepHash(call_obj)[call_obj]}" + # Note: Generate a hash as 167fe2e. + # Seven was selected after checking upon Github default SHA length + # and to minimize hidden state size. + return str(DeepHash(call_obj)[call_obj])[:7] def _wrap_run_for_caching(self, fn): @wraps(fn) @@ -415,11 +439,11 @@ def new_fn(*args, **kwargs): entry = self._calls[call_hash] return entry["ret"] - self._calls[call_hash] = {"name": fn.__name__, "call_hash": call_hash} + self._calls[call_hash] = {} result = fn(*args, **kwargs) - self._calls[call_hash] = {"name": fn.__name__, "call_hash": call_hash, "ret": result} + self._calls[call_hash] = {"ret": result} return result @@ -457,8 +481,40 @@ def set_state(self, provided_state): if isinstance(v, Dict): v = _maybe_create_drive(self.name, v) setattr(self, k, v) + self._changes = provided_state["changes"] - self._calls.update(provided_state["calls"]) + + # Note, this is handled by the flow only. + if _is_flow_context(): + self._cleanup_calls(provided_state["calls"]) + + self._calls = provided_state["calls"] + + @staticmethod + def _cleanup_calls(calls: Dict[str, Any]): + # 1: Collect all the in_progress call hashes + in_progress_call_hash = [k for k in list(calls) if k not in (CacheCallsKeys.LATEST_CALL_HASH)] + + for call_hash in in_progress_call_hash: + if "statuses" not in calls[call_hash]: + continue + + # 2: Filter the statuses by timestamp + statuses = sorted(calls[call_hash]["statuses"], key=lambda x: x["timestamp"]) + + # If the latest status is succeeded, then drop everything before. + if statuses[-1]["stage"] == WorkStageStatus.SUCCEEDED: + status = statuses[-1] + status["timestamp"] = int(status["timestamp"]) + calls[call_hash]["statuses"] = [status] + else: + # TODO: Some status are being duplicated, + # this seems related to the StateObserver. 
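# Editor's note — a minimal, runnable sketch (not part of this patch) of the shortened
# call-hash scheme that `_call_hash` above now uses; it assumes only deepdiff's DeepHash,
# which lightning_app already depends on:
from deepdiff import DeepHash

call_obj = {"args": (1, [2], (3, 4), {"1": "3"}), "kwargs": {}}
call_hash = str(DeepHash(call_obj)[call_obj])[:7]  # keep seven chars, git-style short hash
# test_lightning_work_calls further down in this patch expects "0d824f7" for these arguments;
# the truncation is what keeps the keys stored in `work._calls` small.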
+ final_statuses = [] + for status in statuses: + if status not in final_statuses: + final_statuses.append(status) + calls[call_hash]["statuses"] = final_statuses @abc.abstractmethod def run(self, *args, **kwargs): @@ -479,7 +535,7 @@ def _aggregate_status_timeout(self, statuses: List[Dict]) -> WorkStatus: if succeeded_statuses: succeed_status_id = succeeded_statuses[-1] + 1 statuses = statuses[succeed_status_id:] - timeout_statuses = [status for status in statuses if status["reason"] == WorkFailureReasons.TIMEOUT] + timeout_statuses = [status for status in statuses if status.get("reason") == WorkFailureReasons.TIMEOUT] assert statuses[0]["stage"] == WorkStageStatus.PENDING status = {**timeout_statuses[-1], "timestamp": statuses[0]["timestamp"]} return WorkStatus(**status, count=len(timeout_statuses)) @@ -501,9 +557,8 @@ def stop(self): ) if self.status.stage == WorkStageStatus.STOPPED: return - latest_hash = self._calls["latest_call_hash"] - self._calls[latest_hash]["statuses"].append( - make_status(WorkStageStatus.STOPPED, reason=WorkStopReasons.PENDING) - ) + latest_hash = self._calls[CacheCallsKeys.LATEST_CALL_HASH] + stop_status = make_status(WorkStageStatus.STOPPED, reason=WorkStopReasons.PENDING) + self._calls[latest_hash]["statuses"].append(stop_status) app = _LightningAppRef().get_current() self._backend.stop_work(app, self) diff --git a/src/lightning_app/runners/runtime.py b/src/lightning_app/runners/runtime.py index 3e15f958b8538..123e16d89ede5 100644 --- a/src/lightning_app/runners/runtime.py +++ b/src/lightning_app/runners/runtime.py @@ -10,7 +10,7 @@ from lightning_app import LightningApp from lightning_app.core.constants import APP_SERVER_HOST, APP_SERVER_PORT from lightning_app.runners.backends import Backend, BackendType -from lightning_app.utilities.enum import AppStage, make_status, WorkStageStatus +from lightning_app.utilities.enum import AppStage, CacheCallsKeys, make_status, WorkStageStatus from lightning_app.utilities.load_app import load_app_from_file from lightning_app.utilities.proxies import WorkRunner @@ -133,9 +133,10 @@ def dispatch(self, *args, **kwargs): raise NotImplementedError def _add_stopped_status_to_work(self, work: "lightning_app.LightningWork") -> None: + if work.status.stage == WorkStageStatus.STOPPED: return - latest_hash = work._calls["latest_call_hash"] - if latest_hash is None: - return - work._calls[latest_hash]["statuses"].append(make_status(WorkStageStatus.STOPPED)) + + latest_call_hash = work._calls[CacheCallsKeys.LATEST_CALL_HASH] + if latest_call_hash in work._calls: + work._calls[latest_call_hash]["statuses"].append(make_status(WorkStageStatus.STOPPED)) diff --git a/src/lightning_app/testing/testing.py b/src/lightning_app/testing/testing.py index 10abdac4aad5d..dd34614a34353 100644 --- a/src/lightning_app/testing/testing.py +++ b/src/lightning_app/testing/testing.py @@ -22,6 +22,7 @@ from lightning_app.runners.multiprocess import MultiProcessRuntime from lightning_app.testing.config import Config from lightning_app.utilities.cloud import _get_project +from lightning_app.utilities.enum import CacheCallsKeys from lightning_app.utilities.imports import _is_playwright_available, requires from lightning_app.utilities.network import _configure_session, LightningClient from lightning_app.utilities.proxies import ProxyWorkRun @@ -114,8 +115,11 @@ def run_work_isolated(work, *args, start_server: bool = False, **kwargs): start_server=start_server, ).dispatch() # pop the stopped status. 
- call_hash = work._calls["latest_call_hash"] - work._calls[call_hash]["statuses"].pop(-1) + call_hash = work._calls[CacheCallsKeys.LATEST_CALL_HASH] + + if call_hash in work._calls: + work._calls[call_hash]["statuses"].pop(-1) + if isinstance(work.run, ProxyWorkRun): work.run = work.run.work_run @@ -176,7 +180,7 @@ def run_app_in_cloud(app_folder: str, app_name: str = "app.py") -> Generator: # 3. Launch the application in the cloud from the Lightning CLI. with tempfile.TemporaryDirectory() as tmpdir: env_copy = os.environ.copy() - env_copy["PREPARE_LIGHTING"] = "1" + env_copy["PACKAGE_LIGHTNING"] = "1" shutil.copytree(app_folder, tmpdir, dirs_exist_ok=True) # TODO - add -no-cache to the command line. process = Popen( @@ -216,7 +220,10 @@ def run_app_in_cloud(app_folder: str, app_name: str = "app.py") -> Generator: record_har_path=Config.har_location, ) admin_page = context.new_page() - res = requests.post(Config.url + "/v1/auth/login", data=json.dumps(payload)) + url = Config.url + if url.endswith("/"): + url = url[:-1] + res = requests.post(url + "/v1/auth/login", data=json.dumps(payload)) token = res.json()["token"] print(f"The Lightning App Token is: {token}") print(f"The Lightning App user key is: {Config.key}") diff --git a/src/lightning_app/utilities/enum.py b/src/lightning_app/utilities/enum.py index 9469deffd925e..dbf20413aa9d9 100644 --- a/src/lightning_app/utilities/enum.py +++ b/src/lightning_app/utilities/enum.py @@ -59,9 +59,16 @@ def __post_init__(self): def make_status(stage: str, message: Optional[str] = None, reason: Optional[str] = None): - return { + status = { "stage": stage, - "message": message, - "reason": reason, "timestamp": datetime.now(tz=timezone.utc).timestamp(), } + if message: + status["message"] = message + if reason: + status["reason"] = reason + return status + + +class CacheCallsKeys: + LATEST_CALL_HASH = "latest_call_hash" diff --git a/src/lightning_app/utilities/network.py b/src/lightning_app/utilities/network.py index a9ebcf37ab564..7fd03750a515d 100644 --- a/src/lightning_app/utilities/network.py +++ b/src/lightning_app/utilities/network.py @@ -48,7 +48,7 @@ def _configure_session() -> Session: return http -def _check_service_url_is_ready(url: str, timeout: float = 1) -> bool: +def _check_service_url_is_ready(url: str, timeout: float = 100) -> bool: try: response = requests.get(url, timeout=timeout) return response.status_code in (200, 404) diff --git a/src/lightning_app/utilities/packaging/build_config.py b/src/lightning_app/utilities/packaging/build_config.py index 9231875d5d7fd..b776e202666de 100644 --- a/src/lightning_app/utilities/packaging/build_config.py +++ b/src/lightning_app/utilities/packaging/build_config.py @@ -110,7 +110,7 @@ def _find_requirements(self, work: "LightningWork") -> List[str]: file = inspect.getfile(work.__class__) # 2. Try to find a requirement file associated the file. - dirname = os.path.dirname(file) + dirname = os.path.dirname(file) or "." requirement_files = [os.path.join(dirname, f) for f in os.listdir(dirname) if f == "requirements.txt"] if not requirement_files: return [] @@ -126,7 +126,7 @@ def _find_dockerfile(self, work: "LightningWork") -> List[str]: file = inspect.getfile(work.__class__) # 2. Check for Dockerfile. - dirname = os.path.dirname(file) + dirname = os.path.dirname(file) or "." 
dockerfiles = [os.path.join(dirname, f) for f in os.listdir(dirname) if f == "Dockerfile"] if not dockerfiles: diff --git a/src/lightning_app/utilities/proxies.py b/src/lightning_app/utilities/proxies.py index c33e41bb70203..28d436f3e4a23 100644 --- a/src/lightning_app/utilities/proxies.py +++ b/src/lightning_app/utilities/proxies.py @@ -23,7 +23,13 @@ from lightning_app.utilities.app_helpers import affiliation from lightning_app.utilities.apply_func import apply_to_collection from lightning_app.utilities.component import _set_work_context -from lightning_app.utilities.enum import make_status, WorkFailureReasons, WorkStageStatus, WorkStopReasons +from lightning_app.utilities.enum import ( + CacheCallsKeys, + make_status, + WorkFailureReasons, + WorkStageStatus, + WorkStopReasons, +) from lightning_app.utilities.exceptions import CacheMissException, LightningSigtermStateException if TYPE_CHECKING: @@ -45,19 +51,13 @@ def unwrap(fn): return fn -def _send_data_to_caller_queue( - work: "LightningWork", caller_queue: "BaseQueue", data: Dict, call_hash: str, work_run: Callable, use_args: bool -) -> Dict: - if work._calls["latest_call_hash"] is None: - work._calls["latest_call_hash"] = call_hash +def _send_data_to_caller_queue(work: "LightningWork", caller_queue: "BaseQueue", data: Dict, call_hash: str) -> Dict: + + if work._calls[CacheCallsKeys.LATEST_CALL_HASH] is None: + work._calls[CacheCallsKeys.LATEST_CALL_HASH] = call_hash if call_hash not in work._calls: - work._calls[call_hash] = { - "name": work_run.__name__, - "call_hash": call_hash, - "use_args": use_args, - "statuses": [], - } + work._calls[call_hash] = {"statuses": []} else: # remove ret when relaunching the work. work._calls[call_hash].pop("ret", None) @@ -65,9 +65,19 @@ def _send_data_to_caller_queue( work._calls[call_hash]["statuses"].append(make_status(WorkStageStatus.PENDING)) work_state = work.state + + # There is no need to send all call hashes to the work. + calls = deepcopy(work_state["calls"]) + work_state["calls"] = { + k: v for k, v in work_state["calls"].items() if k in (call_hash, CacheCallsKeys.LATEST_CALL_HASH) + } + data.update({"state": work_state}) logger.debug(f"Sending to {work.name}: {data}") caller_queue.put(data) + + # Reset the calls entry. + work_state["calls"] = calls work._restarting = False return work_state @@ -85,9 +95,6 @@ def __post_init__(self): self.work_state = None def __call__(self, *args, **kwargs): - provided_none = len(args) == 1 and args[0] is None - use_args = len(kwargs) > 0 or (len(args) > 0 and not provided_none) - self._validate_call_args(args, kwargs) args, kwargs = self._process_call_args(args, kwargs) @@ -103,18 +110,18 @@ def __call__(self, *args, **kwargs): # for the readers. if self.cache_calls: if not entered or stopped_on_sigterm: - _send_data_to_caller_queue(self.work, self.caller_queue, data, call_hash, self.work_run, use_args) + _send_data_to_caller_queue(self.work, self.caller_queue, data, call_hash) else: if returned: return else: if not entered or stopped_on_sigterm: - _send_data_to_caller_queue(self.work, self.caller_queue, data, call_hash, self.work_run, use_args) + _send_data_to_caller_queue(self.work, self.caller_queue, data, call_hash) else: if returned or stopped_on_sigterm: # the previous task has completed and we can re-queue the next one. # overriding the return value for next loop iteration. 
- _send_data_to_caller_queue(self.work, self.caller_queue, data, call_hash, self.work_run, use_args) + _send_data_to_caller_queue(self.work, self.caller_queue, data, call_hash) if not self.parallel: raise CacheMissException("Task never called before. Triggered now") @@ -171,10 +178,9 @@ def sanitize(obj: Union[Path, Drive]) -> Union[Path, Dict]: class WorkStateObserver(Thread): - """This thread runs alongside LightningWork and periodically checks for state changes. - - If the state changed from one interval to the next, it will compute the delta and add it to the queue which is - connected to the Flow. This enables state changes to be captured that are not triggered through a setattr call. + """This thread runs alongside LightningWork and periodically checks for state changes. If the state changed + from one interval to the next, it will compute the delta and add it to the queue which is connected to the + Flow. This enables state changes to be captured that are not triggered through a setattr call. Args: work: The LightningWork for which the state should be monitored @@ -371,21 +377,24 @@ def run_once(self): self._proxy_setattr() # 8. Deepcopy the work state and send the first `RUNNING` status delta to the flow. - state = deepcopy(self.work.state) - self.work._calls["latest_call_hash"] = call_hash - self.work._calls[call_hash]["statuses"].append(make_status(WorkStageStatus.RUNNING)) - self.delta_queue.put(ComponentDelta(id=self.work_name, delta=Delta(DeepDiff(state, self.work.state)))) + reference_state = deepcopy(self.work.state) - # 9. Start the state observer thread. It will look for state changes and send them back to the Flow - # The observer has to be initialized here, after the set_state call above so that the thread can start with - # the proper initial state of the work - self.state_observer.start() + # 9. Inform the flow the work is running and add the delta to the deepcopy. + self.work._calls[CacheCallsKeys.LATEST_CALL_HASH] = call_hash + self.work._calls[call_hash]["statuses"].append(make_status(WorkStageStatus.RUNNING)) + delta = Delta(DeepDiff(reference_state, self.work.state)) + self.delta_queue.put(ComponentDelta(id=self.work_name, delta=delta)) # 10. Unwrap the run method if wrapped. work_run = self.work.run if hasattr(work_run, "__wrapped__"): work_run = work_run.__wrapped__ + # 11. Start the state observer thread. It will look for state changes and send them back to the Flow + # The observer has to be initialized here, after the set_state call above so that the thread can start with + # the proper initial state of the work + self.state_observer.start() + # 12. Run the `work_run` method. # If an exception is raised, send a `FAILED` status delta to the flow and call the `on_exception` hook. try: @@ -394,23 +403,26 @@ def run_once(self): raise e except BaseException as e: # 10.2 Send failed delta to the flow. + reference_state = deepcopy(self.work.state) self.work._calls[call_hash]["statuses"].append( make_status(WorkStageStatus.FAILED, message=str(e), reason=WorkFailureReasons.USER_EXCEPTION) ) - self.delta_queue.put(ComponentDelta(id=self.work_name, delta=Delta(DeepDiff(state, self.work.state)))) + self.delta_queue.put( + ComponentDelta(id=self.work_name, delta=Delta(DeepDiff(reference_state, self.work.state))) + ) self.work.on_exception(e) print("########## CAPTURED EXCEPTION ###########") print(traceback.print_exc()) print("########## CAPTURED EXCEPTION ###########") return - # 14. 
Copy all artifacts to the shared storage so other Works can access them while this Work gets scaled down - persist_artifacts(work=self.work) - - # 15. Destroy the state observer. + # 13. Destroy the state observer. self.state_observer.join(0) self.state_observer = None + # 14. Copy all artifacts to the shared storage so other Works can access them while this Work gets scaled down + persist_artifacts(work=self.work) + # 15. An asynchronous work shouldn't return a return value. if ret is not None: raise RuntimeError( @@ -418,23 +430,24 @@ def run_once(self): "HINT: Use the Payload API instead." ) - # 16. DeepCopy the state and send the latest delta to the flow. + # 17. DeepCopy the state and send the latest delta to the flow. # use the latest state as we have already sent delta # during its execution. # inform the task has completed - state = deepcopy(self.work.state) + reference_state = deepcopy(self.work.state) self.work._calls[call_hash]["statuses"].append(make_status(WorkStageStatus.SUCCEEDED)) self.work._calls[call_hash]["ret"] = ret - self.delta_queue.put(ComponentDelta(id=self.work_name, delta=Delta(DeepDiff(state, self.work.state)))) + self.delta_queue.put(ComponentDelta(id=self.work_name, delta=Delta(DeepDiff(reference_state, self.work.state)))) - # 17. Update the work for the next delta if any. + # 18. Update the work for the next delta if any. self._proxy_setattr(cleanup=True) def _sigterm_signal_handler(self, signum, frame, call_hash: str) -> None: """Signal handler used to react when spot instances are being retrived.""" - logger.debug("Received SIGTERM signal. Gracefully terminating...") + logger.info(f"Received SIGTERM signal. Gracefully terminating {self.work.name.replace('root.', '')}...") persist_artifacts(work=self.work) with _state_observer_lock: + self.work._calls[call_hash]["statuses"] = [] state = deepcopy(self.work.state) self.work._calls[call_hash]["statuses"].append( make_status(WorkStageStatus.STOPPED, reason=WorkStopReasons.SIGTERM_SIGNAL_HANDLER) diff --git a/tests/tests_app/components/python/test_python.py b/tests/tests_app/components/python/test_python.py index 678655d6ee908..a8554e133e1a9 100644 --- a/tests/tests_app/components/python/test_python.py +++ b/tests/tests_app/components/python/test_python.py @@ -10,6 +10,7 @@ from lightning_app.testing.helpers import RunIf from lightning_app.testing.testing import run_work_isolated from lightning_app.utilities.component import _set_work_context +from lightning_app.utilities.enum import CacheCallsKeys COMPONENTS_SCRIPTS_FOLDER = str(os.path.join(_PROJECT_ROOT, "tests/tests_app/components/python/scripts/")) @@ -112,7 +113,7 @@ def test_tracer_component_with_code(): with open("file.py", "w") as f: f.write('raise Exception("An error")') - call_hash = python_script._calls["latest_call_hash"] + call_hash = python_script._calls[CacheCallsKeys.LATEST_CALL_HASH] python_script._calls[call_hash]["statuses"].pop(-1) python_script._calls[call_hash]["statuses"].pop(-1) diff --git a/tests/tests_app/core/test_lightning_app.py b/tests/tests_app/core/test_lightning_app.py index f55e7cb84b66a..a3a15085b98e3 100644 --- a/tests/tests_app/core/test_lightning_app.py +++ b/tests/tests_app/core/test_lightning_app.py @@ -6,6 +6,7 @@ import pytest from deepdiff import Delta +from pympler import asizeof from tests_app import _PROJECT_ROOT from lightning_app import LightningApp, LightningFlow, LightningWork # F401 @@ -486,12 +487,11 @@ def _dump_checkpoint(self): raise SuccessException -@pytest.mark.parametrize("runtime_cls", 
[MultiProcessRuntime]) -def test_snapshotting(runtime_cls, tmpdir): +def test_snap_shotting(): try: app = CheckpointLightningApp(FlowA()) app.checkpointing = True - runtime_cls(app, start_server=False).dispatch() + MultiProcessRuntime(app, start_server=False).dispatch() except SuccessException: pass checkpoint_dir = os.path.join(storage_root_dir(), "checkpoints") @@ -765,15 +765,17 @@ def run(self): def test_protected_attributes_not_in_state(): flow = ProtectedAttributesFlow() - MultiProcessRuntime(LightningApp(flow)).dispatch() + MultiProcessRuntime(LightningApp(flow), start_server=False).dispatch() class WorkExit(LightningWork): def __init__(self): - super().__init__() + super().__init__(raise_exception=False) + self.counter = 0 def run(self): - pass + self.counter += 1 + raise Exception("Hello") class FlowExit(LightningFlow): @@ -782,13 +784,14 @@ def __init__(self): self.work = WorkExit() def run(self): + if self.work.counter == 1: + self._exit() self.work.run() - self._exit() def test_lightning_app_exit(): app = LightningApp(FlowExit()) - MultiProcessRuntime(app).dispatch() + MultiProcessRuntime(app, start_server=False).dispatch() assert app.root.work.status.stage == WorkStageStatus.STOPPED @@ -860,12 +863,12 @@ def run(self): def test_slow_flow(): app0 = LightningApp(SleepyFlow(sleep_interval=0.5 * FLOW_DURATION_THRESHOLD)) - MultiProcessRuntime(app0).dispatch() + MultiProcessRuntime(app0, start_server=False).dispatch() app1 = LightningApp(SleepyFlow(sleep_interval=2 * FLOW_DURATION_THRESHOLD)) with pytest.warns(LightningFlowWarning): - MultiProcessRuntime(app1).dispatch() + MultiProcessRuntime(app1, start_server=False).dispatch() app0 = LightningApp( SleepyFlowWithWork( @@ -875,7 +878,7 @@ def test_slow_flow(): ) ) - MultiProcessRuntime(app0).dispatch() + MultiProcessRuntime(app0, start_server=False).dispatch() app1 = LightningApp( SleepyFlowWithWork( @@ -883,4 +886,36 @@ def test_slow_flow(): ) ) - MultiProcessRuntime(app1).dispatch() + MultiProcessRuntime(app1, start_server=False).dispatch() + + +class SizeWork(LightningWork): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.counter = 0 + + def run(self, signal: int): + self.counter += 1 + + +class SizeFlow(LightningFlow): + def __init__(self): + super().__init__() + self.work0 = SizeWork(parallel=True, cache_calls=True) + self._state_sizes = {} + + def run(self): + for idx in range(self.work0.counter + 2): + self.work0.run(idx) + + self._state_sizes[self.work0.counter] = asizeof.asizeof(self.state) + + if self.work0.counter >= 20: + self._exit() + + +def test_state_size_constant_growth(): + app = LightningApp(SizeFlow()) + MultiProcessRuntime(app, start_server=False).dispatch() + assert app.root._state_sizes[0] <= 5904 + assert app.root._state_sizes[20] <= 23736 diff --git a/tests/tests_app/core/test_lightning_flow.py b/tests/tests_app/core/test_lightning_flow.py index 26841e057621b..1966c6d7b23d6 100644 --- a/tests/tests_app/core/test_lightning_flow.py +++ b/tests/tests_app/core/test_lightning_flow.py @@ -17,6 +17,7 @@ from lightning_app.storage.path import storage_root_dir from lightning_app.testing.helpers import EmptyFlow, EmptyWork from lightning_app.utilities.app_helpers import _delta_to_appstate_delta, _LightningAppRef +from lightning_app.utilities.enum import CacheCallsKeys from lightning_app.utilities.exceptions import ExitAppException @@ -320,7 +321,7 @@ def run(self): "_restarting": False, "_internal_ip": "", }, - "calls": {"latest_call_hash": None}, + "calls": {CacheCallsKeys.LATEST_CALL_HASH: 
None}, "changes": {}, }, "work_a": { @@ -334,7 +335,7 @@ def run(self): "_restarting": False, "_internal_ip": "", }, - "calls": {"latest_call_hash": None}, + "calls": {CacheCallsKeys.LATEST_CALL_HASH: None}, "changes": {}, }, }, @@ -364,7 +365,7 @@ def run(self): "_restarting": False, "_internal_ip": "", }, - "calls": {"latest_call_hash": None}, + "calls": {CacheCallsKeys.LATEST_CALL_HASH: None}, "changes": {}, }, "work_a": { @@ -379,10 +380,8 @@ def run(self): "_internal_ip": "", }, "calls": { - "latest_call_hash": None, - "run:fe3fa0f34fc1317e152e5afb023332995392071046f1ea51c34c7c9766e3676c": { - "name": "run", - "call_hash": "run:fe3fa0f34fc1317e152e5afb023332995392071046f1ea51c34c7c9766e3676c", + CacheCallsKeys.LATEST_CALL_HASH: None, + "fe3fa0f": { "ret": None, }, }, @@ -435,7 +434,7 @@ def test_populate_changes_status_removed(): "work": { "vars": {}, "calls": { - "latest_call_hash": "run:fe3f", + CacheCallsKeys.LATEST_CALL_HASH: "run:fe3f", "run:fe3f": { "statuses": [ {"stage": "requesting", "message": None, "reason": None, "timestamp": 1}, diff --git a/tests/tests_app/core/test_lightning_work.py b/tests/tests_app/core/test_lightning_work.py index 913fdf04c3299..14d8d26a458a6 100644 --- a/tests/tests_app/core/test_lightning_work.py +++ b/tests/tests_app/core/test_lightning_work.py @@ -8,7 +8,6 @@ from lightning_app.core.work import LightningWork, LightningWorkException from lightning_app.runners import MultiProcessRuntime from lightning_app.storage import Path -from lightning_app.storage.requests import GetRequest from lightning_app.testing.helpers import EmptyFlow, EmptyWork, MockQueue from lightning_app.utilities.enum import WorkStageStatus from lightning_app.utilities.proxies import ProxyWorkRun, WorkRunner @@ -130,8 +129,8 @@ def run(self): FlowFixed().run() -@pytest.mark.parametrize("raise_exception", [False, True]) @pytest.mark.parametrize("enable_exception", [False, True]) +@pytest.mark.parametrize("raise_exception", [False, True]) def test_lightning_status(enable_exception, raise_exception): class Work(EmptyWork): def __init__(self, raise_exception, enable_exception=True): @@ -143,17 +142,6 @@ def run(self): if self.enable_exception: raise Exception("Custom Exception") - class BlockingQueue(MockQueue): - """A Mock for the file copier queues that keeps blocking until we want to end the thread.""" - - keep_blocking = True - - def get(self, timeout: int = 0): - while BlockingQueue.keep_blocking: - pass - # A dummy request so the Copier gets something to process without an error - return GetRequest(source="src", name="dummy_path", path="test", hash="123", destination="dst") - work = Work(raise_exception, enable_exception=enable_exception) work._name = "root.w" assert work.status.stage == WorkStageStatus.NOT_STARTED @@ -163,9 +151,9 @@ def get(self, timeout: int = 0): error_queue = MockQueue("error_queue") request_queue = MockQueue("request_queue") response_queue = MockQueue("response_queue") - copy_request_queue = BlockingQueue("copy_request_queue") - copy_response_queue = BlockingQueue("copy_response_queue") - call_hash = "run:fe3fa0f34fc1317e152e5afb023332995392071046f1ea51c34c7c9766e3676c" + copy_request_queue = MockQueue("copy_request_queue") + copy_response_queue = MockQueue("copy_response_queue") + call_hash = "fe3fa0f" work._calls[call_hash] = { "args": (), "kwargs": {}, @@ -203,14 +191,13 @@ def get(self, timeout: int = 0): if enable_exception: exception_cls = Exception if raise_exception else Empty assert isinstance(error_queue._queue[0], exception_cls) - 
res[f"root['calls']['{call_hash}']['statuses'][0]"]["stage"] == "failed" - res[f"root['calls']['{call_hash}']['statuses'][0]"]["message"] == "Custom Exception" + res_end[f"root['calls']['{call_hash}']['statuses'][1]"]["stage"] == "failed" + res_end[f"root['calls']['{call_hash}']['statuses'][1]"]["message"] == "Custom Exception" else: assert res[f"root['calls']['{call_hash}']['statuses'][0]"]["stage"] == "running" assert res_end[f"root['calls']['{call_hash}']['statuses'][1]"]["stage"] == "succeeded" # Stop blocking and let the thread join - BlockingQueue.keep_blocking = False work_runner.copier.join() @@ -281,3 +268,15 @@ def run(self): assert flow.work.state["vars"]["none_to_path"] == Path("lit://none/to/path") assert flow.work.state["vars"]["path_to_none"] is None assert flow.work.state["vars"]["path_to_path"] == Path("lit://path/to/path") + + +def test_lightning_work_calls(): + class W(LightningWork): + def run(self, *args, **kwargs): + pass + + w = W() + assert len(w._calls) == 1 + w.run(1, [2], (3, 4), {"1": "3"}) + assert len(w._calls) == 2 + assert w._calls["0d824f7"] == {"ret": None} diff --git a/tests/tests_app/storage/test_payload.py b/tests/tests_app/storage/test_payload.py index 7a64750a01a92..2481320ff2d57 100644 --- a/tests/tests_app/storage/test_payload.py +++ b/tests/tests_app/storage/test_payload.py @@ -1,3 +1,4 @@ +import os import pathlib import pickle from copy import deepcopy @@ -146,3 +147,7 @@ def test_payload_works(tmpdir): with mock.patch("lightning_app.storage.path.storage_root_dir", lambda: pathlib.Path(tmpdir)): app = LightningApp(Flow(), debug=True) MultiProcessRuntime(app, start_server=False).dispatch() + + os.remove("value_all") + os.remove("value_b") + os.remove("value_c") diff --git a/tests/tests_app/structures/test_structures.py b/tests/tests_app/structures/test_structures.py index aaa7db18a5af2..18a6d372bfee9 100644 --- a/tests/tests_app/structures/test_structures.py +++ b/tests/tests_app/structures/test_structures.py @@ -8,7 +8,7 @@ from lightning_app.storage.payload import Payload from lightning_app.structures import Dict, List from lightning_app.testing.helpers import EmptyFlow -from lightning_app.utilities.enum import WorkStageStatus +from lightning_app.utilities.enum import CacheCallsKeys, WorkStageStatus def test_dict(): @@ -49,7 +49,7 @@ def run(self): for k in ("a", "b", "c", "d") ) assert all( - flow.state["structures"]["dict"]["works"][f"work_{k}"]["calls"] == {"latest_call_hash": None} + flow.state["structures"]["dict"]["works"][f"work_{k}"]["calls"] == {CacheCallsKeys.LATEST_CALL_HASH: None} for k in ("a", "b", "c", "d") ) assert all(flow.state["structures"]["dict"]["works"][f"work_{k}"]["changes"] == {} for k in ("a", "b", "c", "d")) @@ -95,7 +95,8 @@ def run(self): for k in ("a", "b", "c", "d") ) assert all( - flow.state_with_changes["structures"]["dict"]["works"][f"work_{k}"]["calls"] == {"latest_call_hash": None} + flow.state_with_changes["structures"]["dict"]["works"][f"work_{k}"]["calls"] + == {CacheCallsKeys.LATEST_CALL_HASH: None} for k in ("a", "b", "c", "d") ) assert all( @@ -169,7 +170,8 @@ def run(self): for i in range(4) ) assert all( - flow.state["structures"]["list"]["works"][str(i)]["calls"] == {"latest_call_hash": None} for i in range(4) + flow.state["structures"]["list"]["works"][str(i)]["calls"] == {CacheCallsKeys.LATEST_CALL_HASH: None} + for i in range(4) ) assert all(flow.state["structures"]["list"]["works"][str(i)]["changes"] == {} for i in range(4)) @@ -209,7 +211,8 @@ def run(self): for i in range(4) ) assert all( - 
flow.state_with_changes["structures"]["list"]["works"][str(i)]["calls"] == {"latest_call_hash": None} + flow.state_with_changes["structures"]["list"]["works"][str(i)]["calls"] + == {CacheCallsKeys.LATEST_CALL_HASH: None} for i in range(4) ) assert all(flow.state_with_changes["structures"]["list"]["works"][str(i)]["changes"] == {} for i in range(4)) diff --git a/tests/tests_app/utilities/test_login.py b/tests/tests_app/utilities/test_login.py index 43b10519e20ee..e0ad4b110c868 100644 --- a/tests/tests_app/utilities/test_login.py +++ b/tests/tests_app/utilities/test_login.py @@ -6,7 +6,7 @@ from lightning_app.utilities import login -LIGHTNING_CLOUD_URL = "https://lightning.ai" +LIGHTNING_CLOUD_URL = os.getenv("LIGHTNING_CLOUD_URL", "https://lightning.ai") @pytest.fixture(autouse=True) diff --git a/tests/tests_app/utilities/test_proxies.py b/tests/tests_app/utilities/test_proxies.py index 3331a5e69e42b..cd0dfd7026e09 100644 --- a/tests/tests_app/utilities/test_proxies.py +++ b/tests/tests_app/utilities/test_proxies.py @@ -18,7 +18,7 @@ from lightning_app.storage.requests import GetRequest from lightning_app.testing.helpers import EmptyFlow, MockQueue from lightning_app.utilities.component import _convert_paths_after_init -from lightning_app.utilities.enum import WorkFailureReasons, WorkStageStatus +from lightning_app.utilities.enum import CacheCallsKeys, WorkFailureReasons, WorkStageStatus from lightning_app.utilities.exceptions import CacheMissException, ExitAppException from lightning_app.utilities.proxies import ( ComponentDelta, @@ -240,7 +240,7 @@ class WorkRunnerPatch(WorkRunner): counter = 0 def __call__(self): - call_hash = "run:fe3fa0f34fc1317e152e5afb023332995392071046f1ea51c34c7c9766e3676c" + call_hash = "fe3fa0f" while True: try: called = self.caller_queue.get() @@ -267,7 +267,7 @@ def test_proxy_timeout(): app = LightningApp(FlowTimeout(), debug=True) MultiProcessRuntime(app, start_server=False).dispatch() - call_hash = app.root.work._calls["latest_call_hash"] + call_hash = app.root.work._calls[CacheCallsKeys.LATEST_CALL_HASH] assert len(app.root.work._calls[call_hash]["statuses"]) == 3 assert app.root.work._calls[call_hash]["statuses"][0]["stage"] == "pending" assert app.root.work._calls[call_hash]["statuses"][1]["stage"] == "failed" @@ -308,7 +308,7 @@ def run(self, *args, **kwargs): "state": { "vars": {"_paths": {}, "_urls": {}}, "calls": { - "latest_call_hash": "any", + CacheCallsKeys.LATEST_CALL_HASH: "any", "any": { "name": "run", "call_hash": "any", @@ -361,7 +361,7 @@ def run(self, *args, **kwargs): ], ) @mock.patch("lightning_app.utilities.proxies.Copier") -def test_path_attributes_to_transfer(_, monkeypatch, origin, exists_remote, expected_get): +def test_path_attributes_to_transfer(_, origin, exists_remote, expected_get): """Test that any Lightning Path objects passed to the run method get transferred automatically (if they exist).""" path_mock = Mock() @@ -399,7 +399,7 @@ def run(self): "state": { "vars": {"_paths": flow.work._paths, "_urls": {}}, "calls": { - "latest_call_hash": "any", + CacheCallsKeys.LATEST_CALL_HASH: "any", "any": { "name": "run", "call_hash": "any", @@ -550,9 +550,9 @@ def run(self, use_setattr=False, use_containers=False): ############################ work.run(use_setattr=True, use_containers=False) - # this is necessary only in this test where we siumulate the calls + # this is necessary only in this test where we simulate the calls work._calls.clear() - work._calls.update({"latest_call_hash": None}) + 
work._calls.update({CacheCallsKeys.LATEST_CALL_HASH: None}) delta = delta_queue.get().delta.to_dict() assert delta["values_changed"] == {"root['vars']['var']": {"new_value": 2}} @@ -583,7 +583,7 @@ def run(self, use_setattr=False, use_containers=False): # this is necessary only in this test where we siumulate the calls work._calls.clear() - work._calls.update({"latest_call_hash": None}) + work._calls.update({CacheCallsKeys.LATEST_CALL_HASH: None}) delta = delta_queue.get().delta.to_dict() assert delta == {"values_changed": {"root['vars']['var']": {"new_value": 3}}} diff --git a/tests/tests_app_examples/collect_failures/app.py b/tests/tests_app_examples/collect_failures/app.py index 7f82f2367775d..89e302b2e6723 100644 --- a/tests/tests_app_examples/collect_failures/app.py +++ b/tests/tests_app_examples/collect_failures/app.py @@ -11,7 +11,7 @@ class SimpleWork(LightningWork): def __init__(self): - super().__init__(cache_calls=False, parallel=True) + super().__init__(cache_calls=False, parallel=True, raise_exception=False) self.is_running_now = False def run(self): From 7010b901118806e1e29b28e7fc665d2511dadf47 Mon Sep 17 00:00:00 2001 From: Birch-san Date: Wed, 3 Aug 2022 14:49:40 +0100 Subject: [PATCH 072/230] Make MPSAccelerator platform check expect arm64 (#13992) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: rohitgr7 Co-authored-by: Akihiro Nitta Co-authored-by: Carlos Mocholí --- src/pytorch_lightning/CHANGELOG.md | 2 +- src/pytorch_lightning/accelerators/mps.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 05c7cda5d7242..740f5709c5a34 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -43,7 +43,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - -- +- Fixed MPS device being unrecognized ([#13992](https://github.com/Lightning-AI/lightning/pull/13992)) ## [1.7.0] - 2022-08-02 diff --git a/src/pytorch_lightning/accelerators/mps.py b/src/pytorch_lightning/accelerators/mps.py index 20a2e609fa54b..5ebcb37cd0ed7 100644 --- a/src/pytorch_lightning/accelerators/mps.py +++ b/src/pytorch_lightning/accelerators/mps.py @@ -24,7 +24,9 @@ # For using the `MPSAccelerator`, user's machine should have `torch>=1.12`, Metal programming framework and # the ARM-based Apple Silicon processors. 
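For context, the availability check adjusted just below hinges on `platform.processor()`, which on Apple Silicon machines may report either "arm" or "arm64" depending on how the Python interpreter was built. A minimal illustrative sketch of the broadened test follows (not part of the diff; the `hasattr` guard is an assumption to keep the sketch importable on older torch versions):

    import platform

    import torch

    def mps_available() -> bool:
        # Assumes torch>=1.12, where the MPS backend module exists; the hasattr
        # guard stands in for the _TORCH_GREATER_EQUAL_1_12 flag used in the source.
        has_backend = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
        # Accept both processor strings observed on Apple Silicon Python builds.
        return has_backend and platform.processor() in ("arm", "arm64")
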
-_MPS_AVAILABLE = _TORCH_GREATER_EQUAL_1_12 and torch.backends.mps.is_available() and platform.processor() == "arm" +_MPS_AVAILABLE = ( + _TORCH_GREATER_EQUAL_1_12 and torch.backends.mps.is_available() and platform.processor() in ("arm", "arm64") +) class MPSAccelerator(Accelerator): From e6a8283e9cd9df53fb661c64bbf2037e1391a16d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 3 Aug 2022 15:49:55 +0200 Subject: [PATCH 073/230] Organize accelerator tests (#13986) --- .../{test_accelerator_registry.py => test_registry.py} | 0 .../connectors}/test_accelerator_connector.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename tests/tests_pytorch/accelerators/{test_accelerator_registry.py => test_registry.py} (100%) rename tests/tests_pytorch/{accelerators => trainer/connectors}/test_accelerator_connector.py (100%) diff --git a/tests/tests_pytorch/accelerators/test_accelerator_registry.py b/tests/tests_pytorch/accelerators/test_registry.py similarity index 100% rename from tests/tests_pytorch/accelerators/test_accelerator_registry.py rename to tests/tests_pytorch/accelerators/test_registry.py diff --git a/tests/tests_pytorch/accelerators/test_accelerator_connector.py b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py similarity index 100% rename from tests/tests_pytorch/accelerators/test_accelerator_connector.py rename to tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py From 800cf0b3ffb1b8dc94673cf5abe8e91703a59cd7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20POIRET?= Date: Wed, 3 Aug 2022 16:15:21 +0200 Subject: [PATCH 074/230] Fix NeptuneLogger unusable after pytorch-lightning 1.7.0 (#13988) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: rohitgr7 Co-authored-by: Akihiro Nitta Co-authored-by: Carlos Mocholí --- src/pytorch_lightning/CHANGELOG.md | 2 +- src/pytorch_lightning/loggers/neptune.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 740f5709c5a34..1ce0810e114f3 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -40,7 +40,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Fixed -- +- Fixed the `NeptuneLogger` dependency being unrecognized ([#13988](https://github.com/Lightning-AI/lightning/pull/13988)) - Fixed MPS device being unrecognized ([#13992](https://github.com/Lightning-AI/lightning/pull/13992)) diff --git a/src/pytorch_lightning/loggers/neptune.py b/src/pytorch_lightning/loggers/neptune.py index be61705289d49..c9b438fd7c732 100644 --- a/src/pytorch_lightning/loggers/neptune.py +++ b/src/pytorch_lightning/loggers/neptune.py @@ -38,7 +38,7 @@ from pytorch_lightning.utilities.model_summary import ModelSummary from pytorch_lightning.utilities.rank_zero import rank_zero_only -_NEPTUNE_AVAILABLE = _RequirementAvailable("neptune") +_NEPTUNE_AVAILABLE = _RequirementAvailable("neptune-client") _NEPTUNE_GREATER_EQUAL_0_9 = _RequirementAvailable("neptune-client>=0.9.0") From ee4233b36cb70b19a13013371ada50875b429ef3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 3 Aug 2022 16:20:51 +0200 Subject: [PATCH 075/230] Update __version__.py to 1.8.0dev (#13999) --- src/pytorch_lightning/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pytorch_lightning/__version__.py b/src/pytorch_lightning/__version__.py index a55413d1549b4..e71f40e375719 100644 --- a/src/pytorch_lightning/__version__.py +++ b/src/pytorch_lightning/__version__.py @@ -1 +1 @@ -version = "1.7.0" +version = "1.8.0dev" From d748dae548a744609ad2919de8ee7416258ddc63 Mon Sep 17 00:00:00 2001 From: "Adam J. Stewart" Date: Wed, 3 Aug 2022 12:17:21 -0700 Subject: [PATCH 076/230] Fix erroneous warning for unset `max_epochs` (#13262) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí Co-authored-by: Akihiro Nitta Co-authored-by: Rohit Gupta --- src/pytorch_lightning/CHANGELOG.md | 3 + .../callbacks/stochastic_weight_avg.py | 2 + src/pytorch_lightning/loops/fit_loop.py | 5 +- src/pytorch_lightning/loops/utilities.py | 21 +++--- src/pytorch_lightning/trainer/trainer.py | 75 ++++++++++--------- .../tests_pytorch/loops/test_training_loop.py | 2 +- tests/tests_pytorch/test_cli.py | 3 +- .../trainer/flags/test_env_vars.py | 9 ++- .../trainer/flags/test_min_max_epochs.py | 25 ++++++- .../test_estimated_stepping_batches.py | 5 +- tests/tests_pytorch/trainer/test_trainer.py | 8 +- 11 files changed, 94 insertions(+), 64 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 1ce0810e114f3..8676d3f44c54a 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -43,6 +43,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed the `NeptuneLogger` dependency being unrecognized ([#13988](https://github.com/Lightning-AI/lightning/pull/13988)) +- Fixed an issue where users would be warned about unset `max_epochs` even when `fast_dev_run` was set ([#13262](https://github.com/Lightning-AI/lightning/pull/13262)) + + - Fixed MPS device being unrecognized ([#13992](https://github.com/Lightning-AI/lightning/pull/13992)) diff --git a/src/pytorch_lightning/callbacks/stochastic_weight_avg.py b/src/pytorch_lightning/callbacks/stochastic_weight_avg.py index 8c141b28bc93d..20a3dcc3f0f26 100644 --- a/src/pytorch_lightning/callbacks/stochastic_weight_avg.py +++ b/src/pytorch_lightning/callbacks/stochastic_weight_avg.py @@ -155,6 +155,7 @@ def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") - self._max_epochs = trainer.max_epochs if self._model_contains_batch_norm: # virtually increase max_epochs to perform batch norm update on latest epoch. + assert trainer.fit_loop.max_epochs is not None trainer.fit_loop.max_epochs += 1 def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: @@ -227,6 +228,7 @@ def on_train_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") - # BatchNorm epoch update. Reset state trainer.accumulate_grad_batches = self._accumulate_grad_batches trainer.num_training_batches -= 1 + assert trainer.fit_loop.max_epochs is not None trainer.fit_loop.max_epochs -= 1 self.reset_momenta() elif trainer.current_epoch - 1 == self.swa_end: diff --git a/src/pytorch_lightning/loops/fit_loop.py b/src/pytorch_lightning/loops/fit_loop.py index f4f7735f4b66e..9e0d53c66b4e3 100644 --- a/src/pytorch_lightning/loops/fit_loop.py +++ b/src/pytorch_lightning/loops/fit_loop.py @@ -50,10 +50,10 @@ class FitLoop(Loop[None]): def __init__( self, min_epochs: int = 0, - max_epochs: int = 1000, + max_epochs: Optional[int] = None, ) -> None: super().__init__() - if max_epochs < -1: + if isinstance(max_epochs, int) and max_epochs < -1: # Allow max_epochs to be zero, since this will be handled by fit_loop.done raise MisconfigurationException( f"`max_epochs` must be a non-negative integer or -1. You passed in {max_epochs}." @@ -162,6 +162,7 @@ def done(self) -> bool: # `processed` is increased before `on_train_epoch_end`, the hook where checkpoints are typically saved. 
# we use it here because the checkpoint data won't have `completed` increased yet + assert isinstance(self.max_epochs, int) stop_epochs = _is_max_limit_reached(self.epoch_progress.current.processed, self.max_epochs) if stop_epochs: # in case they are not equal, override so `trainer.current_epoch` has the expected value diff --git a/src/pytorch_lightning/loops/utilities.py b/src/pytorch_lightning/loops/utilities.py index d0631f5953e2f..491af6605c135 100644 --- a/src/pytorch_lightning/loops/utilities.py +++ b/src/pytorch_lightning/loops/utilities.py @@ -14,9 +14,8 @@ import inspect from collections import OrderedDict from contextlib import contextmanager -from datetime import timedelta from functools import lru_cache -from typing import Any, Callable, Dict, Generator, List, Optional, Sequence, Tuple, Union +from typing import Any, Callable, Generator, List, Optional, Sequence, Tuple import numpy as np import torch @@ -25,6 +24,7 @@ from torch.utils.data import DataLoader import pytorch_lightning as pl +from pytorch_lightning.callbacks.timer import Timer from pytorch_lightning.loops import Loop from pytorch_lightning.strategies import ParallelStrategy, Strategy from pytorch_lightning.trainer.progress import BaseProgress @@ -69,12 +69,8 @@ def _extract_hiddens(training_step_output: STEP_OUTPUT, truncated_bptt_steps: in def _parse_loop_limits( - min_steps: Optional[int], - max_steps: int, - min_epochs: Optional[int], - max_epochs: int, - max_time: Optional[Union[str, timedelta, Dict[str, int]]], -) -> Tuple[Optional[int], int, int, int, Optional[Union[str, timedelta, Dict[str, int]]]]: + min_steps: Optional[int], max_steps: int, min_epochs: Optional[int], max_epochs: int, trainer: "pl.Trainer" +) -> Tuple[int, int]: """This utility computes the default values for the minimum and maximum number of steps and epochs given the values the user has selected. @@ -83,13 +79,13 @@ def _parse_loop_limits( max_steps: Maximum number of steps. min_epochs: Minimum number of epochs. max_epochs: Maximum number of epochs. - max_time: Maximum time for the training. + trainer: Trainer instance. Returns: The parsed limits, with default values being set for the ones that the user did not specify. """ if max_epochs is None: - if max_steps == -1 and max_time is None: + if max_steps == -1 and not any(isinstance(cb, Timer) for cb in trainer.callbacks): rank_zero_warn( "`max_epochs` was not set. Setting it to 1000 epochs. 
To train without an epoch limit," " set `max_epochs=-1`.", @@ -98,13 +94,16 @@ def _parse_loop_limits( max_epochs = 1000 else: max_epochs = -1 + if min_epochs is None and min_steps is not None: # setting this allows FitLoop.done to re-evaluate should_stop when it gets triggered `on_fit_start` min_epochs = 1 + if min_epochs is None: # the default value is 0 so no training will be done when should_stop is triggered `on_fit_start` min_epochs = 0 - return min_steps, max_steps, min_epochs, max_epochs, max_time + + return min_epochs, max_epochs def _build_training_step_kwargs( diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index 01d13849f8126..4b3c046ae0f0c 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -455,9 +455,6 @@ def __init__( self._signal_connector = SignalConnector(self) self.tuner = Tuner(self) - min_steps, max_steps, min_epochs, max_epochs, max_time = _parse_loop_limits( - min_steps, max_steps, min_epochs, max_epochs, max_time - ) fit_loop = FitLoop(min_epochs=min_epochs, max_epochs=max_epochs) training_epoch_loop = TrainingEpochLoop(min_steps=min_steps, max_steps=max_steps) fit_loop.connect(epoch_loop=training_epoch_loop) @@ -534,7 +531,7 @@ def __init__( self.track_grad_norm: float = float(track_grad_norm) self._detect_anomaly: bool = detect_anomaly - self._setup_on_init(num_sanity_val_steps) + self._setup_on_init() # configure tuner self.tuner.on_trainer_init(auto_lr_find, auto_scale_batch_size) @@ -553,9 +550,10 @@ def __init__( limit_val_batches, limit_test_batches, limit_predict_batches, - val_check_interval, - overfit_batches, fast_dev_run, + overfit_batches, + val_check_interval, + num_sanity_val_steps, ) # Callback system @@ -567,54 +565,57 @@ def _init_debugging_flags( limit_val_batches: Optional[Union[int, float]], limit_test_batches: Optional[Union[int, float]], limit_predict_batches: Optional[Union[int, float]], - val_check_interval: Optional[Union[int, float]], - overfit_batches: Union[int, float], fast_dev_run: Union[int, bool], - ) -> None: + overfit_batches: Union[int, float], + val_check_interval: Optional[Union[int, float]], + num_sanity_val_steps: int, + ): + # init debugging flags if isinstance(fast_dev_run, int) and (fast_dev_run < 0): raise MisconfigurationException( - f"fast_dev_run={fast_dev_run} is not a valid configuration. It should be >= 0." + f"fast_dev_run={fast_dev_run!r} is not a valid configuration. It should be >= 0." ) - self.fast_dev_run = fast_dev_run # set fast_dev_run=True when it is 1, used while logging if fast_dev_run == 1: self.fast_dev_run = True + self.overfit_batches = _determine_batch_limits(overfit_batches, "overfit_batches") + overfit_batches_enabled = overfit_batches > 0 + if fast_dev_run: num_batches = int(fast_dev_run) - limit_train_batches = num_batches - limit_val_batches = num_batches - limit_test_batches = num_batches - limit_predict_batches = num_batches + if not overfit_batches_enabled: + self.limit_train_batches = num_batches + self.limit_val_batches = num_batches + + self.limit_test_batches = num_batches + self.limit_predict_batches = num_batches self.fit_loop.max_steps = num_batches self.num_sanity_val_steps = 0 self.fit_loop.max_epochs = 1 - val_check_interval = 1.0 + self.val_check_interval = 1.0 self.check_val_every_n_epoch = 1 self.loggers = [DummyLogger()] if self.loggers else [] - rank_zero_info( f"Running in `fast_dev_run` mode: will run the requested loop using {num_batches} batch(es). 
" "Logging and checkpointing is suppressed." ) - - self.limit_train_batches = _determine_batch_limits(limit_train_batches, "limit_train_batches") - self.limit_val_batches = _determine_batch_limits(limit_val_batches, "limit_val_batches") - self.limit_test_batches = _determine_batch_limits(limit_test_batches, "limit_test_batches") - self.limit_predict_batches = _determine_batch_limits(limit_predict_batches, "limit_predict_batches") - self.val_check_interval = _determine_batch_limits(val_check_interval, "val_check_interval") - self.overfit_batches = _determine_batch_limits(overfit_batches, "overfit_batches") - self._configure_overfit_batches(self.overfit_batches) - - def _configure_overfit_batches(self, overfit_batches: Union[int, float]) -> None: - """Configure batch limits using `overfit_batches`.""" - if overfit_batches > 0: + else: + if not overfit_batches_enabled: + self.limit_train_batches = _determine_batch_limits(limit_train_batches, "limit_train_batches") + self.limit_val_batches = _determine_batch_limits(limit_val_batches, "limit_val_batches") + self.limit_test_batches = _determine_batch_limits(limit_test_batches, "limit_test_batches") + self.limit_predict_batches = _determine_batch_limits(limit_predict_batches, "limit_predict_batches") + self.num_sanity_val_steps = float("inf") if num_sanity_val_steps == -1 else num_sanity_val_steps + self.val_check_interval = _determine_batch_limits(val_check_interval, "val_check_interval") + + if overfit_batches_enabled: self.limit_train_batches = overfit_batches self.limit_val_batches = overfit_batches - def _setup_on_init(self, num_sanity_val_steps: int) -> None: + def _setup_on_init(self) -> None: self._log_device_info() self.should_stop = False @@ -622,11 +623,6 @@ def _setup_on_init(self, num_sanity_val_steps: int) -> None: self.num_training_batches = float("inf") self.train_dataloader = None - if num_sanity_val_steps == -1: - self.num_sanity_val_steps = float("inf") - else: - self.num_sanity_val_steps = num_sanity_val_steps - self.num_sanity_val_batches = [] self.num_test_batches = [] self.num_val_batches = [] @@ -1088,6 +1084,13 @@ def _restore_modules_and_callbacks(self, checkpoint_path: Optional[_PATH] = None def _run( self, model: "pl.LightningModule", ckpt_path: Optional[str] = None ) -> Optional[Union[_EVALUATE_OUTPUT, _PREDICT_OUTPUT]]: + if self.state.fn in (TrainerFn.FITTING, TrainerFn.TUNING): + min_epochs, max_epochs = _parse_loop_limits( + self.min_steps, self.max_steps, self.min_epochs, self.max_epochs, self + ) + self.fit_loop.min_epochs = min_epochs + self.fit_loop.max_epochs = max_epochs + # clean hparams if hasattr(model, "hparams"): parsing.clean_namespace(model.hparams) @@ -1149,7 +1152,6 @@ def _run( # ---------------------------- # TRAIN # ---------------------------- - # reset logger connector self._logger_connector.reset_results() self._logger_connector.reset_metrics() @@ -1289,6 +1291,7 @@ def _run_train(self) -> None: torch.set_grad_enabled(True) self.fit_loop.trainer = self + with torch.autograd.set_detect_anomaly(self._detect_anomaly): self.fit_loop.run() diff --git a/tests/tests_pytorch/loops/test_training_loop.py b/tests/tests_pytorch/loops/test_training_loop.py index a9da6dcf2be6d..bba0b9b7c5428 100644 --- a/tests/tests_pytorch/loops/test_training_loop.py +++ b/tests/tests_pytorch/loops/test_training_loop.py @@ -141,7 +141,7 @@ def validation_step(self, *args): def test_fit_loop_done_log_messages(caplog): - fit_loop = FitLoop() + fit_loop = FitLoop(max_epochs=1) trainer = Mock(spec=Trainer) fit_loop.trainer = 
trainer diff --git a/tests/tests_pytorch/test_cli.py b/tests/tests_pytorch/test_cli.py index 965f53a86d4b7..e37f799888f7e 100644 --- a/tests/tests_pytorch/test_cli.py +++ b/tests/tests_pytorch/test_cli.py @@ -1271,7 +1271,8 @@ def __init__(self, foo: int, *args, **kwargs): def test_lightning_cli_reinstantiate_trainer(): with mock.patch("sys.argv", ["any.py"]): cli = LightningCLI(BoringModel, run=False) - assert cli.trainer.max_epochs == 1000 + + assert cli.trainer.max_epochs is None class TestCallback(Callback): ... diff --git a/tests/tests_pytorch/trainer/flags/test_env_vars.py b/tests/tests_pytorch/trainer/flags/test_env_vars.py index 9e7bd70468482..cfac06c8d7711 100644 --- a/tests/tests_pytorch/trainer/flags/test_env_vars.py +++ b/tests/tests_pytorch/trainer/flags/test_env_vars.py @@ -15,17 +15,20 @@ from unittest import mock from pytorch_lightning import Trainer +from pytorch_lightning.demos.boring_classes import BoringModel def test_passing_no_env_variables(): """Testing overwriting trainer arguments.""" trainer = Trainer() + model = BoringModel() assert trainer.logger is not None assert trainer.max_steps == -1 - assert trainer.max_epochs == 1000 - trainer = Trainer(False, max_steps=42) + assert trainer.max_epochs is None + trainer = Trainer(logger=False, max_steps=1) + trainer.fit(model) assert trainer.logger is None - assert trainer.max_steps == 42 + assert trainer.max_steps == 1 assert trainer.max_epochs == -1 diff --git a/tests/tests_pytorch/trainer/flags/test_min_max_epochs.py b/tests/tests_pytorch/trainer/flags/test_min_max_epochs.py index 93a7c915ebb7b..25f2dfdab279f 100644 --- a/tests/tests_pytorch/trainer/flags/test_min_max_epochs.py +++ b/tests/tests_pytorch/trainer/flags/test_min_max_epochs.py @@ -3,6 +3,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.utilities.warnings import PossibleUserWarning +from tests_pytorch.helpers.utils import no_warning_call @pytest.mark.parametrize( @@ -37,7 +38,23 @@ def test_min_max_steps_epochs(tmpdir, min_epochs, max_epochs, min_steps, max_ste def test_max_epochs_not_set_warning(): - """Test that a warning is emitted when `max_epochs` was not set by the user.""" - with pytest.warns(PossibleUserWarning, match="`max_epochs` was not set. Setting it to 1000 epochs."): - trainer = Trainer(max_epochs=None) - assert trainer.max_epochs == 1000 + """Test that a warning is only emitted when `max_epochs` was not set by the user.""" + + class CustomModel(BoringModel): + def training_step(self, *args, **kwargs): + self.trainer.should_stop = True + + match = "`max_epochs` was not set. Setting it to 1000 epochs." 
+ + model = CustomModel() + model.training_epoch_end = None + trainer = Trainer(max_epochs=None, limit_train_batches=1) + with pytest.warns(PossibleUserWarning, match=match): + trainer.fit(model) + + assert trainer.max_epochs == 1000 + assert trainer.current_epoch == 1 + + with no_warning_call(expected_warning=PossibleUserWarning, match=match): + Trainer(fast_dev_run=True) + Trainer(fast_dev_run=1) diff --git a/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py b/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py index 8d8fb7f3a8c21..92a1126294dfc 100644 --- a/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py +++ b/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py @@ -97,11 +97,10 @@ def test_num_stepping_batches_infinite_training(): def test_num_stepping_batches_with_max_steps(): """Test stepping batches with `max_steps`.""" - max_steps = 7 + max_steps = 2 trainer = Trainer(max_steps=max_steps) model = BoringModel() - trainer._data_connector.attach_data(model) - trainer.strategy.connect(model) + trainer.fit(model) assert trainer.estimated_stepping_batches == max_steps diff --git a/tests/tests_pytorch/trainer/test_trainer.py b/tests/tests_pytorch/trainer/test_trainer.py index f868dcc353e72..e4be8929f9c7e 100644 --- a/tests/tests_pytorch/trainer/test_trainer.py +++ b/tests/tests_pytorch/trainer/test_trainer.py @@ -540,11 +540,11 @@ def test_trainer_max_steps_and_epochs_validation(max_epochs, max_steps, incorrec @pytest.mark.parametrize( "max_epochs,max_steps,is_done,correct_trainer_epochs", [ - (None, -1, False, 1000), + (None, -1, False, None), (-1, -1, False, -1), (5, -1, False, 5), (-1, 10, False, -1), - (None, 0, True, -1), + (None, 0, True, None), (0, -1, True, 0), (-1, 0, True, -1), (0, -1, True, 0), @@ -555,7 +555,9 @@ def test_trainer_max_steps_and_epochs_fit_loop_done(max_epochs, max_steps, is_do assert trainer.max_epochs == correct_trainer_epochs assert trainer.max_steps == max_steps - assert trainer.fit_loop.done is is_done + + if isinstance(correct_trainer_epochs, int): + assert trainer.fit_loop.done is is_done # Make sure there is no timer timer_callbacks = [c for c in trainer.callbacks if isinstance(c, Timer)] From e2937495e5562c476b1186c2a848f033ad873775 Mon Sep 17 00:00:00 2001 From: Ian <31828525+HalestormAI@users.noreply.github.com> Date: Wed, 3 Aug 2022 20:57:50 +0100 Subject: [PATCH 077/230] Fix typing annotations for the ipu strategy (#13786) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: otaj Co-authored-by: Adrian Wälchli Co-authored-by: Rohit Gupta --- pyproject.toml | 1 - src/pytorch_lightning/strategies/ipu.py | 68 ++++++++++++++----------- 2 files changed, 39 insertions(+), 30 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 15f0293bb1c8a..2df0142e9af4c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,7 +63,6 @@ module = [ "pytorch_lightning.profilers.simple", "pytorch_lightning.strategies.ddp", "pytorch_lightning.strategies.fully_sharded", - "pytorch_lightning.strategies.ipu", "pytorch_lightning.strategies.sharded", "pytorch_lightning.strategies.sharded_spawn", "pytorch_lightning.trainer.callback_hook", diff --git a/src/pytorch_lightning/strategies/ipu.py b/src/pytorch_lightning/strategies/ipu.py index 82ba4ad227f7c..0b5d8e835ad1d 100644 --- a/src/pytorch_lightning/strategies/ipu.py +++ b/src/pytorch_lightning/strategies/ipu.py @@ -13,11 +13,11 @@ # limitations under the License. 
import json import os -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union import torch from torch import FloatTensor, Tensor -from torch.utils.data import DataLoader +from torch.utils.data import DataLoader, Sampler import pytorch_lightning as pl from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase @@ -25,6 +25,7 @@ from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.parallel import ParallelStrategy +from pytorch_lightning.strategies.strategy import TBroadcast from pytorch_lightning.trainer.states import RunningStage, TrainerFn from pytorch_lightning.utilities import _IPU_AVAILABLE, _POPTORCH_AVAILABLE, rank_zero_warn from pytorch_lightning.utilities.apply_func import apply_to_collection @@ -112,12 +113,12 @@ def __init__( self.device_iterations = device_iterations self.autoreport = autoreport self.autoreport_dir = autoreport_dir - self.poptorch_models = {} + self.poptorch_models: Dict[RunningStage, "poptorch.PoplarExecutor"] = {} self._training_opts = training_opts self._inference_opts = inference_opts if self.autoreport: - options = {"autoReport.all": self.autoreport} + options: Dict[str, Any] = {"autoReport.all": self.autoreport} if self.autoreport_dir: self._fs = get_filesystem(str(self.autoreport_dir)) self._fs.makedirs(self.autoreport_dir, exist_ok=True) @@ -139,6 +140,8 @@ def setup(self, trainer: "pl.Trainer") -> None: super().setup(trainer) + assert self.lightning_module is not None + # disable the `optimizer_zero_grad` function by setting it to `None`. # this is because the IPU zeros the gradients internally self._optimizer_zero_grad_original = self.lightning_module.optimizer_zero_grad @@ -192,12 +195,14 @@ def replication_factor(self) -> int: if self._inference_opts: return self._inference_opts.replication_factor + assert self.parallel_devices return len(self.parallel_devices) - stage = self.lightning_module.trainer.state.stage + assert stage is not None return self.poptorch_models[stage]._options.toDict()["replication_factor"] def _create_opts(self, training: bool) -> "poptorch.Options": + assert self.lightning_module is not None opts = poptorch.Options() opts.deviceIterations(self.device_iterations) opts.replicationFactor(self.replication_factor) @@ -221,14 +226,14 @@ def inference_opts(self) -> "poptorch.Options": return self._inference_opts def _convert_to_poptorch_loader( - self, dataloader: DataLoader, sampler, mode: Optional[RunningStage] = None + self, dataloader: DataLoader, sampler: Union[Sampler, Iterable], mode: Optional[RunningStage] = None ) -> "poptorch.DataLoader": if isinstance(dataloader, poptorch.DataLoader): # the user is returning the `poptorch.DataLoader` directly, don't change anything. return dataloader dl_args, dl_kwargs = _get_dataloader_init_args_and_kwargs( - dataloader, sampler, mode, self.replication_factor > 1 + dataloader, sampler, mode, self.replication_factor > 1 # type: ignore[arg-type] ) opts = self.training_opts if mode == RunningStage.TRAINING else self.inference_opts dataloader = poptorch.DataLoader(opts, *dl_args, **dl_kwargs) @@ -240,6 +245,7 @@ def _handle_gradient_accumulation_steps(self) -> None: ``optimizer_step`` will be called on every batch, and the IPU will handle grad accumulation internally. 
""" + assert self.lightning_module is not None accumulation_scheduler = self.lightning_module.trainer.accumulation_scheduler if accumulation_scheduler.epochs != [0]: @@ -251,18 +257,19 @@ def _handle_gradient_accumulation_steps(self) -> None: accumulation_scheduler.scheduling.update({0: 1}) @property - def _n_replicate(self): + def _n_replicate(self) -> int: + assert self.lightning_module is not None opts = self.training_opts if self.lightning_module.training else self.inference_opts accumulate_grad_batches = opts.Training.gradient_accumulation device_iterations = opts.device_iterations replication_factor = opts.replication_factor return replication_factor * device_iterations * accumulate_grad_batches - def _prepare_input(self, args: Any): - def to_tuple(x): + def _prepare_input(self, args: Any) -> Any: + def to_tuple(x: Any) -> Tuple: return tuple(x) - def to_tensor(x): + def to_tensor(x: Any) -> Tensor: return torch.tensor(x).unsqueeze(0).repeat(self._n_replicate) args = apply_to_collection(args, dtype=list, function=to_tuple) @@ -281,6 +288,7 @@ def batch_to_device(self, batch: Any, device: Optional[torch.device] = None, dat def _disable_zero_grad(self) -> None: lightning_module = self.lightning_module + assert lightning_module is not None if is_overridden("optimizer_zero_grad", lightning_module): assert lightning_module is not None # `is_overridden` returns False otherwise rank_zero_warn( @@ -289,27 +297,28 @@ def _disable_zero_grad(self) -> None: ) lightning_module.optimizer_zero_grad = None # type: ignore[assignment] - def _step(self, stage: RunningStage, *args: Any, **kwargs: Any): + def _step(self, stage: RunningStage, *args: Any, **kwargs: Any) -> STEP_OUTPUT: args = self._prepare_input(args) + assert self.lightning_module is not None poptorch_model = self.poptorch_models[stage] self.lightning_module._running_torchscript = True out = poptorch_model(*args, **kwargs) self.lightning_module._running_torchscript = False return out - def training_step(self, *args, **kwargs) -> STEP_OUTPUT: + def training_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: with self.precision_plugin.train_step_context(): return self._step(RunningStage.TRAINING, *args, **kwargs) - def validation_step(self, *args, **kwargs) -> Optional[STEP_OUTPUT]: + def validation_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: with self.precision_plugin.val_step_context(): return self._step(RunningStage.VALIDATING, *args, **kwargs) - def test_step(self, *args, **kwargs) -> Optional[STEP_OUTPUT]: + def test_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: with self.precision_plugin.test_step_context(): return self._step(RunningStage.TESTING, *args, **kwargs) - def predict_step(self, *args, **kwargs) -> STEP_OUTPUT: + def predict_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: with self.precision_plugin.predict_step_context(): return self._step(RunningStage.PREDICTING, *args, **kwargs) @@ -318,26 +327,27 @@ def teardown(self) -> None: # undo dataloader patching pl.trainer.connectors.data_connector._update_dataloader = self._update_dataloader_original + assert self.lightning_module is not None if self._optimizer_zero_grad_original is not None: # re-enable `optimizer_zero_grad` - self.lightning_module.optimizer_zero_grad = self._optimizer_zero_grad_original + self.lightning_module.optimizer_zero_grad = self._optimizer_zero_grad_original # type: ignore[assignment] for model in self.poptorch_models.values(): model.destroy() super().teardown() - def _compiled(self, model: Any): + def 
_compiled(self, model: Any) -> bool: # Required to ensure we only attach compiled models, as they are compiled lazily. return model._executable is not None - def _detach_models(self): + def _detach_models(self) -> None: """Detaches all stage specific models from IPU devices.""" for k, model in self.poptorch_models.items(): if self._compiled(model) and model.isAttachedToDevice(): model.detachFromDevice() - def _load_model(self, stage: str): + def _load_model(self, stage: RunningStage) -> None: """Loads the stage specific accelerator model onto device if compiled and not attached to IPU devices. Args: @@ -348,28 +358,28 @@ def _load_model(self, stage: str): if self._compiled(model) and not model.isAttachedToDevice(): model.attachToDevice() - def on_train_start(self): + def on_train_start(self) -> None: self._load_model(RunningStage.TRAINING) - def on_validation_start(self): + def on_validation_start(self) -> None: self._load_model(RunningStage.VALIDATING) - def on_test_start(self): + def on_test_start(self) -> None: self._load_model(RunningStage.TESTING) - def on_predict_start(self): + def on_predict_start(self) -> None: self._load_model(RunningStage.PREDICTING) - def on_train_end(self): + def on_train_end(self) -> None: self._detach_models() - def on_validation_end(self): + def on_validation_end(self) -> None: self._detach_models() - def on_test_end(self): + def on_test_end(self) -> None: self._detach_models() - def on_predict_end(self): + def on_predict_end(self) -> None: self._detach_models() def on_train_batch_start(self, batch: Any, batch_idx: int) -> None: @@ -397,7 +407,7 @@ def barrier(self, name: Optional[str] = None) -> None: def all_gather(self, tensor: Tensor, group: Optional[Any] = None, sync_grads: bool = False) -> Tensor: return tensor - def broadcast(self, obj: object, src: int = 0) -> object: + def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast: return obj @classmethod From 7240bdbc2334dd96251fe79497d411461442bcfd Mon Sep 17 00:00:00 2001 From: Justin Goheen <26209687+JustinGoheen@users.noreply.github.com> Date: Wed, 3 Aug 2022 15:58:21 -0400 Subject: [PATCH 078/230] Fix mypy errors attributed to `pytorch_lightning.loggers.neptune` (#13692) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli Co-authored-by: otaj Co-authored-by: Rohit Gupta --- pyproject.toml | 1 - src/pytorch_lightning/loggers/neptune.py | 59 ++++++++++++------------ 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2df0142e9af4c..352fb33f53ac9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,7 +57,6 @@ module = [ "pytorch_lightning.core.saving", "pytorch_lightning.demos.boring_classes", "pytorch_lightning.demos.mnist_datamodule", - "pytorch_lightning.loggers.neptune", "pytorch_lightning.profilers.base", "pytorch_lightning.profilers.pytorch", "pytorch_lightning.profilers.simple", diff --git a/src/pytorch_lightning/loggers/neptune.py b/src/pytorch_lightning/loggers/neptune.py index c9b438fd7c732..7c4038cd81abb 100644 --- a/src/pytorch_lightning/loggers/neptune.py +++ b/src/pytorch_lightning/loggers/neptune.py @@ -23,14 +23,12 @@ import os import warnings from argparse import Namespace -from functools import reduce -from typing import Any, Callable, Dict, Generator, Mapping, Optional, Sequence, Set, Union +from typing import Any, Callable, Dict, Generator, List, Mapping, Optional, Sequence, Set, Union from weakref import ReferenceType -import torch from torch import 
Tensor -from pytorch_lightning import __version__ +import pytorch_lightning as pl from pytorch_lightning.callbacks import Checkpoint from pytorch_lightning.loggers.logger import Logger, rank_zero_experiment from pytorch_lightning.utilities.imports import _RequirementAvailable @@ -272,7 +270,7 @@ def __init__( prefix: str = "training", agg_key_funcs: Optional[Mapping[str, Callable[[Sequence[float]], float]]] = None, agg_default_func: Optional[Callable[[Sequence[float]], float]] = None, - **neptune_run_kwargs, + **neptune_run_kwargs: Any, ): # verify if user passed proper init arguments self._verify_input_arguments(api_key, project, name, run, neptune_run_kwargs) @@ -290,16 +288,17 @@ def __init__( self._api_key = api_key self._run_instance = run self._neptune_run_kwargs = neptune_run_kwargs - self._run_short_id = None + self._run_short_id: Optional[str] = None if self._run_instance is not None: self._retrieve_run_data() # make sure that we've log integration version for outside `Run` instances - self._run_instance[_INTEGRATION_VERSION_KEY] = __version__ + self._run_instance[_INTEGRATION_VERSION_KEY] = pl.__version__ - def _retrieve_run_data(self): + def _retrieve_run_data(self) -> None: try: + assert self._run_instance is not None self._run_instance.wait() self._run_short_id = self._run_instance["sys/id"].fetch() self._run_name = self._run_instance["sys/name"].fetch() @@ -308,8 +307,8 @@ def _retrieve_run_data(self): self._run_name = "offline-name" @property - def _neptune_init_args(self): - args = {} + def _neptune_init_args(self) -> Dict: + args: Dict = {} # Backward compatibility in case of previous version retrieval try: args = self._neptune_run_kwargs @@ -334,7 +333,7 @@ def _neptune_init_args(self): return args - def _construct_path_with_prefix(self, *keys) -> str: + def _construct_path_with_prefix(self, *keys: str) -> str: """Return sequence of keys joined by `LOGGER_JOIN_CHAR`, started with `_prefix` if defined.""" if self._prefix: return self.LOGGER_JOIN_CHAR.join([self._prefix, *keys]) @@ -347,7 +346,7 @@ def _verify_input_arguments( name: Optional[str], run: Optional["Run"], neptune_run_kwargs: dict, - ): + ) -> None: legacy_kwargs_msg = ( "Following kwargs are deprecated: {legacy_kwargs}.\n" "If you are looking for the Neptune logger using legacy Python API," @@ -393,17 +392,17 @@ def _verify_input_arguments( " you can't provide other neptune.init() parameters.\n" ) - def __getstate__(self): + def __getstate__(self) -> Dict[str, Any]: state = self.__dict__.copy() # Run instance can't be pickled state["_run_instance"] = None return state - def __setstate__(self, state): + def __setstate__(self, state: Dict[str, Any]) -> None: self.__dict__ = state self._run_instance = neptune.init(**self._neptune_init_args) - @property + @property # type: ignore[misc] @rank_zero_experiment def experiment(self) -> Run: r""" @@ -433,7 +432,7 @@ def training_step(self, batch, batch_idx): """ return self.run - @property + @property # type: ignore[misc] @rank_zero_experiment def run(self) -> Run: try: @@ -441,7 +440,7 @@ def run(self) -> Run: self._run_instance = neptune.init(**self._neptune_init_args) self._retrieve_run_data() # make sure that we've log integration version for newly created - self._run_instance[_INTEGRATION_VERSION_KEY] = __version__ + self._run_instance[_INTEGRATION_VERSION_KEY] = pl.__version__ return self._run_instance except NeptuneLegacyProjectException as e: @@ -531,7 +530,7 @@ def save_dir(self) -> Optional[str]: return os.path.join(os.getcwd(), ".neptune") @rank_zero_only - 
def log_model_summary(self, model, max_depth=-1): + def log_model_summary(self, model: "pl.LightningModule", max_depth: int = -1) -> None: model_str = str(ModelSummary(model=model, max_depth=max_depth)) self.run[self._construct_path_with_prefix("model/summary")] = neptune.types.File.from_content( content=model_str, extension="txt" @@ -600,14 +599,16 @@ def _get_full_model_name(model_path: str, checkpoint_callback: "ReferenceType[Ch return filepath @classmethod - def _get_full_model_names_from_exp_structure(cls, exp_structure: dict, namespace: str) -> Set[str]: + def _get_full_model_names_from_exp_structure(cls, exp_structure: Dict[str, Any], namespace: str) -> Set[str]: """Returns all paths to properties which were already logged in `namespace`""" - structure_keys = namespace.split(cls.LOGGER_JOIN_CHAR) - uploaded_models_dict = reduce(lambda d, k: d[k], [exp_structure, *structure_keys]) + structure_keys: List[str] = namespace.split(cls.LOGGER_JOIN_CHAR) + for key in structure_keys: + exp_structure = exp_structure[key] + uploaded_models_dict = exp_structure return set(cls._dict_paths(uploaded_models_dict)) @classmethod - def _dict_paths(cls, d: dict, path_in_build: str = None) -> Generator: + def _dict_paths(cls, d: Dict[str, Any], path_in_build: str = None) -> Generator: for k, v in d.items(): path = f"{path_in_build}/{k}" if path_in_build is not None else k if not isinstance(v, dict): @@ -616,12 +617,12 @@ def _dict_paths(cls, d: dict, path_in_build: str = None) -> Generator: yield from cls._dict_paths(v, path) @property - def name(self) -> str: + def name(self) -> Optional[str]: """Return the experiment name or 'offline-name' when exp is run in offline mode.""" return self._run_name @property - def version(self) -> str: + def version(self) -> Optional[str]: """Return the experiment version. 
It's Neptune Run's short_id @@ -629,7 +630,7 @@ def version(self) -> str: return self._run_short_id @staticmethod - def _signal_deprecated_api_usage(f_name, sample_code, raise_exception=False): + def _signal_deprecated_api_usage(f_name: str, sample_code: str, raise_exception: bool = False) -> None: msg_suffix = ( f"If you are looking for the Neptune logger using legacy Python API," f" it's still available as part of neptune-contrib package:\n" @@ -649,10 +650,10 @@ def _signal_deprecated_api_usage(f_name, sample_code, raise_exception=False): raise ValueError("The function you've used is deprecated.\n" + msg_suffix) @rank_zero_only - def log_metric(self, metric_name: str, metric_value: Union[Tensor, float, str], step: Optional[int] = None): + def log_metric(self, metric_name: str, metric_value: Union[Tensor, float, str], step: Optional[int] = None) -> None: key = f"{self._prefix}/{metric_name}" self._signal_deprecated_api_usage("log_metric", f"logger.run['{key}'].log(42)") - if torch.is_tensor(metric_value): + if isinstance(metric_value, Tensor): metric_value = metric_value.cpu().detach() self.run[key].log(metric_value, step=step) @@ -678,12 +679,12 @@ def log_artifact(self, artifact: str, destination: Optional[str] = None) -> None self._signal_deprecated_api_usage("log_artifact", f"logger.run['{key}].log('path_to_file')") self.run[key].log(destination) - def set_property(self, *args, **kwargs): + def set_property(self, *args: Any, **kwargs: Any) -> None: self._signal_deprecated_api_usage( "log_artifact", f"logger.run['{self._prefix}/{self.PARAMETERS_KEY}/key'].log(value)", raise_exception=True ) - def append_tags(self, *args, **kwargs): + def append_tags(self, *args: Any, **kwargs: Any) -> None: self._signal_deprecated_api_usage( "append_tags", "logger.run['sys/tags'].add(['foo', 'bar'])", raise_exception=True ) From 52f3775bb58fbf1cd1cf65570b9f69082413f3f5 Mon Sep 17 00:00:00 2001 From: Laverne Henderson Date: Wed, 3 Aug 2022 12:59:37 -0700 Subject: [PATCH 079/230] Doc Terminology updates (#13972) * Doc Terminology updates * API updates --- docs/source-app/api_references.rst | 1 + .../core_api/lightning_app/index.rst | 12 +---- docs/source-app/examples/dag/dag.rst | 6 +-- docs/source-app/examples/hpo/hpo.rst | 6 +-- .../model_server_app/model_server.rst | 12 ++--- .../model_server_app/model_server_app.rst | 6 +-- .../putting_everything_together.rst | 10 ++--- docs/source-app/index.rst | 20 ++++----- .../from_pytorch_lightning_script.rst | 44 ++++++++++--------- .../build_lightning_app/from_scratch.rst | 9 ++-- .../workflows/build_lightning_app/index.rst | 9 ++-- .../build_lightning_app/index_content.rst | 8 ++-- .../build_lightning_component/basic.rst | 9 ++-- .../from_scratch_component_content.rst | 6 +-- .../build_lightning_component/index.rst | 9 ++-- .../index_content.rst | 8 ++-- .../intermediate.rst | 17 +++---- .../publish_a_component.rst | 23 +++++----- 18 files changed, 109 insertions(+), 106 deletions(-) diff --git a/docs/source-app/api_references.rst b/docs/source-app/api_references.rst index 55803457c4820..340d500ef8ef5 100644 --- a/docs/source-app/api_references.rst +++ b/docs/source-app/api_references.rst @@ -34,6 +34,7 @@ ___________________ ~python.popen.PopenPythonScript ~python.tracer.TracerPythonScript + ~training.LightningTrainingComponent ~serve.gradio.ServeGradio ~serve.serve.ModelInferenceAPI diff --git a/docs/source-app/core_api/lightning_app/index.rst b/docs/source-app/core_api/lightning_app/index.rst index e63bd21fa1b70..1cdb4360dbdaa 100644 --- 
a/docs/source-app/core_api/lightning_app/index.rst +++ b/docs/source-app/core_api/lightning_app/index.rst @@ -24,20 +24,12 @@ Peek under the hood :tag: Basic .. displayitem:: - :header: The App state (basic) - :description: Learn more about the state and its manipulation. - :col_css: col-md-4 - :button_link: ../../workflows/access_app_state.html - :height: 180 - :tag: basic - -.. displayitem:: - :header: The event loop (basic) + :header: The event loop (Basic) :description: Learn more about the event loop. :col_css: col-md-4 :button_link: ../../glossary/event_loop.html :height: 180 - :tag: basic + :tag: Basic .. displayitem:: :header: Communication between Flow and Works diff --git a/docs/source-app/examples/dag/dag.rst b/docs/source-app/examples/dag/dag.rst index 8b1d71dd2431f..f5eae0d4aa1d3 100644 --- a/docs/source-app/examples/dag/dag.rst +++ b/docs/source-app/examples/dag/dag.rst @@ -1,6 +1,6 @@ -#################################### -Build a Directed Acyclic Graph (DAG) -#################################### +###################################### +Develop a Directed Acyclic Graph (DAG) +###################################### .. _dag_example: diff --git a/docs/source-app/examples/hpo/hpo.rst b/docs/source-app/examples/hpo/hpo.rst index 2849a62653e42..568e17836194d 100644 --- a/docs/source-app/examples/hpo/hpo.rst +++ b/docs/source-app/examples/hpo/hpo.rst @@ -2,9 +2,9 @@ .. _hpo_example: -####################################################### -Build a Lightning Hyperparameter Optimization (HPO) App -####################################################### +######################################################### +Develop a Lightning Hyperparameter Optimization (HPO) App +######################################################### ******************* A bit of background diff --git a/docs/source-app/examples/model_server_app/model_server.rst b/docs/source-app/examples/model_server_app/model_server.rst index b1daa00d427d4..283dc97bc99e3 100644 --- a/docs/source-app/examples/model_server_app/model_server.rst +++ b/docs/source-app/examples/model_server_app/model_server.rst @@ -1,8 +1,8 @@ :orphan: -*********************************** -2. Build the Model Server Component -*********************************** +************************************* +2. Develop the Model Server Component +************************************* In the code below, we use `MLServer `_ which aims to provide an easy way to start serving your machine learning models through a REST and gRPC interface, fully compliant with KFServing's V2 Dataplane spec. @@ -19,7 +19,7 @@ fully compliant with KFServing's V2 Dataplane spec. .. Add callout items below this line .. displayitem:: - :header: 1. Build a Train Component + :header: 1. Develop a Train Component :description: Train a model and store its checkpoints with SKlearn :col_css: col-md-4 :button_link: train.html @@ -27,7 +27,7 @@ fully compliant with KFServing's V2 Dataplane spec. :tag: Intermediate .. displayitem:: - :header: 3. Build a Load Testing Component + :header: 3. Develop a Load Testing Component :description: Use Locust to test your model servers :col_css: col-md-4 :button_link: load_testing.html @@ -36,7 +36,7 @@ fully compliant with KFServing's V2 Dataplane spec. .. displayitem:: :header: 4. Putting everything together. 
- :description: Ensemble the components together and run the app + :description: Ensemble the Components together and run the App :col_css: col-md-4 :button_link: putting_everything_together.html :height: 150 diff --git a/docs/source-app/examples/model_server_app/model_server_app.rst b/docs/source-app/examples/model_server_app/model_server_app.rst index 09d361992920d..933c89d035b00 100644 --- a/docs/source-app/examples/model_server_app/model_server_app.rst +++ b/docs/source-app/examples/model_server_app/model_server_app.rst @@ -2,9 +2,9 @@ .. _model_server_example: -#################### -Build a Model Server -#################### +###################### +Develop a Model Server +###################### **Audience:** Users who want to serve their trained models. diff --git a/docs/source-app/examples/model_server_app/putting_everything_together.rst b/docs/source-app/examples/model_server_app/putting_everything_together.rst index c11a5289d37ac..f74f1113945b4 100644 --- a/docs/source-app/examples/model_server_app/putting_everything_together.rst +++ b/docs/source-app/examples/model_server_app/putting_everything_together.rst @@ -43,15 +43,15 @@ Find more examples .. Add callout items below this line .. displayitem:: - :header: Build a DAG - :description: Create a dag pipeline + :header: Develop a DAG + :description: Develop a DAG pipeline :col_css: col-md-4 :button_link: ../dag/dag.html :height: 150 :tag: Intermediate .. displayitem:: - :header: Build a File Server + :header: Develop a File Server :description: Train multiple models with different parameters :col_css: col-md-4 :button_link: ../file_server/file_server.html @@ -59,7 +59,7 @@ Find more examples :tag: Intermediate .. displayitem:: - :header: Build a Github Repo Script Runner + :header: Develop a Github Repo Script Runner :description: Run code from the internet in the cloud :col_css: col-md-4 :button_link: ../github_repo_runner/github_repo_runner.html @@ -67,7 +67,7 @@ Find more examples :tag: Intermediate .. displayitem:: - :header: Build a HPO Sweeper + :header: Develop a HPO Sweeper :description: Train multiple models with different parameters :col_css: col-md-4 :button_link: ../hpo/hpo.html diff --git a/docs/source-app/index.rst b/docs/source-app/index.rst index 364f4034e5ae1..239288004c2a0 100644 --- a/docs/source-app/index.rst +++ b/docs/source-app/index.rst @@ -198,13 +198,13 @@ Keep Learning .. toctree:: :maxdepth: 1 - :caption: Practical Examples + :caption: Examples - Build a DAG - Build a File Server - Build a Github Repo Script Runner - Build a HPO Sweeper - Build a Model Server + Develop a DAG + Develop a File Server + Develop a Github Repo Script Runner + Develop a HPO Sweeper + Develop a Model Server .. [Docs under construction] Build a data exploring app @@ -214,19 +214,19 @@ Keep Learning .. toctree:: :maxdepth: 1 - :caption: Common Workflows + :caption: How to... 
Add a web user interface (UI) Add a web link Arrange app tabs - Build a Lightning app - Build a Lightning component + Develop a Lightning App + Develop a Lightning Component Cache Work run calls Customize your cloud compute Extend an existing app Publish a Lightning component Run a server within a Lightning App - Run an app on the cloud + Run an App on the cloud Run work in parallel Share an app Share files between components diff --git a/docs/source-app/workflows/build_lightning_app/from_pytorch_lightning_script.rst b/docs/source-app/workflows/build_lightning_app/from_pytorch_lightning_script.rst index 798212bf7dc95..2fcfb71162c3f 100644 --- a/docs/source-app/workflows/build_lightning_app/from_pytorch_lightning_script.rst +++ b/docs/source-app/workflows/build_lightning_app/from_pytorch_lightning_script.rst @@ -1,27 +1,28 @@ -####################################### -Build app from PyTorch Lightning script -####################################### +####################################################### +Develop a Lightning App from a PyTorch Lightning script +####################################################### -**Audience:** Users who want to build an app from their PyTorch Lightning scripts. +**Audience:** Users who want to develop a Lightning App (App) from their PyTorch Lightning (PL) scripts. ---- -*********************************************** -Why do I want to build an app from a PL script? -*********************************************** -Generating an app from a PL script allows you to immediately run on the cloud and share the progress with friends. +************************************************************* +What developing a Lightning App from a PL script does for you +************************************************************* + +Developing an App from a PL script allows you to immediately run on the cloud and share the progress with friends. Once you're happy with your model, you can immediately expand beyond just model development to things like -making your own inference APIs, research demos or even speeding up your data pipeline. +making your own inference APIs, research demos, or even speeding up your data pipeline. -The PyTorch Lightning app is your entry point to the full end-to-end ML licefycle. +The PyTorch Lightning App is your entry point to the full end-to-end ML licefycle. ---- -******************* -Generate a template -******************* +****************** +Develop a template +****************** -To generate a template from a PyTorch Lightning script, use this command: +To develop a template from a PyTorch Lightning script, use this command: .. code:: bash @@ -35,22 +36,23 @@ If your script is not at the root of the project folder, and you'd like to inclu lightning init pl-app path/to/project/root path/to/the/pl_script.py -The default trainer app lets you train a model with a beautiful UI locally and on the cloud with zero effort! +The default trainer App lets you train a model with a beautiful UI locally and on the cloud with zero effort! ---- *********** -Run the app +Run the App *********** -.. note:: this page is under construction -Run the app locally: +.. note:: This section is under construction. + +Run the App locally: .. code:: bash lightning run app pl-app/app.py -Or run it on the cloud so you can share with collaborators and even use all the cloud GPUs you want +Or run the App on the cloud so you can share with collaborators and even use all the cloud GPUs you want. .. 
code:: bash @@ -67,7 +69,7 @@ Or run it on the cloud so you can share with collaborators and even use all the Modify the template ******************* -The command above generates an app file like this: +The command above generates an App file like this: .. note:: TODO: list the file and show how to extend it @@ -99,7 +101,7 @@ Now you can add your own components as you wish! Known issues ************ -- The UI takes a couple seconds to load when opening the app, be patient. +- The UI takes a couple seconds to load when opening the App, so please be patient. - The timer resets when refreshing the page. - The UI for adding new environment variables does not provide an option to delete an entry. - A bug exists that leaves the script hanging at the start of training when using the DDP strategy. diff --git a/docs/source-app/workflows/build_lightning_app/from_scratch.rst b/docs/source-app/workflows/build_lightning_app/from_scratch.rst index 38335cff9d386..9042f105711cd 100644 --- a/docs/source-app/workflows/build_lightning_app/from_scratch.rst +++ b/docs/source-app/workflows/build_lightning_app/from_scratch.rst @@ -1,7 +1,8 @@ -################################## -Build a Lightning App from Scratch -################################## -**Audience:** Users who want to build a Lightning App from scratch. +#################################### +Develop a Lightning App from Scratch +#################################### + +**Audience:** Users who want to develop a Lightning App from scratch. **Prereqs:** You must have finished the `Basic levels `_. diff --git a/docs/source-app/workflows/build_lightning_app/index.rst b/docs/source-app/workflows/build_lightning_app/index.rst index e4518d357c2cc..0af121d489b1e 100644 --- a/docs/source-app/workflows/build_lightning_app/index.rst +++ b/docs/source-app/workflows/build_lightning_app/index.rst @@ -1,7 +1,8 @@ -##################### -Build a Lightning app -##################### -A Lightning app is a collection of components interacting together. Learn how to generate a basic app template. +####################### +Develop a Lightning App +####################### + +A Lightning App (App) is a collection of components interacting together. Learn how to develop a basic App template. ---- diff --git a/docs/source-app/workflows/build_lightning_app/index_content.rst b/docs/source-app/workflows/build_lightning_app/index_content.rst index cc9c80bd957e7..45264d85a4adc 100644 --- a/docs/source-app/workflows/build_lightning_app/index_content.rst +++ b/docs/source-app/workflows/build_lightning_app/index_content.rst @@ -11,16 +11,16 @@
.. displayitem:: - :header: Build a Lightning app from scratch - :description: Learn how to generate a Lightning app from scratch + :header: Develop a Lightning App from scratch + :description: Learn how to Develop a Lightning App from scratch :col_css: col-md-6 :button_link: from_scratch.html :height: 150 :tag: basic .. displayitem:: - :header: Build app from a PyTorch Lightning script - :description: Share your PyTorch Lightning training on the cloud, run on cloud GPUs, or extend your app + :header: Develop an App from a PyTorch Lightning script + :description: Share your PyTorch Lightning training on the cloud, run on cloud GPUs, or extend your App :col_css: col-md-6 :button_link: from_pytorch_lightning_script.html :height: 150 diff --git a/docs/source-app/workflows/build_lightning_component/basic.rst b/docs/source-app/workflows/build_lightning_component/basic.rst index b72b9147e2f07..07fac58cf21da 100644 --- a/docs/source-app/workflows/build_lightning_component/basic.rst +++ b/docs/source-app/workflows/build_lightning_component/basic.rst @@ -1,7 +1,8 @@ -########################### -Build a Lightning component -########################### -**Audience:** Users who want to build a Lightning component. +############################# +Develop a Lightning Component +############################# + +**Audience:** Users who want to develop a Lightning Component. ---- diff --git a/docs/source-app/workflows/build_lightning_component/from_scratch_component_content.rst b/docs/source-app/workflows/build_lightning_component/from_scratch_component_content.rst index 002816b0743ef..f3168e566823d 100644 --- a/docs/source-app/workflows/build_lightning_component/from_scratch_component_content.rst +++ b/docs/source-app/workflows/build_lightning_component/from_scratch_component_content.rst @@ -32,9 +32,9 @@ Use a **LightningWork** component for any programming logic that takes more than ---- -************************************************ -What building a Lightning component does for you -************************************************ +************************************************** +What developing a Lightning Component does for you +************************************************** Lightning Components break up complex systems into modular components. The first obvious benefit is that components can be reused across other apps. This means you can build once, test it and forget it. diff --git a/docs/source-app/workflows/build_lightning_component/index.rst b/docs/source-app/workflows/build_lightning_component/index.rst index f72c461ff8553..9ff726a9486b1 100644 --- a/docs/source-app/workflows/build_lightning_component/index.rst +++ b/docs/source-app/workflows/build_lightning_component/index.rst @@ -1,7 +1,8 @@ -########################### -Build a Lightning component -########################### -A Lightning app is a collection of components interacting together. Learn how to build a Lightning component in this section. +############################# +Develop a Lightning Component +############################# + +A Lightning App (App) is a collection of components interacting together. Learn how to build a Lightning Component (Component) in this section. 
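As a concrete sketch of that idea, a minimal App wiring one Flow and one Work together could look roughly like the following; the class names and the word-counting logic are purely illustrative.

.. code:: python

    import lightning as L


    class WordCounter(L.LightningWork):
        def __init__(self):
            super().__init__()
            self.num_words = 0  # plain attributes on a Work become part of its shared state

        def run(self, text: str):
            # Works hold the longer-running logic and report results back through their state
            self.num_words = len(text.split())


    class RootFlow(L.LightningFlow):
        def __init__(self):
            super().__init__()
            self.counter = WordCounter()

        def run(self):
            # The Flow orchestrates the Components it owns
            self.counter.run("hello lightning apps")


    app = L.LightningApp(RootFlow())

Saved as ``app.py``, this sketch runs locally with ``lightning run app app.py``.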
---- diff --git a/docs/source-app/workflows/build_lightning_component/index_content.rst b/docs/source-app/workflows/build_lightning_component/index_content.rst index 00b4e99ad1e5e..abf26e38a2514 100644 --- a/docs/source-app/workflows/build_lightning_component/index_content.rst +++ b/docs/source-app/workflows/build_lightning_component/index_content.rst @@ -28,16 +28,16 @@ Basics
.. displayitem:: - :header: Build a Lightning component - :description: Learn the basics of building a Lightning component + :header: Develop a Lightning Component + :description: Learn the basics of developing a Lightning Component :col_css: col-md-4 :button_link: basic.html :height: 150 :tag: basic .. displayitem:: - :header: Explore community Lightning components - :description: Discover community-built Lightning components + :header: Explore community Lightning Components + :description: Discover community-built Lightning Components :col_css: col-md-4 :button_link: https://lightning.ai/components :height: 150 diff --git a/docs/source-app/workflows/build_lightning_component/intermediate.rst b/docs/source-app/workflows/build_lightning_component/intermediate.rst index 070d3aa0caf1a..871224ba4fdee 100644 --- a/docs/source-app/workflows/build_lightning_component/intermediate.rst +++ b/docs/source-app/workflows/build_lightning_component/intermediate.rst @@ -1,14 +1,15 @@ -########################################## -Build a Lightning component (intermediate) -########################################## -**Audience:** Users who want to connect a UI to a Lightning component. +############################################ +Develop a Lightning Component (intermediate) +############################################ + +**Audience:** Users who want to connect a UI to a Lightning Component (Component). ---- ***************************** Add a web user interface (UI) ***************************** -Every lightning component can have its own user interface (UI). Lightning components support any kind +Every Lightning Component can have its own user interface (UI). Lightning Components support any kind of UI interface such as react.js, vue.js, streamlit, gradio, dash, web urls, etc...(`full list here <../add_web_ui/index.html>`_). Let's say that we have a user interface defined in html: @@ -24,7 +25,7 @@ Let's say that we have a user interface defined in html: -To *connect* this user interface to the component, define the configure_layout method: +To *connect* this user interface to the Component, define the configure_layout method: .. code:: python :emphasize-lines: 5, 6 @@ -37,7 +38,7 @@ To *connect* this user interface to the component, define the configure_layout m def configure_layout(self): return StaticWebFrontend(serve_dir="path/to/folder/with/index.html/inside") -Finally, route the component's UI through the root component's **configure_layout** method: +Finally, route the Component's UI through the root Component's **configure_layout** method: .. code:: python :emphasize-lines: 14 @@ -63,7 +64,7 @@ Finally, route the component's UI through the root component's **configure_layou app = L.LightningApp(LitApp()) -Run your app and you'll see the UI on the Lightning App view: +Run your App and you'll see the UI on the Lightning App view: .. 
code:: bash diff --git a/docs/source-app/workflows/build_lightning_component/publish_a_component.rst b/docs/source-app/workflows/build_lightning_component/publish_a_component.rst index 8364afd3d59ce..bb5ec755ae190 100644 --- a/docs/source-app/workflows/build_lightning_component/publish_a_component.rst +++ b/docs/source-app/workflows/build_lightning_component/publish_a_component.rst @@ -1,17 +1,19 @@ ############################# -Publish a Lightning component +Publish a Lightning Component ############################# -**Audience:** Users who want to build a component to publish to the Lightning Gallery + +**Audience:** Users who want to build a Ligthtning Component (Component) to publish to the Lightning Gallery ---- -******************************** -Generate component from template -******************************** -The fastest way to build a component that is ready to be published to the component Gallery is to use +*********************************** +Develop a Component from a template +*********************************** + +The fastest way to build a Component that is ready to be published to the component Gallery is to use the default template. -Generate your component template with this command: +Generate your Component template with this command: .. code:: python @@ -20,9 +22,10 @@ Generate your component template with this command: ---- ***************** -Run the component +Run the Component ***************** -To test that your component works, first install all dependencies: + +To test that your Component works, first install all dependencies: .. code:: bash @@ -30,7 +33,7 @@ To test that your component works, first install all dependencies: pip install -r requirements.txt pip install -e . -Now import your component and use it in an app: +Now import your Component and use it in a Lightning App: .. code:: python From 5f1cb913bb4cb9d3aac4ed27c4cab57e7a45a7d4 Mon Sep 17 00:00:00 2001 From: Mansy Date: Wed, 3 Aug 2022 22:24:51 +0200 Subject: [PATCH 080/230] Relax lightning app dependency requirements (#13998) * [App] Relax lightning app requirements Co-authored-by: manskx --- requirements/app/base.txt | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/requirements/app/base.txt b/requirements/app/base.txt index 506a4a8837d60..e86111f0c0548 100644 --- a/requirements/app/base.txt +++ b/requirements/app/base.txt @@ -1,10 +1,8 @@ py -jinja2==3.0.3 lightning-cloud==0.5.0 packaging -deepdiff >= 5.7.0 +deepdiff>= 5.7.0 starsessions -fsspec==2022.01.0 -s3fs==2022.1.0 +fsspec>=2022.01.0 +s3fs>=2022.1.0 croniter # for now until we found something more robust. 
-traitlets < 5.2.0 # Traitlets 5.2.X fails: https://github.com/ipython/traitlets/issues/741 From e513638de877aaefa3069afd53a2bd8a22d7d081 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 3 Aug 2022 23:43:51 +0200 Subject: [PATCH 081/230] adding explain notes for requirements (#13872) * adding explain notes for requirements * Apply suggestions from code review Co-authored-by: Rohit Gupta --- requirements/README.md | 13 +++++++++++++ requirements/app/base.txt | 3 ++- requirements/app/cloud.txt | 2 +- requirements/pytorch/base.txt | 3 +++ requirements/pytorch/examples.txt | 3 +++ requirements/pytorch/extra.txt | 3 +++ requirements/pytorch/loggers.txt | 3 +++ requirements/pytorch/strategies.txt | 3 +++ 8 files changed, 31 insertions(+), 2 deletions(-) create mode 100644 requirements/README.md diff --git a/requirements/README.md b/requirements/README.md new file mode 100644 index 0000000000000..24ce0561689d9 --- /dev/null +++ b/requirements/README.md @@ -0,0 +1,13 @@ +# Project Requirements + +This root requirements folder branches into sub-folders depending on the python package. +Within the folder, we have grouped requirements files/lists per focus, which shall closely match package extra +So, for example, when you install PL as `pip install pytorch-lightning[loggers]`, this list is stored in `requirements/pytorch/loggers.txt`. +The only exceptional requirement file is `devel.txt`, which aggregated all the needed requirements for development. + +## CI/CD upper bounds + +For Ci stability, we have set for all package versions upper bounds (the latest version), so with any sudden release, we won't put our development on fire. +The continues updated of these upper bounds are managed by dependabot. +Note that these upper bounds are lifters when installing a package from the source or as a package. +If you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment. diff --git a/requirements/app/base.txt b/requirements/app/base.txt index e86111f0c0548..0a0b9cdb4719d 100644 --- a/requirements/app/base.txt +++ b/requirements/app/base.txt @@ -1,8 +1,9 @@ py lightning-cloud==0.5.0 packaging -deepdiff>= 5.7.0 +deepdiff>=5.7.0 starsessions fsspec>=2022.01.0 s3fs>=2022.1.0 croniter # for now until we found something more robust. 
+traitlets<5.2.0 # Traitlets 5.2.X fails: https://github.com/ipython/traitlets/issues/741 diff --git a/requirements/app/cloud.txt b/requirements/app/cloud.txt index dc396d72c6cb3..8dac80ef23432 100644 --- a/requirements/app/cloud.txt +++ b/requirements/app/cloud.txt @@ -1,5 +1,5 @@ starsessions redis==4.1.0 docker==5.0.3 -setuptools == 59.5.0 +setuptools==59.5.0 s3fs==2022.1.0 diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index 41a712c930cca..e8743b18c73b0 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -1,3 +1,6 @@ +# NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package +# in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment + numpy>=1.17.2, <1.23.1 torch>=1.9.*, <=1.12.0 tqdm>=4.57.0, <=4.63.0 diff --git a/requirements/pytorch/examples.txt b/requirements/pytorch/examples.txt index 223a9e0117299..288e3a10889c3 100644 --- a/requirements/pytorch/examples.txt +++ b/requirements/pytorch/examples.txt @@ -1,3 +1,6 @@ +# NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package +# in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment + torchvision>=0.10.*, <=0.13.0 gym[classic_control]>=0.17.0, <0.24.2 ipython[all] <=8.1.1 diff --git a/requirements/pytorch/extra.txt b/requirements/pytorch/extra.txt index f956d021976a6..c386c5581cc42 100644 --- a/requirements/pytorch/extra.txt +++ b/requirements/pytorch/extra.txt @@ -1,3 +1,6 @@ +# NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package +# in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment + # extended list of package dependencies to reach full functionality matplotlib>3.1, <3.5.3 torchtext>=0.10.*, <0.14.0 diff --git a/requirements/pytorch/loggers.txt b/requirements/pytorch/loggers.txt index a857ab5660d54..7d89449318b28 100644 --- a/requirements/pytorch/loggers.txt +++ b/requirements/pytorch/loggers.txt @@ -1,3 +1,6 @@ +# NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package +# in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment + # all supported loggers neptune-client>=0.10.0, <0.16.4 diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt index 4eafac99b8c66..4e916fbc6c61f 100644 --- a/requirements/pytorch/strategies.txt +++ b/requirements/pytorch/strategies.txt @@ -1,3 +1,6 @@ +# NOTE: the upper bound for the package version is only set for CI stability, and it is dropped while installing this package +# in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment + fairscale>=0.4.5, <=0.4.6 deepspeed>=0.6.0, <0.7.0 # no need to install with [pytorch] as pytorch is already installed From 7fe5d4b2dd1487a9d425442c217fa2510d5aa08f Mon Sep 17 00:00:00 2001 From: Lee Jungwon <33821003+BongYang@users.noreply.github.com> Date: Thu, 4 Aug 2022 08:44:57 +0900 Subject: [PATCH 082/230] Fix mypy typing errors in strategies/fully_sharded.py (#13941) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli 
Co-authored-by: otaj <6065855+otaj@users.noreply.github.com> --- pyproject.toml | 1 - .../strategies/fully_sharded.py | 26 ++++++++++++------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 352fb33f53ac9..226b109459f24 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -61,7 +61,6 @@ module = [ "pytorch_lightning.profilers.pytorch", "pytorch_lightning.profilers.simple", "pytorch_lightning.strategies.ddp", - "pytorch_lightning.strategies.fully_sharded", "pytorch_lightning.strategies.sharded", "pytorch_lightning.strategies.sharded_spawn", "pytorch_lightning.trainer.callback_hook", diff --git a/src/pytorch_lightning/strategies/fully_sharded.py b/src/pytorch_lightning/strategies/fully_sharded.py index 971441160d333..283e5e6a868cc 100644 --- a/src/pytorch_lightning/strategies/fully_sharded.py +++ b/src/pytorch_lightning/strategies/fully_sharded.py @@ -13,7 +13,7 @@ # limitations under the License. import contextlib import logging -from typing import Dict, Generator, List, Optional +from typing import Any, Dict, Generator, List, Optional import torch @@ -27,7 +27,7 @@ from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.optimizer import optimizers_to_device -from pytorch_lightning.utilities.types import STEP_OUTPUT +from pytorch_lightning.utilities.types import PredictStep, STEP_OUTPUT, TestStep, TrainingStep, ValidationStep if _FAIRSCALE_FULLY_SHARDED_AVAILABLE: from fairscale.nn import default_auto_wrap_policy, enable_wrap @@ -124,7 +124,7 @@ def __init__( self._process_group = None @property - def process_group(self): + def process_group(self) -> Any: if self._process_group is None: self._process_group = torch.distributed.new_group() return self._process_group @@ -137,6 +137,7 @@ def setup_distributed(self) -> None: super().setup_distributed() def setup(self, trainer: "pl.Trainer") -> None: + assert self.accelerator self.accelerator.setup(trainer) if trainer.state.fn == TrainerFn.FITTING: @@ -144,6 +145,7 @@ def setup(self, trainer: "pl.Trainer") -> None: optimizers_to_device(self.optimizers, self.root_device) if self._layer_sync: + assert self.model self.model = self._layer_sync.apply(self.model) self.setup_precision_plugin() @@ -155,7 +157,7 @@ def model_sharded_context(self) -> Generator: log.detail(f"{self.__class__.__name__}: entered model_sharded_context.") precision = self.precision_plugin.precision - def wrap_policy(*args, **kwargs): + def wrap_policy(*args: Any, **kwargs: Any) -> Any: return default_auto_wrap_policy(*args, **kwargs, min_num_params=self.min_num_params) with enable_wrap( @@ -186,30 +188,36 @@ def configure_ddp(self) -> None: self.model_to_device() # setup optimizers after fully sharded has wrapped the lightning module + assert self.lightning_module self.setup_optimizers(self.lightning_module.trainer) def model_to_device(self) -> None: log.detail(f"{self.__class__.__name__}: moving model to device [{self.root_device}]...") # ensure we update the device type in the lightning module + assert self.lightning_module self.lightning_module.to(self.root_device) - def training_step(self, *args, **kwargs) -> STEP_OUTPUT: + def training_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: with self.precision_plugin.train_step_context(): + assert isinstance(self.model, TrainingStep) return self.model.training_step(*args, **kwargs) - def validation_step(self, *args, **kwargs) -> Optional[STEP_OUTPUT]: + def 
validation_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: with self.precision_plugin.val_step_context(): + assert isinstance(self.model, ValidationStep) return self.model.validation_step(*args, **kwargs) - def test_step(self, *args, **kwargs) -> Optional[STEP_OUTPUT]: + def test_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: with self.precision_plugin.test_step_context(): + assert isinstance(self.model, TestStep) return self.model.test_step(*args, **kwargs) - def predict_step(self, *args, **kwargs) -> STEP_OUTPUT: + def predict_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: with self.precision_plugin.predict_step_context(): + assert isinstance(self.model, PredictStep) return self.model.predict_step(*args, **kwargs) - def post_training_step(self): + def post_training_step(self) -> None: pass @classmethod From 9301a6b16751fb2948add8aaeb99d47fcd96ae51 Mon Sep 17 00:00:00 2001 From: "Thong Q. Nguyen" Date: Wed, 3 Aug 2022 18:49:29 -0700 Subject: [PATCH 083/230] Remove fp16 restriction in the docstring for DeepSpeedStrategy (#13919) Co-authored-by: Akihiro Nitta Co-authored-by: awaelchli --- src/pytorch_lightning/strategies/deepspeed.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/pytorch_lightning/strategies/deepspeed.py b/src/pytorch_lightning/strategies/deepspeed.py index e7fbcf91967fc..2d32503dd406a 100644 --- a/src/pytorch_lightning/strategies/deepspeed.py +++ b/src/pytorch_lightning/strategies/deepspeed.py @@ -152,7 +152,8 @@ def __init__( Arguments: - zero_optimization: Enable ZeRO optimization. This is only compatible with precision=16. + zero_optimization: Enable ZeRO optimization. This is compatible with either `precision=16` or + `precision="bf16"`. stage: Different stages of the ZeRO Optimizer. 0 is disabled, 1 is optimizer state partitioning, 2 is optimizer+gradient state partitioning, From 332182d491f258f3051e1490ce0e67a7e873b779 Mon Sep 17 00:00:00 2001 From: Alec Merdler Date: Thu, 4 Aug 2022 03:31:07 -0400 Subject: [PATCH 084/230] Fix button selector for Lightning app e2e tests (#13984) --- src/lightning_app/testing/testing.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/src/lightning_app/testing/testing.py b/src/lightning_app/testing/testing.py index dd34614a34353..97f2762b02e42 100644 --- a/src/lightning_app/testing/testing.py +++ b/src/lightning_app/testing/testing.py @@ -247,13 +247,20 @@ def run_app_in_cloud(app_folder: str, app_name: str = "app.py") -> Generator: [LIGHTNING_CLOUD_PROJECT_ID], ) admin_page.goto(f"{Config.url}/{Config.username}/apps") + + # Closing the Create Project dialog. 
try: - # Closing the Create Project modal - button = admin_page.locator('button:has-text("Cancel")') + project_dialog = admin_page.locator("text=Create a project") + project_dialog.wait_for(timeout=10 * 1000, state="visible") + print("'Create Project' dialog visible, closing it.") + project_name_input = admin_page.locator('input[type="text"]') + project_name_input.fill("Default Project") + button = admin_page.locator('button:has-text("Continue")') button.wait_for(timeout=3 * 1000) button.click() - except (playwright._impl._api_types.Error, playwright._impl._api_types.TimeoutError): - pass + except playwright._impl._api_types.TimeoutError: + print("'Create Project' dialog not visible, skipping.") + admin_page.locator(f"text={name}").click() admin_page.evaluate( """data => { From e78bf2044b48ac92f45f058ce828a05b7cd3b6d1 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Thu, 4 Aug 2022 17:34:42 +0530 Subject: [PATCH 085/230] Raise an error if batch transfer hooks are overridden with IPUAccelerator (#13961) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- .../accelerators/gpu_intermediate.rst | 5 ++- .../source-pytorch/accelerators/ipu_basic.rst | 3 ++ src/pytorch_lightning/CHANGELOG.md | 2 +- src/pytorch_lightning/core/hooks.py | 9 ++++ .../trainer/configuration_validator.py | 12 +++-- tests/tests_pytorch/strategies/test_dp.py | 45 ------------------- .../trainer/test_config_validator.py | 28 ++++++++++++ 7 files changed, 53 insertions(+), 51 deletions(-) diff --git a/docs/source-pytorch/accelerators/gpu_intermediate.rst b/docs/source-pytorch/accelerators/gpu_intermediate.rst index 83d7d1f60c891..f2c5f95ab95aa 100644 --- a/docs/source-pytorch/accelerators/gpu_intermediate.rst +++ b/docs/source-pytorch/accelerators/gpu_intermediate.rst @@ -47,8 +47,9 @@ after which the root node will aggregate the results. :doc:`Manual Optimization <../model/manual_optimization>` with DP. Use DDP which is more stable and at least 3x faster. .. warning:: DP only supports scattering and gathering primitive collections of tensors like lists, dicts, etc. - Therefore the :meth:`~pytorch_lightning.core.hooks.ModelHooks.transfer_batch_to_device` hook does not apply in - this mode and if you have overridden it, it will not be called. + Therefore the hooks :meth:`~pytorch_lightning.core.hooks.ModelHooks.on_before_batch_transfer`, + :meth:`~pytorch_lightning.core.hooks.ModelHooks.transfer_batch_to_device` and :meth:`~pytorch_lightning.core.hooks.ModelHooks.on_after_batch_transfer` + do not apply in this mode and if you have overridden any of them, an exception will be raised. .. 
testcode:: :skipif: torch.cuda.device_count() < 2 diff --git a/docs/source-pytorch/accelerators/ipu_basic.rst b/docs/source-pytorch/accelerators/ipu_basic.rst index 99a5c69a10417..5302945fc6cc4 100644 --- a/docs/source-pytorch/accelerators/ipu_basic.rst +++ b/docs/source-pytorch/accelerators/ipu_basic.rst @@ -67,3 +67,6 @@ Please see the `MNIST example None: elif trainer.state.fn == TrainerFn.PREDICTING: __verify_eval_loop_configuration(trainer, model, "predict") - __verify_dp_batch_transfer_support(trainer, model) + __verify_batch_transfer_support(trainer, model) _check_deprecated_callback_hooks(trainer) # TODO: Delete _check_on_hpc_hooks in v1.8 _check_on_hpc_hooks(model) @@ -148,17 +149,22 @@ def __verify_eval_loop_configuration(trainer: "pl.Trainer", model: "pl.Lightning raise MisconfigurationException(f"No `{step_name}()` method defined to run `Trainer.{trainer_method}`.") -def __verify_dp_batch_transfer_support(trainer: "pl.Trainer", model: "pl.LightningModule") -> None: +def __verify_batch_transfer_support(trainer: "pl.Trainer", model: "pl.LightningModule") -> None: """Raise Misconfiguration exception since these hooks are not supported in DP mode.""" - # TODO: Remove this blocker once batch transfer to device is integrated in Lightning for DP mode. batch_transfer_hooks = ("on_before_batch_transfer", "transfer_batch_to_device", "on_after_batch_transfer") datahook_selector = trainer._data_connector._datahook_selector for hook in batch_transfer_hooks: + # TODO: Remove this blocker once batch transfer to device is integrated in Lightning for DP mode. if isinstance(trainer.strategy, DataParallelStrategy) and ( is_overridden(hook, datahook_selector.model) or is_overridden(hook, datahook_selector.datamodule) ): raise MisconfigurationException(f"Overriding `{hook}` is not supported in DP mode.") + if isinstance(trainer.accelerator, IPUAccelerator) and ( + is_overridden(hook, datahook_selector.model) or is_overridden(hook, datahook_selector.datamodule) + ): + raise MisconfigurationException(f"Overriding `{hook}` is not supported with IPUs.") + def __verify_manual_optimization_support(trainer: "pl.Trainer", model: "pl.LightningModule") -> None: if model.automatic_optimization: diff --git a/tests/tests_pytorch/strategies/test_dp.py b/tests/tests_pytorch/strategies/test_dp.py index 30e0e5b19a845..502003201e169 100644 --- a/tests/tests_pytorch/strategies/test_dp.py +++ b/tests/tests_pytorch/strategies/test_dp.py @@ -11,9 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
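The guard described above can be exercised with only a few lines. A rough sketch, assuming a machine with at least two CUDA devices; the model name and the constant offset are illustrative:

.. code:: python

    import pytest

    from pytorch_lightning import Trainer
    from pytorch_lightning.demos.boring_classes import BoringModel
    from pytorch_lightning.utilities.exceptions import MisconfigurationException


    class ShiftBatchModel(BoringModel):
        # Overriding any of the three batch-transfer hooks triggers the new check
        def on_after_batch_transfer(self, batch, dataloader_idx):
            return batch + 1


    trainer = Trainer(accelerator="gpu", devices=2, strategy="dp", max_steps=1)
    with pytest.raises(MisconfigurationException, match="Overriding `on_after_batch_transfer` is not"):
        trainer.fit(ShiftBatchModel())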
-from unittest import mock -import pytest import torch import torch.nn.functional as F from torch.utils.data import DataLoader @@ -21,10 +19,8 @@ import pytorch_lightning as pl import tests_pytorch.helpers.pipelines as tpipes import tests_pytorch.helpers.utils as tutils -from pytorch_lightning import Trainer from pytorch_lightning.callbacks import EarlyStopping from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset -from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests_pytorch.helpers.datamodules import ClassifDataModule from tests_pytorch.helpers.runif import RunIf from tests_pytorch.helpers.simple_models import ClassificationModel @@ -154,47 +150,6 @@ def _assert_extra_outputs(self, outputs): assert out.dtype is torch.float -@mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=2) -@mock.patch("pytorch_lightning.utilities.device_parser.is_cuda_available", return_value=True) -def test_dp_raise_exception_with_batch_transfer_hooks(mock_is_available, mock_device_count, tmpdir): - """Test that an exception is raised when overriding batch_transfer_hooks in DP model.""" - - class CustomModel(BoringModel): - def transfer_batch_to_device(self, batch, device, dataloader_idx): - batch = batch.to(device) - return batch - - trainer_options = dict(default_root_dir=tmpdir, max_steps=7, accelerator="gpu", devices=[0, 1], strategy="dp") - - trainer = Trainer(**trainer_options) - model = CustomModel() - - with pytest.raises(MisconfigurationException, match=r"Overriding `transfer_batch_to_device` is not .* in DP"): - trainer.fit(model) - - class CustomModel(BoringModel): - def on_before_batch_transfer(self, batch, dataloader_idx): - batch += 1 - return batch - - trainer = Trainer(**trainer_options) - model = CustomModel() - - with pytest.raises(MisconfigurationException, match=r"Overriding `on_before_batch_transfer` is not .* in DP"): - trainer.fit(model) - - class CustomModel(BoringModel): - def on_after_batch_transfer(self, batch, dataloader_idx): - batch += 1 - return batch - - trainer = Trainer(**trainer_options) - model = CustomModel() - - with pytest.raises(MisconfigurationException, match=r"Overriding `on_after_batch_transfer` is not .* in DP"): - trainer.fit(model) - - @RunIf(min_cuda_gpus=2) def test_dp_training_step_dict(tmpdir): """This test verifies that dp properly reduces dictionaries.""" diff --git a/tests/tests_pytorch/trainer/test_config_validator.py b/tests/tests_pytorch/trainer/test_config_validator.py index a2f24f3addc31..bb973fe10ca1c 100644 --- a/tests/tests_pytorch/trainer/test_config_validator.py +++ b/tests/tests_pytorch/trainer/test_config_validator.py @@ -14,9 +14,11 @@ import pytest import torch +import pytorch_lightning as pl from pytorch_lightning import LightningDataModule, LightningModule, Trainer from pytorch_lightning.callbacks.callback import Callback from pytorch_lightning.demos.boring_classes import BoringDataModule, BoringModel, RandomDataset +from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.warnings import PossibleUserWarning @@ -192,3 +194,29 @@ def setup(self, pl_module, trainer): with pytest.raises(MisconfigurationException, match="does not have a `stage` argument"): trainer.fit(model) + + +@pytest.mark.parametrize("trainer_kwargs", [{"accelerator": "ipu"}, {"accelerator": "gpu", "strategy": "dp"}]) +@pytest.mark.parametrize("hook", ["on_before_batch_transfer", 
"transfer_batch_to_device", "on_after_batch_transfer"]) +def test_raise_exception_with_batch_transfer_hooks(monkeypatch, hook, trainer_kwargs, tmpdir): + """Test that an exception is raised when overriding batch_transfer_hooks.""" + if trainer_kwargs.get("accelerator") == "gpu": + match_pattern = rf"Overriding `{hook}` is not .* in DP mode." + monkeypatch.setattr(device_parser, "is_cuda_available", lambda: True) + monkeypatch.setattr(device_parser, "num_cuda_devices", lambda: 2) + elif trainer_kwargs.get("accelerator") == "ipu": + match_pattern = rf"Overriding `{hook}` is not .* with IPUs" + monkeypatch.setattr(pl.accelerators.ipu.IPUAccelerator, "is_available", lambda _: True) + monkeypatch.setattr(pl.strategies.ipu, "_IPU_AVAILABLE", lambda: True) + + def custom_method(self, batch, *_, **__): + batch = batch + 1 + return batch + + trainer = Trainer(default_root_dir=tmpdir, **trainer_kwargs) + + model = BoringModel() + setattr(model, hook, custom_method) + + with pytest.raises(MisconfigurationException, match=match_pattern): + trainer.fit(model) From 8cd6c3cf9deb98ae06a413584e4bfb40ef0e9576 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 4 Aug 2022 14:40:19 +0200 Subject: [PATCH 086/230] drop block contribution do app (#14010) --- .github/workflows/ci-app_block.yml | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 .github/workflows/ci-app_block.yml diff --git a/.github/workflows/ci-app_block.yml b/.github/workflows/ci-app_block.yml deleted file mode 100644 index 75fa50794b03b..0000000000000 --- a/.github/workflows/ci-app_block.yml +++ /dev/null @@ -1,21 +0,0 @@ -name: Block app edits - -on: ["pull_request"] - -jobs: - block: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: "2" # To retrieve the preceding commit. - - - name: Get changed files using defaults - id: changed-files - uses: tj-actions/changed-files@v24 - - - name: List all added files - run: | - for file in ${{ steps.changed-files.outputs.all_changed_and_modified_files }}; do - echo "$file" - done From b8739a0167b3da3539778dc09818f871f91eb472 Mon Sep 17 00:00:00 2001 From: Mansy Date: Thu, 4 Aug 2022 16:25:41 +0200 Subject: [PATCH 087/230] Deprecate sheety API (#14004) * deprecate sheety Co-authored-by: manskx --- src/lightning_app/cli/cmd_install.py | 21 +++++++++------- src/lightning_app/core/constants.py | 3 +++ src/lightning_app/runners/cloud.py | 2 +- tests/tests_app/cli/test_cmd_install.py | 32 ++++++++++++------------- 4 files changed, 33 insertions(+), 25 deletions(-) diff --git a/src/lightning_app/cli/cmd_install.py b/src/lightning_app/cli/cmd_install.py index 4fbaefd924544..f15567bd8470c 100644 --- a/src/lightning_app/cli/cmd_install.py +++ b/src/lightning_app/cli/cmd_install.py @@ -8,6 +8,8 @@ import requests +from lightning_app.core.constants import LIGHTNING_APPS_PUBLIC_REGISTRY, LIGHTNING_COMPONENT_PUBLIC_REGISTRY + logger = logging.getLogger(__name__) @@ -295,8 +297,16 @@ def _validate_name(name, resource_type, example): def _resolve_resource(registry_url, name, version_arg, resource_type): + gallery_entries = [] try: url = requests.get(registry_url) + data = json.loads(url.text) + + if resource_type == "app": + gallery_entries = [a for a in data["apps"] if a["canDownloadSourceCode"]] + + elif resource_type == "component": + gallery_entries = data["components"] except requests.ConnectionError: m = f""" Network connection error, could not load list of available Lightning {resource_type}s. 
@@ -306,12 +316,9 @@ def _resolve_resource(registry_url, name, version_arg, resource_type): sys.tracebacklimit = 0 raise SystemError(m) - data = json.loads(url.text) - data = data[resource_type + "s"] - entries = [] all_versions = [] - for x in data: + for x in gallery_entries: if name == x["name"]: entries.append(x) all_versions.append(x["version"]) @@ -473,12 +480,10 @@ def _install_component(git_url): def _resolve_app_registry(): - public_registry = "https://api.sheety.co/e559626ba514c7ba80caae1e38a8d4f4/lightningAppRegistry/apps" - registry = os.environ.get("LIGHTNING_APP_REGISTRY", public_registry) + registry = os.environ.get("LIGHTNING_APP_REGISTRY", LIGHTNING_APPS_PUBLIC_REGISTRY) return registry def _resolve_component_registry(): - public_registry = "https://api.sheety.co/e559626ba514c7ba80caae1e38a8d4f4/lightningAppRegistry/components" - registry = os.environ.get("LIGHTNING_COMPONENT_REGISTRY", public_registry) + registry = os.environ.get("LIGHTNING_COMPONENT_REGISTRY", LIGHTNING_COMPONENT_PUBLIC_REGISTRY) return registry diff --git a/src/lightning_app/core/constants.py b/src/lightning_app/core/constants.py index fd62de13cc013..84a0da49c21cf 100644 --- a/src/lightning_app/core/constants.py +++ b/src/lightning_app/core/constants.py @@ -32,6 +32,9 @@ LIGHTNING_CREDENTIAL_PATH = os.getenv("LIGHTNING_CREDENTIAL_PATH", str(Path.home() / ".lightning" / "credentials.json")) DOT_IGNORE_FILENAME = ".lightningignore" +LIGHTNING_COMPONENT_PUBLIC_REGISTRY = "https://lightning.ai/v1/components" +LIGHTNING_APPS_PUBLIC_REGISTRY = "https://lightning.ai/v1/apps" + def get_lightning_cloud_url() -> str: # DO NOT CHANGE! diff --git a/src/lightning_app/runners/cloud.py b/src/lightning_app/runners/cloud.py index 37feda0b514f2..9dfa37ca97e72 100644 --- a/src/lightning_app/runners/cloud.py +++ b/src/lightning_app/runners/cloud.py @@ -147,7 +147,7 @@ def dispatch( # There can be only one app with unique project_id<>name pair lightning_app = list_apps_resp.lightningapps[0] else: - app_body = Body7(name=app_config.name) + app_body = Body7(name=app_config.name, can_download_source_code=True) lightning_app = self.backend.client.lightningapp_v2_service_create_lightningapp_v2( project.project_id, app_body ) diff --git a/tests/tests_app/cli/test_cmd_install.py b/tests/tests_app/cli/test_cmd_install.py index 52a36dd2324f6..2d277ddb7790c 100644 --- a/tests/tests_app/cli/test_cmd_install.py +++ b/tests/tests_app/cli/test_cmd_install.py @@ -27,7 +27,8 @@ def test_valid_org_app_name(): assert result.exit_code # assert a good (and availablea name) works - real_app = "lightning/install-app" + # This should be an app that's always in the gallery + real_app = "lightning/invideo" result = runner.invoke(lightning_cli.install_app, [real_app]) assert "Press enter to continue:" in result.output @@ -145,7 +146,7 @@ def test_component_install(real_component, test_component_pip_name): def test_prompt_actions(): # TODO: each of these installs must check that a package is installed in the environment correctly - app_to_use = "lightning/install-app" + app_to_use = "lightning/invideo" runner = CliRunner() @@ -197,7 +198,7 @@ def test_version_arg_component(tmpdir, monkeypatch): def test_version_arg_app(tmpdir): # Version does not exist - app_name = "lightning/hackernews-app" + app_name = "lightning/invideo" version_arg = "NOT-EXIST" runner = CliRunner() result = runner.invoke(lightning_cli.install_app, [app_name, f"--version={version_arg}"]) @@ -205,7 +206,7 @@ def test_version_arg_app(tmpdir): assert result.exit_code == 1 # Version 
exists - version_arg = "0.0.1" + version_arg = "0.0.2" runner = CliRunner() result = runner.invoke(lightning_cli.install_app, [app_name, f"--version={version_arg}", "--yes"]) assert result.exit_code == 0 @@ -213,16 +214,16 @@ def test_version_arg_app(tmpdir): def test_proper_url_parsing(): - name = "lightning/install-app" + name = "lightning/invideo" # make sure org/app-name name is correct org, app = cmd_install._validate_name(name, resource_type="app", example="lightning/lit-slack-component") assert org == "lightning" - assert app == "install-app" + assert app == "invideo" # resolve registry (orgs can have a private registry through their environment variables) registry_url = cmd_install._resolve_app_registry() - assert registry_url == "https://api.sheety.co/e559626ba514c7ba80caae1e38a8d4f4/lightningAppRegistry/apps" + assert registry_url == "https://lightning.ai/v1/apps" # load the component resource component_entry = cmd_install._resolve_resource(registry_url, name=name, version_arg="latest", resource_type="app") @@ -230,10 +231,9 @@ def test_proper_url_parsing(): source_url, git_url, folder_name, git_sha = cmd_install._show_install_app_prompt( component_entry, app, org, True, resource_type="app" ) - assert folder_name == "install-app" + assert folder_name == "LAI-InVideo-search-App" # FixMe: this need to be updated after release with updated org rename - assert source_url == "https://github.com/PyTorchLightning/install-app" - assert git_url.find("@") > 10 # TODO: this will be removed once the apps repos will be public + assert source_url == "https://github.com/Lightning-AI/LAI-InVideo-search-App" assert "#ref" not in git_url assert git_sha @@ -286,20 +286,20 @@ def test_install_app_shows_error(tmpdir): # os.chdir(cwd) -def test_public_app_registry(): - registry = cmd_install._resolve_app_registry() - assert registry == "https://api.sheety.co/e559626ba514c7ba80caae1e38a8d4f4/lightningAppRegistry/apps" - - @mock.patch.dict(os.environ, {"LIGHTNING_APP_REGISTRY": "https://TODO/other_non_PL_registry"}) def test_private_app_registry(): registry = cmd_install._resolve_app_registry() assert registry == "https://TODO/other_non_PL_registry" +def test_public_app_registry(): + registry = cmd_install._resolve_app_registry() + assert registry == "https://lightning.ai/v1/apps" + + def test_public_component_registry(): registry = cmd_install._resolve_component_registry() - assert registry == "https://api.sheety.co/e559626ba514c7ba80caae1e38a8d4f4/lightningAppRegistry/components" + assert registry == "https://lightning.ai/v1/components" @mock.patch.dict(os.environ, {"LIGHTNING_COMPONENT_REGISTRY": "https://TODO/other_non_PL_registry"}) From 2190b47ae7a2c8c98215e6196c00fa991e49db94 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 4 Aug 2022 15:48:26 +0000 Subject: [PATCH 088/230] Update comet-ml requirement from <3.31.6,>=3.1.12 to >=3.1.12,<3.31.8 in /requirements (#13874) --- requirements/pytorch/loggers.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/pytorch/loggers.txt b/requirements/pytorch/loggers.txt index 7d89449318b28..c825275726a9a 100644 --- a/requirements/pytorch/loggers.txt +++ b/requirements/pytorch/loggers.txt @@ -4,7 +4,7 @@ # all supported loggers neptune-client>=0.10.0, <0.16.4 -comet-ml>=3.1.12, <3.31.6 +comet-ml>=3.1.12, <3.31.8 mlflow>=1.0.0, <1.27.0 test_tube>=0.7.5, <=0.7.5 wandb>=0.10.22, <0.12.20 From ef0623ec64d8b9122b6d3d0801569f76d1f62d05 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 4 Aug 2022 18:00:00 +0200 Subject: [PATCH 089/230] Remove deprecated training type plugins (#14011) * Remove deprecated training type plugins * update changelog * DDP2Plugin * Update src/pytorch_lightning/CHANGELOG.md --- src/pytorch_lightning/CHANGELOG.md | 5 +- src/pytorch_lightning/plugins/__init__.py | 15 ----- .../plugins/training_type/__init__.py | 15 ----- .../plugins/training_type/ddp.py | 24 ------- .../plugins/training_type/ddp2.py | 18 ------ .../plugins/training_type/ddp_spawn.py | 24 ------- .../plugins/training_type/deepspeed.py | 24 ------- .../plugins/training_type/dp.py | 24 ------- .../plugins/training_type/fully_sharded.py | 24 ------- .../plugins/training_type/horovod.py | 24 ------- .../plugins/training_type/ipu.py | 24 ------- .../plugins/training_type/parallel.py | 26 -------- .../plugins/training_type/sharded.py | 24 ------- .../plugins/training_type/sharded_spawn.py | 24 ------- .../plugins/training_type/single_device.py | 24 ------- .../plugins/training_type/single_tpu.py | 24 ------- .../plugins/training_type/tpu_spawn.py | 24 ------- .../training_type/training_type_plugin.py | 26 -------- .../plugins/training_type/utils.py | 23 ------- src/pytorch_lightning/trainer/trainer.py | 8 --- .../deprecated_api/test_remove_1-8.py | 62 ------------------- 21 files changed, 3 insertions(+), 483 deletions(-) delete mode 100644 src/pytorch_lightning/plugins/training_type/__init__.py delete mode 100644 src/pytorch_lightning/plugins/training_type/ddp.py delete mode 100644 src/pytorch_lightning/plugins/training_type/ddp2.py delete mode 100644 src/pytorch_lightning/plugins/training_type/ddp_spawn.py delete mode 100644 src/pytorch_lightning/plugins/training_type/deepspeed.py delete mode 100644 src/pytorch_lightning/plugins/training_type/dp.py delete mode 100644 src/pytorch_lightning/plugins/training_type/fully_sharded.py delete mode 100644 src/pytorch_lightning/plugins/training_type/horovod.py delete mode 100644 src/pytorch_lightning/plugins/training_type/ipu.py delete mode 100644 src/pytorch_lightning/plugins/training_type/parallel.py delete mode 100644 src/pytorch_lightning/plugins/training_type/sharded.py delete mode 100644 src/pytorch_lightning/plugins/training_type/sharded_spawn.py delete mode 100644 src/pytorch_lightning/plugins/training_type/single_device.py delete mode 100644 src/pytorch_lightning/plugins/training_type/single_tpu.py delete mode 100644 src/pytorch_lightning/plugins/training_type/tpu_spawn.py delete mode 100644 src/pytorch_lightning/plugins/training_type/training_type_plugin.py delete mode 100644 src/pytorch_lightning/plugins/training_type/utils.py diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index ac3187b13b904..ffcb5e3e18c23 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -32,10 +32,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Removed -- +- Removed the deprecated `Trainer.training_type_plugin` property in favor of `Trainer.strategy` ([#14011](https://github.com/Lightning-AI/lightning/pull/14011)) -- +- Removed all deprecated training type plugins ([#14011](https://github.com/Lightning-AI/lightning/pull/14011)) + ### Fixed diff --git a/src/pytorch_lightning/plugins/__init__.py b/src/pytorch_lightning/plugins/__init__.py index afd10c88c951d..ff5e812014ebe 100644 --- a/src/pytorch_lightning/plugins/__init__.py +++ b/src/pytorch_lightning/plugins/__init__.py @@ -18,21 +18,6 @@ from pytorch_lightning.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin from pytorch_lightning.plugins.precision.tpu import TPUPrecisionPlugin from pytorch_lightning.plugins.precision.tpu_bf16 import TPUBf16PrecisionPlugin -from pytorch_lightning.plugins.training_type.ddp import DDPPlugin -from pytorch_lightning.plugins.training_type.ddp2 import DDP2Plugin -from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin -from pytorch_lightning.plugins.training_type.deepspeed import DeepSpeedPlugin -from pytorch_lightning.plugins.training_type.dp import DataParallelPlugin -from pytorch_lightning.plugins.training_type.fully_sharded import DDPFullyShardedPlugin -from pytorch_lightning.plugins.training_type.horovod import HorovodPlugin -from pytorch_lightning.plugins.training_type.ipu import IPUPlugin -from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin -from pytorch_lightning.plugins.training_type.sharded import DDPShardedPlugin -from pytorch_lightning.plugins.training_type.sharded_spawn import DDPSpawnShardedPlugin -from pytorch_lightning.plugins.training_type.single_device import SingleDevicePlugin -from pytorch_lightning.plugins.training_type.single_tpu import SingleTPUPlugin -from pytorch_lightning.plugins.training_type.tpu_spawn import TPUSpawnPlugin -from pytorch_lightning.plugins.training_type.training_type_plugin import TrainingTypePlugin from pytorch_lightning.strategies import Strategy PLUGIN = Union[Strategy, PrecisionPlugin, ClusterEnvironment, CheckpointIO, LayerSync] diff --git a/src/pytorch_lightning/plugins/training_type/__init__.py b/src/pytorch_lightning/plugins/training_type/__init__.py deleted file mode 100644 index f7bee339ef95a..0000000000000 --- a/src/pytorch_lightning/plugins/training_type/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -from pytorch_lightning.plugins.training_type.ddp import DDPPlugin # noqa: F401 -from pytorch_lightning.plugins.training_type.ddp2 import DDP2Plugin # noqa: F401 -from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin # noqa: F401 -from pytorch_lightning.plugins.training_type.deepspeed import DeepSpeedPlugin # noqa: F401 -from pytorch_lightning.plugins.training_type.dp import DataParallelPlugin # noqa: F401 -from pytorch_lightning.plugins.training_type.fully_sharded import DDPFullyShardedPlugin # noqa: F401 -from pytorch_lightning.plugins.training_type.horovod import HorovodPlugin # noqa: F401 -from pytorch_lightning.plugins.training_type.ipu import IPUPlugin # noqa: F401 -from pytorch_lightning.plugins.training_type.parallel import ParallelPlugin # noqa: F401 -from pytorch_lightning.plugins.training_type.sharded import DDPShardedPlugin # noqa: F401 -from pytorch_lightning.plugins.training_type.sharded_spawn import DDPSpawnShardedPlugin # noqa: F401 -from pytorch_lightning.plugins.training_type.single_device import SingleDevicePlugin # noqa: F401 -from pytorch_lightning.plugins.training_type.single_tpu 
import SingleTPUPlugin # noqa: F401 -from pytorch_lightning.plugins.training_type.tpu_spawn import TPUSpawnPlugin # noqa: F401 -from pytorch_lightning.plugins.training_type.training_type_plugin import TrainingTypePlugin # noqa: F401 diff --git a/src/pytorch_lightning/plugins/training_type/ddp.py b/src/pytorch_lightning/plugins/training_type/ddp.py deleted file mode 100644 index cf81ff837d021..0000000000000 --- a/src/pytorch_lightning/plugins/training_type/ddp.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from pytorch_lightning.strategies import DDPStrategy -from pytorch_lightning.utilities import rank_zero_deprecation - - -class DDPPlugin(DDPStrategy): - def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] - rank_zero_deprecation( - "The `pl.plugins.training_type.ddp.DDPPlugin` is deprecated in v1.6 and will be removed in v1.8." - " Use `pl.strategies.ddp.DDPStrategy` instead." - ) - super().__init__(*args, **kwargs) diff --git a/src/pytorch_lightning/plugins/training_type/ddp2.py b/src/pytorch_lightning/plugins/training_type/ddp2.py deleted file mode 100644 index 64cd2635bc134..0000000000000 --- a/src/pytorch_lightning/plugins/training_type/ddp2.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from pytorch_lightning.strategies import DDP2Strategy - - -class DDP2Plugin(DDP2Strategy): - pass diff --git a/src/pytorch_lightning/plugins/training_type/ddp_spawn.py b/src/pytorch_lightning/plugins/training_type/ddp_spawn.py deleted file mode 100644 index 3539557c7e283..0000000000000 --- a/src/pytorch_lightning/plugins/training_type/ddp_spawn.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from pytorch_lightning.strategies import DDPSpawnStrategy -from pytorch_lightning.utilities import rank_zero_deprecation - - -class DDPSpawnPlugin(DDPSpawnStrategy): - def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] - rank_zero_deprecation( - "The `pl.plugins.training_type.ddp_spawn.DDPSpawnPlugin` is deprecated in v1.6 and will be removed in v1.8." - " Use `pl.strategies.ddp_spawn.DDPSpawnStrategy` instead." - ) - super().__init__(*args, **kwargs) diff --git a/src/pytorch_lightning/plugins/training_type/deepspeed.py b/src/pytorch_lightning/plugins/training_type/deepspeed.py deleted file mode 100644 index b761048d95e09..0000000000000 --- a/src/pytorch_lightning/plugins/training_type/deepspeed.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from pytorch_lightning.strategies import DeepSpeedStrategy -from pytorch_lightning.utilities import rank_zero_deprecation - - -class DeepSpeedPlugin(DeepSpeedStrategy): - def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] - rank_zero_deprecation( - "The `pl.plugins.training_type.deepspeed.DeepSpeedPlugin` is deprecated in v1.6 and will be removed in" - " v1.8. Use `pl.strategies.deepspeed.DeepSpeedStrategy` instead." - ) - super().__init__(*args, **kwargs) diff --git a/src/pytorch_lightning/plugins/training_type/dp.py b/src/pytorch_lightning/plugins/training_type/dp.py deleted file mode 100644 index 984457aa14afc..0000000000000 --- a/src/pytorch_lightning/plugins/training_type/dp.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from pytorch_lightning.strategies import DataParallelStrategy -from pytorch_lightning.utilities import rank_zero_deprecation - - -class DataParallelPlugin(DataParallelStrategy): - def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] - rank_zero_deprecation( - "The `pl.plugins.training_type.dp.DataParallelPlugin` is deprecated in v1.6 and will be removed in v1.8." - " Use `pl.strategies.dp.DataParallelStrategy` instead." - ) - super().__init__(*args, **kwargs) diff --git a/src/pytorch_lightning/plugins/training_type/fully_sharded.py b/src/pytorch_lightning/plugins/training_type/fully_sharded.py deleted file mode 100644 index 2ed02fa9bf7ed..0000000000000 --- a/src/pytorch_lightning/plugins/training_type/fully_sharded.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright The PyTorch Lightning team. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from pytorch_lightning.strategies import DDPFullyShardedStrategy -from pytorch_lightning.utilities import rank_zero_deprecation - - -class DDPFullyShardedPlugin(DDPFullyShardedStrategy): - def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] - rank_zero_deprecation( - "The `pl.plugins.training_type.fully_sharded.DDPFullyShardedPlugin` is deprecated in v1.6 and will be" - " removed in v1.8. Use `pl.strategies.fully_sharded.DDPFullyShardedStrategy` instead." - ) - super().__init__(*args, **kwargs) diff --git a/src/pytorch_lightning/plugins/training_type/horovod.py b/src/pytorch_lightning/plugins/training_type/horovod.py deleted file mode 100644 index 455d4c47a92f1..0000000000000 --- a/src/pytorch_lightning/plugins/training_type/horovod.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from pytorch_lightning.strategies import HorovodStrategy -from pytorch_lightning.utilities import rank_zero_deprecation - - -class HorovodPlugin(HorovodStrategy): - def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] - rank_zero_deprecation( - "The `pl.plugins.training_type.horovod.HorovodPlugin` is deprecated in v1.6 and will be removed in v1.8." - " Use `pl.strategies.horovod.HorovodStrategy` instead." - ) - super().__init__(*args, **kwargs) diff --git a/src/pytorch_lightning/plugins/training_type/ipu.py b/src/pytorch_lightning/plugins/training_type/ipu.py deleted file mode 100644 index 3959e84a943d6..0000000000000 --- a/src/pytorch_lightning/plugins/training_type/ipu.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from pytorch_lightning.strategies.ipu import IPUStrategy -from pytorch_lightning.utilities import rank_zero_deprecation - - -class IPUPlugin(IPUStrategy): - def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] - rank_zero_deprecation( - "The `pl.plugins.training_type.ipu.IPUPlugin` is deprecated in v1.6 and will be removed in v1.8." - " Use `pl.strategies.ipu.IPUStrategy` instead." - ) - super().__init__(*args, **kwargs) diff --git a/src/pytorch_lightning/plugins/training_type/parallel.py b/src/pytorch_lightning/plugins/training_type/parallel.py deleted file mode 100644 index 5822e17c61f23..0000000000000 --- a/src/pytorch_lightning/plugins/training_type/parallel.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from abc import ABC - -from pytorch_lightning.strategies import ParallelStrategy -from pytorch_lightning.utilities import rank_zero_deprecation - - -class ParallelPlugin(ParallelStrategy, ABC): - def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] - rank_zero_deprecation( - "The `pl.plugins.training_type.parallel.ParallelPlugin` is deprecated in v1.6 and will be removed in v1.8." - " Use `pl.strategies.parallel.ParallelStrategy` instead." - ) - super().__init__(*args, **kwargs) diff --git a/src/pytorch_lightning/plugins/training_type/sharded.py b/src/pytorch_lightning/plugins/training_type/sharded.py deleted file mode 100644 index d66442d565ad9..0000000000000 --- a/src/pytorch_lightning/plugins/training_type/sharded.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from pytorch_lightning.strategies import DDPShardedStrategy -from pytorch_lightning.utilities import rank_zero_deprecation - - -class DDPShardedPlugin(DDPShardedStrategy): - def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] - rank_zero_deprecation( - "The `pl.plugins.training_type.sharded.DDPShardedPlugin` is deprecated in v1.6 and will be removed in v1.8." - " Use `pl.strategies.sharded.DDPShardedStrategy` instead." - ) - super().__init__(*args, **kwargs) diff --git a/src/pytorch_lightning/plugins/training_type/sharded_spawn.py b/src/pytorch_lightning/plugins/training_type/sharded_spawn.py deleted file mode 100644 index 7b94652d5f6e1..0000000000000 --- a/src/pytorch_lightning/plugins/training_type/sharded_spawn.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright The PyTorch Lightning team. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from pytorch_lightning.strategies import DDPSpawnShardedStrategy -from pytorch_lightning.utilities import rank_zero_deprecation - - -class DDPSpawnShardedPlugin(DDPSpawnShardedStrategy): - def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] - rank_zero_deprecation( - "The `pl.plugins.training_type.sharded_spawn.DDPSpawnShardedPlugin` is deprecated in v1.6 and will be" - " removed in v1.8. Use `pl.strategies.sharded_spawn.DDPSpawnShardedStrategy` instead." - ) - super().__init__(*args, **kwargs) diff --git a/src/pytorch_lightning/plugins/training_type/single_device.py b/src/pytorch_lightning/plugins/training_type/single_device.py deleted file mode 100644 index 376b39fa4846a..0000000000000 --- a/src/pytorch_lightning/plugins/training_type/single_device.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from pytorch_lightning.strategies import SingleDeviceStrategy -from pytorch_lightning.utilities import rank_zero_deprecation - - -class SingleDevicePlugin(SingleDeviceStrategy): - def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] - rank_zero_deprecation( - "The `pl.plugins.training_type.single_device.SingleDevicePlugin` is deprecated in v1.6 and will be removed" - " in v1.8. Use `pl.strategies.single_device.SingleDeviceStrategy` instead." - ) - super().__init__(*args, **kwargs) diff --git a/src/pytorch_lightning/plugins/training_type/single_tpu.py b/src/pytorch_lightning/plugins/training_type/single_tpu.py deleted file mode 100644 index 5d305a51c497a..0000000000000 --- a/src/pytorch_lightning/plugins/training_type/single_tpu.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from pytorch_lightning.strategies import SingleTPUStrategy -from pytorch_lightning.utilities import rank_zero_deprecation - - -class SingleTPUPlugin(SingleTPUStrategy): - def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] - rank_zero_deprecation( - "The `pl.plugins.training_type.single_tpu.SingleTPUPlugin` is deprecated in v1.6 and will be removed in" - " v1.8. Use `pl.strategies.single_tpu.SingleTPUStrategy` instead." - ) - super().__init__(*args, **kwargs) diff --git a/src/pytorch_lightning/plugins/training_type/tpu_spawn.py b/src/pytorch_lightning/plugins/training_type/tpu_spawn.py deleted file mode 100644 index 8a745409598d4..0000000000000 --- a/src/pytorch_lightning/plugins/training_type/tpu_spawn.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from pytorch_lightning.strategies import TPUSpawnStrategy -from pytorch_lightning.utilities import rank_zero_deprecation - - -class TPUSpawnPlugin(TPUSpawnStrategy): - def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] - rank_zero_deprecation( - "The `pl.plugins.training_type.tpu_spawn.TPUSpawnPlugin` is deprecated in v1.6 and will be removed in v1.8." - " Use `pl.strategies.tpu_spawn.TPUSpawnStrategy` instead." - ) - super().__init__(*args, **kwargs) diff --git a/src/pytorch_lightning/plugins/training_type/training_type_plugin.py b/src/pytorch_lightning/plugins/training_type/training_type_plugin.py deleted file mode 100644 index ae740f1580684..0000000000000 --- a/src/pytorch_lightning/plugins/training_type/training_type_plugin.py +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from abc import ABC - -from pytorch_lightning.strategies import Strategy -from pytorch_lightning.utilities import rank_zero_deprecation - - -class TrainingTypePlugin(Strategy, ABC): - def __init__(self, *args, **kwargs) -> None: # type: ignore[no-untyped-def] - rank_zero_deprecation( - "The `pl.plugins.training_type.training_type_plugin.TrainingTypePlugin` is deprecated in v1.6 and will" - " be removed in v1.8. Use `pl.strategies.strategy.Strategy` instead." 
- ) - super().__init__(*args, **kwargs) diff --git a/src/pytorch_lightning/plugins/training_type/utils.py b/src/pytorch_lightning/plugins/training_type/utils.py deleted file mode 100644 index 0b2aeb9987f13..0000000000000 --- a/src/pytorch_lightning/plugins/training_type/utils.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from pytorch_lightning.strategies.utils import on_colab_kaggle as _on_colab_kaggle -from pytorch_lightning.utilities import rank_zero_deprecation - - -def on_colab_kaggle() -> bool: - rank_zero_deprecation( - "`pl.plugins.training_type.utils.on_colab_kaggle` is deprecated in v1.6 and will be removed in v1.8." - " Use `pl.strategies.utils.on_colab_kaggle` instead." - ) - return _on_colab_kaggle() diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index 4b3c046ae0f0c..b85fc1c1e8b4c 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -2004,14 +2004,6 @@ def accelerator(self) -> Accelerator: def strategy(self) -> Strategy: return self._accelerator_connector.strategy - @property - def training_type_plugin(self) -> Strategy: - rank_zero_deprecation( - "`Trainer.training_type_plugin` is deprecated in v1.6 and will be removed in v1.8. Use" - " `Trainer.strategy` instead." 
- ) - return self.strategy - @property def precision_plugin(self) -> PrecisionPlugin: return self.strategy.precision_plugin diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index 0d9fc1e7a2baf..dc28ce32d293d 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -28,18 +28,6 @@ from pytorch_lightning.demos.boring_classes import BoringDataModule, BoringModel from pytorch_lightning.loggers import CSVLogger, Logger, LoggerCollection from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin -from pytorch_lightning.plugins.training_type.ddp import DDPPlugin -from pytorch_lightning.plugins.training_type.ddp2 import DDP2Plugin -from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin -from pytorch_lightning.plugins.training_type.deepspeed import DeepSpeedPlugin -from pytorch_lightning.plugins.training_type.dp import DataParallelPlugin -from pytorch_lightning.plugins.training_type.fully_sharded import DDPFullyShardedPlugin -from pytorch_lightning.plugins.training_type.ipu import IPUPlugin -from pytorch_lightning.plugins.training_type.sharded import DDPShardedPlugin -from pytorch_lightning.plugins.training_type.sharded_spawn import DDPSpawnShardedPlugin -from pytorch_lightning.plugins.training_type.single_device import SingleDevicePlugin -from pytorch_lightning.plugins.training_type.single_tpu import SingleTPUPlugin -from pytorch_lightning.plugins.training_type.tpu_spawn import TPUSpawnPlugin from pytorch_lightning.profiler import AbstractProfiler, BaseProfiler from pytorch_lightning.profilers import AdvancedProfiler, Profiler, SimpleProfiler from pytorch_lightning.strategies import DDP2Strategy, ParallelStrategy @@ -299,12 +287,6 @@ def test_v1_8_0_deprecate_trainer_callback_hook_mixin(): trainer.on_before_zero_grad(optimizer=optim.SGD(model.parameters(), lr=0.01, momentum=0.9)) -def test_v1_8_0_deprecated_training_type_plugin_property(): - trainer = Trainer() - with pytest.deprecated_call(match="in v1.6 and will be removed in v1.8"): - trainer.training_type_plugin - - def test_v1_8_0_deprecate_trainer_data_loading_mixin(): trainer = Trainer(max_epochs=1) model = BoringModel() @@ -328,47 +310,6 @@ def test_v_1_8_0_deprecated_device_stats_monitor_prefix_metric_keys(): prefix_metric_keys({"foo": 1.0}, "bar") -@pytest.mark.parametrize( - "cls", - [ - DDPPlugin, - DDPSpawnPlugin, - pytest.param(DeepSpeedPlugin, marks=RunIf(deepspeed=True)), - DataParallelPlugin, - DDPFullyShardedPlugin, - pytest.param(IPUPlugin, marks=RunIf(ipu=True)), - DDPShardedPlugin, - DDPSpawnShardedPlugin, - TPUSpawnPlugin, - ], -) -def test_v1_8_0_deprecated_training_type_plugin_classes(cls): - old_name = cls.__name__ - new_name = old_name.replace("Plugin", "Strategy") - with pytest.deprecated_call( - match=f"{old_name}` is deprecated in v1.6 and will be removed in v1.8. Use .*{new_name}` instead." - ): - cls() - - -def test_v1_8_0_deprecated_single_device_plugin_class(): - with pytest.deprecated_call( - match=( - "SingleDevicePlugin` is deprecated in v1.6 and will be removed in v1.8." - " Use `.*SingleDeviceStrategy` instead." - ) - ): - SingleDevicePlugin("cpu") - - -@RunIf(tpu=True, standalone=True) -def test_v1_8_0_deprecated_single_tpu_plugin_class(): - with pytest.deprecated_call( - match="SingleTPUPlugin` is deprecated in v1.6 and will be removed in v1.8. Use `.*SingleTPUStrategy` instead." 
- ): - SingleTPUPlugin(0) - - def test_v1_8_0_deprecated_lightning_optimizers(): trainer = Trainer() with pytest.deprecated_call( @@ -1163,8 +1104,5 @@ def test_unsupported_ddp2_strategy(): with pytest.raises(TypeError, match="The `DDP2Strategy`/`DDP2Plugin` is no longer supported in v1.7 and will be"): DDP2Strategy() - with pytest.raises(TypeError, match="The `DDP2Strategy`/`DDP2Plugin` is no longer supported in v1.7 and will be"): - DDP2Plugin() - with pytest.raises(ValueError, match="The DDP2 strategy is no longer supported."): Trainer(strategy="ddp2") From 341c63c2b9612b8653c4942fe188dec67f0e11ca Mon Sep 17 00:00:00 2001 From: Raphael Randschau Date: Thu, 4 Aug 2022 10:48:29 -0700 Subject: [PATCH 090/230] [CLI] add support to run app on a specific cluster (#13894) Add `--cluster-id` flag which can be passed to `lightning run app` if the `--cloud` flag is present. This allows you to run your Lightning AI apps on Lightning AI BYOC clusters running on your own cloud provider infrastructure. Co-authored-by: William Falcon Co-authored-by: Laverne Henderson --- src/lightning_app/CHANGELOG.md | 2 +- src/lightning_app/cli/lightning_cli.py | 20 ++++++++- src/lightning_app/runners/cloud.py | 29 +++++++++++- src/lightning_app/runners/runtime.py | 4 +- src/lightning_app/testing/testing.py | 31 +++++++------ tests/tests_app/cli/test_run_app.py | 20 +++++++++ tests/tests_app/runners/test_cloud.py | 43 ++++++++++++++++++ tests/tests_app_examples/test_v0_app.py | 59 ++++++++++++++++--------- 8 files changed, 169 insertions(+), 39 deletions(-) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 0f9838b1efe2e..692a1e62cec50 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -10,7 +10,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Add support for `Lightning App Commands` through the `configure_commands` hook on the Lightning Flow and the `ClientCommand` ([#13602](https://github.com/Lightning-AI/lightning/pull/13602)) - Add support for Lightning AI BYOC cluster management ([#13835](https://github.com/Lightning-AI/lightning/pull/13835)) - +- Add support to run Lightning apps on Lightning AI BYOC clusters ([#13894](https://github.com/Lightning-AI/lightning/pull/13894)) - Adds `LightningTrainingComponent`. 
`LightningTrainingComponent` orchestrates multi-node training in the cloud ([#13830](https://github.com/Lightning-AI/lightning/pull/13830)) ### Changed diff --git a/src/lightning_app/cli/lightning_cli.py b/src/lightning_app/cli/lightning_cli.py index bb81b4eda133f..fb4c40330dfd9 100644 --- a/src/lightning_app/cli/lightning_cli.py +++ b/src/lightning_app/cli/lightning_cli.py @@ -73,10 +73,21 @@ def logout(): def _run_app( - file: str, cloud: bool, without_server: bool, no_cache: bool, name: str, blocking: bool, open_ui: bool, env: tuple + file: str, + cloud: bool, + cluster_id: str, + without_server: bool, + no_cache: bool, + name: str, + blocking: bool, + open_ui: bool, + env: tuple, ): file = _prepare_file(file) + if not cloud and cluster_id is not None: + raise click.ClickException("Using the flag --cluster-id in local execution is not supported.") + runtime_type = RuntimeType.CLOUD if cloud else RuntimeType.MULTIPROCESS # Cloud specific validations @@ -108,6 +119,7 @@ def on_before_run(*args): on_before_run=on_before_run, name=name, env_vars=env_vars, + cluster_id=cluster_id, ) if runtime_type == RuntimeType.CLOUD: click.echo("Application is ready in the cloud") @@ -121,6 +133,9 @@ def run(): @run.command("app") @click.argument("file", type=click.Path(exists=True)) @click.option("--cloud", type=bool, default=False, is_flag=True) +@click.option( + "--cluster-id", type=str, default=None, help="Run Lightning App on a specific Lightning AI BYOC compute cluster" +) @click.option("--name", help="The current application name", default="", type=str) @click.option("--without-server", is_flag=True, default=False) @click.option( @@ -133,6 +148,7 @@ def run(): def run_app( file: str, cloud: bool, + cluster_id: str, without_server: bool, no_cache: bool, name: str, @@ -142,7 +158,7 @@ def run_app( app_args: List[str], ): """Run an app from a file.""" - _run_app(file, cloud, without_server, no_cache, name, blocking, open_ui, env) + _run_app(file, cloud, cluster_id, without_server, no_cache, name, blocking, open_ui, env) def app_command(): diff --git a/src/lightning_app/runners/cloud.py b/src/lightning_app/runners/cloud.py index 9dfa37ca97e72..957b60b5d2ab5 100644 --- a/src/lightning_app/runners/cloud.py +++ b/src/lightning_app/runners/cloud.py @@ -25,6 +25,7 @@ V1LightningworkSpec, V1NetworkConfig, V1PackageManager, + V1ProjectClusterBinding, V1PythonDependencyInfo, V1UserRequestedComputeConfig, V1Work, @@ -52,6 +53,7 @@ def dispatch( self, on_before_run: Optional[Callable] = None, name: str = "", + cluster_id: str = None, **kwargs: Any, ): """Method to dispatch and run the :class:`~lightning_app.core.app.LightningApp` in the cloud.""" @@ -108,6 +110,7 @@ def dispatch( random_name = "".join(random.choice(string.ascii_lowercase) for _ in range(5)) spec = V1LightningworkSpec( build_spec=build_spec, + cluster_id=cluster_id, user_requested_compute_config=user_compute_config, network_config=[V1NetworkConfig(name=random_name, port=work.port)], ) @@ -157,14 +160,21 @@ def dispatch( enable_app_server=app_spec.enable_app_server, flow_servers=app_spec.flow_servers, image_spec=app_spec.image_spec, + cluster_id=cluster_id, works=[V1Work(name=work_req.name, spec=work_req.spec) for work_req in work_reqs], local_source=True, dependency_cache_key=app_spec.dependency_cache_key, ) + if cluster_id is not None: + self._ensure_cluster_project_binding(project.project_id, cluster_id) + lightning_app_release = self.backend.client.lightningapp_v2_service_create_lightningapp_release( project.project_id, lightning_app.id, 
release_body ) + if cluster_id is not None: + logger.info(f"running app on {lightning_app_release.cluster_id}") + if lightning_app_release.source_upload_url == "": raise RuntimeError("The source upload url is empty.") @@ -220,7 +230,10 @@ def dispatch( lightning_app.id, lightning_app_release.id, Body9( - desired_state=V1LightningappInstanceState.RUNNING, name=lightning_app.name, env=v1_env_vars + cluster_id=cluster_id, + desired_state=V1LightningappInstanceState.RUNNING, + name=lightning_app.name, + env=v1_env_vars, ), ) ) @@ -237,6 +250,20 @@ def dispatch( if cleanup_handle: cleanup_handle() + def _ensure_cluster_project_binding(self, project_id: str, cluster_id: str): + cluster_bindings = self.backend.client.projects_service_list_project_cluster_bindings(project_id=project_id) + + for cluster_binding in cluster_bindings.clusters: + if cluster_binding.cluster_id != cluster_id: + continue + if cluster_binding.project_id == project_id: + return + + self.backend.client.projects_service_create_project_cluster_binding( + project_id, + body=V1ProjectClusterBinding(cluster_id=cluster_id, project_id=project_id), + ) + @staticmethod def _check_uploaded_folder(root: Path, repo: LocalSourceCodeDir) -> None: """This method is used to inform the users if their folder files are large and how to filter them.""" diff --git a/src/lightning_app/runners/runtime.py b/src/lightning_app/runners/runtime.py index 123e16d89ede5..348e3ecca12a0 100644 --- a/src/lightning_app/runners/runtime.py +++ b/src/lightning_app/runners/runtime.py @@ -28,6 +28,7 @@ def dispatch( on_before_run: Optional[Callable] = None, name: str = "", env_vars: Dict[str, str] = {}, + cluster_id: str = None, ) -> Optional[Any]: """Bootstrap and dispatch the application to the target. @@ -42,6 +43,7 @@ def dispatch( on_before_run: Callable to be executed before run. name: Name of app execution env_vars: Dict of env variables to be set on the app + cluster_id: the Lightning AI cluster to run the app on. Defaults to managed Lightning AI cloud """ from lightning_app.runners.runtime_type import RuntimeType from lightning_app.utilities.component import _set_flow_context @@ -60,7 +62,7 @@ def dispatch( ) # a cloud dispatcher will return the result while local # dispatchers will be running the app in the main process - return runtime.dispatch(on_before_run=on_before_run, name=name, no_cache=no_cache) + return runtime.dispatch(on_before_run=on_before_run, name=name, no_cache=no_cache, cluster_id=cluster_id) @dataclass diff --git a/src/lightning_app/testing/testing.py b/src/lightning_app/testing/testing.py index 97f2762b02e42..e1cc2e180dab5 100644 --- a/src/lightning_app/testing/testing.py +++ b/src/lightning_app/testing/testing.py @@ -160,7 +160,7 @@ def run_cli(args) -> Generator: @requires("playwright") @contextmanager -def run_app_in_cloud(app_folder: str, app_name: str = "app.py") -> Generator: +def run_app_in_cloud(app_folder: str, app_name: str = "app.py", extra_args: [str] = []) -> Generator: """This utility is used to automate testing e2e application with lightning_app.ai.""" # 1. Validate the provide app_folder is correct. if not os.path.exists(os.path.join(app_folder, "app.py")): @@ -184,19 +184,22 @@ def run_app_in_cloud(app_folder: str, app_name: str = "app.py") -> Generator: shutil.copytree(app_folder, tmpdir, dirs_exist_ok=True) # TODO - add -no-cache to the command line. 
process = Popen( - [ - sys.executable, - "-m", - "lightning", - "run", - "app", - app_name, - "--cloud", - "--name", - name, - "--open-ui", - "false", - ], + ( + [ + sys.executable, + "-m", + "lightning", + "run", + "app", + app_name, + "--cloud", + "--name", + name, + "--open-ui", + "false", + ] + + extra_args + ), cwd=tmpdir, env=env_copy, stdout=sys.stdout, diff --git a/tests/tests_app/cli/test_run_app.py b/tests/tests_app/cli/test_run_app.py index 152bd4b7417ac..221e3e2ab3a6f 100644 --- a/tests/tests_app/cli/test_run_app.py +++ b/tests/tests_app/cli/test_run_app.py @@ -3,6 +3,7 @@ from pathlib import Path from unittest import mock +import click import pytest from click.testing import CliRunner from tests_app import _PROJECT_ROOT @@ -55,6 +56,23 @@ def _lightning_app_run_and_logging(self, *args, **kwargs): assert bool(int(caplog.messages[0])) is open_ui +def test_lightning_run_cluster_without_cloud(monkeypatch): + """This test validates that running apps only supports --cluster-id if --cloud argument is passed.""" + monkeypatch.setattr("lightning_app.runners.cloud.logger", logging.getLogger()) + with pytest.raises(click.exceptions.ClickException): + _run_app( + file=os.path.join(_PROJECT_ROOT, "tests/tests_app/core/scripts/app_metadata.py"), + cloud=False, + cluster_id="test-cluster", + without_server=False, + name="", + blocking=False, + open_ui=False, + no_cache=True, + env=("FOO=bar",), + ) + + @mock.patch.dict(os.environ, {"LIGHTNING_CLOUD_URL": "https://beta.lightning.ai"}) @mock.patch("lightning_app.cli.lightning_cli.dispatch") @pytest.mark.parametrize("open_ui", (True, False)) @@ -70,6 +88,7 @@ def test_lightning_run_app_cloud(mock_dispatch: mock.MagicMock, open_ui, caplog, _run_app( file=os.path.join(_PROJECT_ROOT, "tests/tests_app/core/scripts/app_metadata.py"), cloud=True, + cluster_id="", without_server=False, name="", blocking=False, @@ -89,4 +108,5 @@ def test_lightning_run_app_cloud(mock_dispatch: mock.MagicMock, open_ui, caplog, name="", no_cache=True, env_vars={"FOO": "bar"}, + cluster_id="", ) diff --git a/tests/tests_app/runners/test_cloud.py b/tests/tests_app/runners/test_cloud.py index f38383d87e148..4b1cf08e8554d 100644 --- a/tests/tests_app/runners/test_cloud.py +++ b/tests/tests_app/runners/test_cloud.py @@ -16,6 +16,7 @@ V1Membership, V1NetworkConfig, V1PackageManager, + V1ProjectClusterBinding, V1PythonDependencyInfo, V1UserRequestedComputeConfig, V1Work, @@ -35,6 +36,48 @@ def run(self): class TestAppCreationClient: """Testing the calls made using GridRestClient to create the app.""" + @mock.patch("lightning_app.runners.backends.cloud.LightningClient", mock.MagicMock()) + def test_run_on_byoc_cluster(self, monkeypatch): + mock_client = mock.MagicMock() + mock_client.projects_service_list_memberships.return_value = V1ListMembershipsResponse( + memberships=[V1Membership(name="Default Project", project_id="default-project-id")] + ) + mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = ( + V1ListLightningappInstancesResponse(lightningapps=[]) + ) + cloud_backend = mock.MagicMock() + cloud_backend.client = mock_client + monkeypatch.setattr(backends, "CloudBackend", mock.MagicMock(return_value=cloud_backend)) + monkeypatch.setattr(cloud, "LocalSourceCodeDir", mock.MagicMock()) + monkeypatch.setattr(cloud, "_prepare_lightning_wheels_and_requirements", mock.MagicMock()) + app = mock.MagicMock() + app.flows = [] + app.frontend = {} + cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file="entrypoint.py") + 
cloud_runtime._check_uploaded_folder = mock.MagicMock() + + # without requirements file + # setting is_file to False so requirements.txt existence check will return False + monkeypatch.setattr(Path, "is_file", lambda *args, **kwargs: False) + monkeypatch.setattr(cloud, "Path", Path) + cloud_runtime.dispatch(cluster_id="test1234") + body = Body8( + cluster_id="test1234", + app_entrypoint_file=mock.ANY, + enable_app_server=True, + flow_servers=[], + image_spec=None, + works=[], + local_source=True, + dependency_cache_key=mock.ANY, + ) + cloud_runtime.backend.client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with( + "default-project-id", mock.ANY, body + ) + cloud_runtime.backend.client.projects_service_create_project_cluster_binding.assert_called_once_with( + "default-project-id", body=V1ProjectClusterBinding(cluster_id="test1234", project_id="default-project-id") + ) + @mock.patch("lightning_app.runners.backends.cloud.LightningClient", mock.MagicMock()) def test_requirements_file(self, monkeypatch): mock_client = mock.MagicMock() diff --git a/tests/tests_app_examples/test_v0_app.py b/tests/tests_app_examples/test_v0_app.py index 2c03d2de60e29..d34a92d6102f8 100644 --- a/tests/tests_app_examples/test_v0_app.py +++ b/tests/tests_app_examples/test_v0_app.py @@ -30,6 +30,44 @@ def test_v0_app_example(): assert result.exit_code == 0 +def run_v0_app(fetch_logs, view_page): + def check_content(button_name, text_content): + button = view_page.locator(f'button:has-text("{button_name}")') + button.wait_for(timeout=3 * 1000) + button.click() + view_page.reload() + locator = view_page.frame_locator("iframe").locator("div") + locator.wait_for(timeout=3 * 1000) + assert text_content in " ".join(locator.all_text_contents()) + return True + + wait_for(view_page, check_content, "TAB_1", "Hello from component A") + wait_for(view_page, check_content, "TAB_2", "Hello from component B") + has_logs = False + while not has_logs: + for log in fetch_logs(): + if "'a': 'a', 'b': 'b'" in log: + has_logs = True + sleep(1) + + +@pytest.mark.cloud +@pytest.mark.skipif( + os.environ.get("LIGHTNING_BYOC_CLUSTER_ID") is None, + reason="missing LIGHTNING_BYOC_CLUSTER_ID environment variable", +) +def test_v0_app_example_byoc_cloud() -> None: + with run_app_in_cloud( + os.path.join(_PROJECT_ROOT, "examples/app_v0"), + extra_args=["--cluster-id", os.environ.get("LIGHTNING_BYOC_CLUSTER_ID")], + ) as ( + _, + view_page, + fetch_logs, + ): + run_v0_app(fetch_logs, view_page) + + @pytest.mark.cloud def test_v0_app_example_cloud() -> None: with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "examples/app_v0")) as ( @@ -37,23 +75,4 @@ def test_v0_app_example_cloud() -> None: view_page, fetch_logs, ): - - def check_content(button_name, text_content): - button = view_page.locator(f'button:has-text("{button_name}")') - button.wait_for(timeout=3 * 1000) - button.click() - view_page.reload() - locator = view_page.frame_locator("iframe").locator("div") - locator.wait_for(timeout=3 * 1000) - assert text_content in " ".join(locator.all_text_contents()) - return True - - wait_for(view_page, check_content, "TAB_1", "Hello from component A") - wait_for(view_page, check_content, "TAB_2", "Hello from component B") - - has_logs = False - while not has_logs: - for log in fetch_logs(): - if "'a': 'a', 'b': 'b'" in log: - has_logs = True - sleep(1) + run_v0_app(fetch_logs, view_page) From f5bd6e6f5f5df28e4f178d97f3b5cf8e04137f4d Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Fri, 5 Aug 2022 01:16:07 +0530 Subject: [PATCH 
091/230] Cast only floating types with IPUs (#13983) --- src/pytorch_lightning/CHANGELOG.md | 3 +++ src/pytorch_lightning/strategies/ipu.py | 11 +++++++++-- tests/tests_pytorch/accelerators/test_ipu.py | 11 ++++++++++- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index ffcb5e3e18c23..cb8bba43f39ac 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -41,6 +41,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed +- Casted only floating point tensors to fp16 with IPUs ([#13983](https://github.com/Lightning-AI/lightning/pull/13983)) + + - Fixed the `NeptuneLogger` dependency being unrecognized ([#13988](https://github.com/Lightning-AI/lightning/pull/13988)) diff --git a/src/pytorch_lightning/strategies/ipu.py b/src/pytorch_lightning/strategies/ipu.py index 0b5d8e835ad1d..67a9e14e32d56 100644 --- a/src/pytorch_lightning/strategies/ipu.py +++ b/src/pytorch_lightning/strategies/ipu.py @@ -59,7 +59,9 @@ def forward(self, *inputs: Any, **kwargs: Any) -> Any: @staticmethod def batch_to(data: Tensor) -> Tensor: - return data.half() + if torch.is_floating_point(data): + return data.half() + return data def _move_float_tensors_to_half(self, batch: Any) -> Any: batch = apply_to_collection(batch, (FloatTensor, torch.cuda.FloatTensor), function=self.batch_to) @@ -279,8 +281,13 @@ def to_tensor(x: Any) -> Tensor: def batch_to_device(self, batch: Any, device: Optional[torch.device] = None, dataloader_idx: int = 0) -> Any: # This override is necessary because the cast must occur before the data # is moved to the device to prevent wasteful host->device copies. + def fp_to_half(tensor: Tensor) -> Tensor: + if torch.is_floating_point(tensor): + return tensor.half() + return tensor + if self.precision_plugin.precision in (PrecisionType.MIXED, PrecisionType.HALF): - batch = apply_to_collection(batch, Tensor, function=Tensor.half) + batch = apply_to_collection(batch, Tensor, function=fp_to_half) # We don't call `super().batch_to_device` because `data.to(device)` is not # currently necessary for IPUs. The movement of data from host<->IPU is # currently handled by PopTorch. 
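The `ipu.py` change above narrows the half-precision cast to floating-point tensors only, so integer tensors (token ids, labels, and similar) keep their dtype when a batch is prepared for 16-bit precision. A minimal, self-contained sketch of that casting rule — a plain dict comprehension stands in for the `apply_to_collection` call the strategy actually uses, and the tensor names are made up for illustration:

    import torch

    def fp_to_half(tensor: torch.Tensor) -> torch.Tensor:
        # Cast only floating-point tensors; integer tensors keep their original dtype.
        if torch.is_floating_point(tensor):
            return tensor.half()
        return tensor

    batch = {
        "pixels": torch.zeros(2, 3, dtype=torch.float32),  # cast to torch.half
        "labels": torch.zeros(2, dtype=torch.int64),  # left as torch.int64
    }
    converted = {name: fp_to_half(value) for name, value in batch.items()}
    assert converted["pixels"].dtype is torch.half
    assert converted["labels"].dtype is torch.int64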
diff --git a/tests/tests_pytorch/accelerators/test_ipu.py b/tests/tests_pytorch/accelerators/test_ipu.py index 9d510d9d60e9e..33d59d9a835ca 100644 --- a/tests/tests_pytorch/accelerators/test_ipu.py +++ b/tests/tests_pytorch/accelerators/test_ipu.py @@ -219,7 +219,16 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: assert isinstance(trainer.strategy, IPUStrategy) assert isinstance(trainer.strategy.precision_plugin, IPUPrecisionPlugin) assert trainer.strategy.precision_plugin.precision == 16 - assert trainer.strategy.batch_to_device(torch.zeros((1), dtype=torch.float)).dtype == torch.half + + changed_dtypes = [torch.float, torch.float64] + data = [torch.zeros((1), dtype=dtype) for dtype in changed_dtypes] + new_data = trainer.strategy.batch_to_device(data) + assert all(val.dtype is torch.half for val in new_data) + + not_changed_dtypes = [torch.uint8, torch.int8, torch.int32, torch.int64] + data = [torch.zeros((1), dtype=dtype) for dtype in not_changed_dtypes] + new_data = trainer.strategy.batch_to_device(data) + assert all(val.dtype is dtype for val, dtype in zip(new_data, not_changed_dtypes)) with pytest.raises(SystemExit): trainer.fit(model) From b88b700745553af5037e7061c7d3ccb0783c4cd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 4 Aug 2022 22:27:35 +0200 Subject: [PATCH 092/230] Remove the deprecated DDP2 strategy (#14026) --- .github/CONTRIBUTING.md | 2 +- .../accelerators/gpu_intermediate.rst | 28 --------------- .../common/checkpointing_expert.rst | 2 +- src/pytorch_lightning/CHANGELOG.md | 2 ++ src/pytorch_lightning/plugins/__init__.py | 15 -------- src/pytorch_lightning/strategies/__init__.py | 1 - src/pytorch_lightning/strategies/ddp2.py | 35 ------------------- .../connectors/accelerator_connector.py | 9 ----- src/pytorch_lightning/utilities/enums.py | 6 ++-- tests/README.md | 2 +- .../deprecated_api/test_remove_1-8.py | 10 +----- 11 files changed, 8 insertions(+), 104 deletions(-) delete mode 100644 src/pytorch_lightning/strategies/ddp2.py diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 83f7a7252f625..a1edacea7c104 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -103,7 +103,7 @@ _**Note**, even if you do not find the solution, sending a PR with a test coveri Want to keep Lightning healthy? Love seeing those green tests? So do we! How to we keep it that way? We write tests! We value tests contribution even more than new features. -Most of the tests in PyTorch Lightning train a random `BoringModel` under various trainer conditions (ddp, ddp2+amp, etc...). Want to add a new test case and not sure how? [Talk to us!](https://www.pytorchlightning.ai/community) +Most of the tests in PyTorch Lightning train a random `BoringModel` under various trainer conditions (ddp, amp, etc...). Want to add a new test case and not sure how? [Talk to us!](https://www.pytorchlightning.ai/community) ______________________________________________________________________ diff --git a/docs/source-pytorch/accelerators/gpu_intermediate.rst b/docs/source-pytorch/accelerators/gpu_intermediate.rst index f2c5f95ab95aa..4ea765d94675f 100644 --- a/docs/source-pytorch/accelerators/gpu_intermediate.rst +++ b/docs/source-pytorch/accelerators/gpu_intermediate.rst @@ -104,34 +104,6 @@ There are cases in which it is NOT possible to use DDP. Examples are: In these situations you should use `ddp_notebook` or `dp` instead. -Distributed Data Parallel 2 -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -.. 
warning:: - The DDP2 strategy is no longer supported. For single-node use, we recommend ``strategy='ddp'`` or - ``strategy='dp'`` as a replacement. If you need DDP2, you will need ``torch < 1.9``, - ``pytorch-lightning < 1.5``, and set it as ``accelerator='ddp2'``. - -In certain cases, it's advantageous to use all batches on the same machine instead of a subset. -For instance, you might want to compute a NCE loss where it pays to have more negative samples. - -In this case, we can use DDP2 which behaves like DP in a machine and DDP across nodes. DDP2 does the following: - -1. Copies a subset of the data to each node. - -2. Inits a model on each node. - -3. Runs a forward and backward pass using DP. - -4. Syncs gradients across nodes. - -5. Applies the optimizer updates. - -.. code-block:: python - - # train on 32 GPUs (4 nodes) - trainer = Trainer(accelerator="gpu", devices=8, strategy="ddp2", num_nodes=4) - Distributed Data Parallel Spawn ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ `ddp_spawn` is exactly like `ddp` except that it uses .spawn to start the training processes. diff --git a/docs/source-pytorch/common/checkpointing_expert.rst b/docs/source-pytorch/common/checkpointing_expert.rst index 665acfeef548f..f800d822aa1c5 100644 --- a/docs/source-pytorch/common/checkpointing_expert.rst +++ b/docs/source-pytorch/common/checkpointing_expert.rst @@ -97,7 +97,7 @@ Custom Checkpoint IO Plugin .. note:: - Some ``TrainingTypePlugins`` like ``DeepSpeedStrategy`` do not support custom ``CheckpointIO`` as checkpointing logic is not modifiable. + Some ``Strategy``s like ``DeepSpeedStrategy`` do not support custom ``CheckpointIO`` as checkpointing logic is not modifiable. ************************** diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index cb8bba43f39ac..59e9328d7397b 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -38,6 +38,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed all deprecated training type plugins ([#14011](https://github.com/Lightning-AI/lightning/pull/14011)) +- Removed the deprecated `DDP2Strategy` ([#14026](https://github.com/Lightning-AI/lightning/pull/14026)) + ### Fixed diff --git a/src/pytorch_lightning/plugins/__init__.py b/src/pytorch_lightning/plugins/__init__.py index ff5e812014ebe..4d4cc63d89973 100644 --- a/src/pytorch_lightning/plugins/__init__.py +++ b/src/pytorch_lightning/plugins/__init__.py @@ -30,31 +30,16 @@ "XLACheckpointIO", "HPUCheckpointIO", "ApexMixedPrecisionPlugin", - "DataParallelPlugin", - "DDP2Plugin", - "DDPPlugin", - "DDPSpawnPlugin", - "DDPFullyShardedPlugin", - "DeepSpeedPlugin", "DeepSpeedPrecisionPlugin", "DoublePrecisionPlugin", - "HorovodPlugin", - "IPUPlugin", "IPUPrecisionPlugin", "HPUPrecisionPlugin", "NativeMixedPrecisionPlugin", "PrecisionPlugin", "ShardedNativeMixedPrecisionPlugin", "FullyShardedNativeMixedPrecisionPlugin", - "SingleDevicePlugin", - "SingleTPUPlugin", "TPUPrecisionPlugin", "TPUBf16PrecisionPlugin", - "TPUSpawnPlugin", - "TrainingTypePlugin", - "ParallelPlugin", - "DDPShardedPlugin", - "DDPSpawnShardedPlugin", "LayerSync", "NativeSyncBatchNorm", ] diff --git a/src/pytorch_lightning/strategies/__init__.py b/src/pytorch_lightning/strategies/__init__.py index ab79bd4fd70d9..a85d1064e988d 100644 --- a/src/pytorch_lightning/strategies/__init__.py +++ b/src/pytorch_lightning/strategies/__init__.py @@ -13,7 +13,6 @@ # limitations under the License. 
from pytorch_lightning.strategies.bagua import BaguaStrategy # noqa: F401 from pytorch_lightning.strategies.ddp import DDPStrategy # noqa: F401 -from pytorch_lightning.strategies.ddp2 import DDP2Strategy # noqa: F401 from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy # noqa: F401 from pytorch_lightning.strategies.deepspeed import DeepSpeedStrategy # noqa: F401 from pytorch_lightning.strategies.dp import DataParallelStrategy # noqa: F401 diff --git a/src/pytorch_lightning/strategies/ddp2.py b/src/pytorch_lightning/strategies/ddp2.py deleted file mode 100644 index e13c750e0a976..0000000000000 --- a/src/pytorch_lightning/strategies/ddp2.py +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from typing import Any - - -class DDP2Strategy: - """DDP2 behaves like DP in one node, but synchronization across nodes behaves like in DDP. - - .. deprecated:: v1.7 - This strategy is no longer supported in v1.7 will be removed completely in v1.8. For single-node execution, we - recommend the :class:`~pytorch_lightning.strategies.ddp.DDPStrategy` or the - :class:`~pytorch_lightning.strategies.dp.DataParallelStrategy` as a replacement. If you rely on DDP2, you will - need ``torch < 1.9`` and ``pytorch-lightning < 1.5``. - """ - - strategy_name = "ddp2" - - def __new__(cls, *args: Any, **kwargs: Any) -> "DDP2Strategy": - raise TypeError( - "The `DDP2Strategy`/`DDP2Plugin` is no longer supported in v1.7 and will be removed completely in v1.8." - " For single-node execution, we recommend the `DDPStrategy` or the `DPStrategy`. If you rely on DDP2, you" - " will need `torch < 1.9` and `pytorch-lightning < 1.5`." - ) diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index bd879cf85ff7a..0f2947b6ec2ca 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -54,7 +54,6 @@ ) from pytorch_lightning.plugins.layer_sync import LayerSync, NativeSyncBatchNorm from pytorch_lightning.strategies import ( - DDP2Strategy, DDPFullyShardedNativeStrategy, DDPFullyShardedStrategy, DDPShardedStrategy, @@ -666,13 +665,6 @@ def _init_strategy(self) -> None: # TODO lazy initialized and setup horovod strategy `global_rank` self._handle_horovod() if isinstance(self._strategy_flag, str): - if self._strategy_flag == "ddp2": - # TODO: remove this error in v1.8 - raise ValueError( - "The DDP2 strategy is no longer supported. For single-node use, we recommend `strategy='ddp'` or" - " `strategy='dp'` as a replacement. If you need DDP2, you will need `torch < 1.9`," - " `pytorch-lightning < 1.5`, and set it as `accelerator='ddp2'`." 
- ) self.strategy = StrategyRegistry.get(self._strategy_flag) elif isinstance(self._strategy_flag, Strategy): self.strategy = self._strategy_flag @@ -840,7 +832,6 @@ def is_distributed(self) -> bool: if hasattr(self.strategy, "is_distributed") and not isinstance(self.accelerator, TPUAccelerator): return self.strategy.is_distributed distributed_strategy = ( - DDP2Strategy, DDPStrategy, DDPSpawnShardedStrategy, DDPShardedStrategy, diff --git a/src/pytorch_lightning/utilities/enums.py b/src/pytorch_lightning/utilities/enums.py index d7d3a14ec924a..e687d3f9f046b 100644 --- a/src/pytorch_lightning/utilities/enums.py +++ b/src/pytorch_lightning/utilities/enums.py @@ -120,7 +120,6 @@ class DistributedType(_DeprecatedEnum): DP = "dp" DDP = "ddp" - DDP2 = "ddp2" DDP_SPAWN = "ddp_spawn" TPU_SPAWN = "tpu_spawn" DEEPSPEED = "deepspeed" @@ -203,16 +202,15 @@ class _StrategyType(LightningEnum): """Define type of training strategy. >>> # you can match the type with string - >>> _StrategyType.DDP == 'ddp' + >>> _StrategyType.DDP == 'DDP' True >>> # which is case invariant - >>> _StrategyType.DDP2 in ('ddp2', ) + >>> _StrategyType.DP in ('dp', ) True """ DP = "dp" DDP = "ddp" - DDP2 = "ddp2" DDP_SPAWN = "ddp_spawn" DDP_FORK = "ddp_fork" TPU_SPAWN = "tpu_spawn" diff --git a/tests/README.md b/tests/README.md index e56131fb459b0..13b20ed234f5e 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1,6 +1,6 @@ # PyTorch-Lightning Tests -Most of the tests in PyTorch Lightning train a [BoringModel](https://github.com/Lightning-AI/lightning/blob/master/src/pytorch_lightning/demos/boring_classes.py) under various trainer conditions (ddp, ddp2+amp, etc...). Want to add a new test case and not sure how? [Talk to us!](https://www.pytorchlightning.ai/community) +Most of the tests in PyTorch Lightning train a [BoringModel](https://github.com/Lightning-AI/lightning/blob/master/src/pytorch_lightning/demos/boring_classes.py) under various trainer conditions (ddp, amp, etc...). Want to add a new test case and not sure how? 
[Talk to us!](https://www.pytorchlightning.ai/community) ## Running tests diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index dc28ce32d293d..a371b2b3d04bf 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -30,7 +30,7 @@ from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin from pytorch_lightning.profiler import AbstractProfiler, BaseProfiler from pytorch_lightning.profilers import AdvancedProfiler, Profiler, SimpleProfiler -from pytorch_lightning.strategies import DDP2Strategy, ParallelStrategy +from pytorch_lightning.strategies import ParallelStrategy from pytorch_lightning.strategies.ipu import LightningIPUModule from pytorch_lightning.trainer.configuration_validator import _check_datamodule_checkpoint_hooks from pytorch_lightning.trainer.states import RunningStage @@ -1098,11 +1098,3 @@ def test_trainer_tpu_cores(monkeypatch): ) ): assert trainer.tpu_cores == 8 - - -def test_unsupported_ddp2_strategy(): - with pytest.raises(TypeError, match="The `DDP2Strategy`/`DDP2Plugin` is no longer supported in v1.7 and will be"): - DDP2Strategy() - - with pytest.raises(ValueError, match="The DDP2 strategy is no longer supported."): - Trainer(strategy="ddp2") From c6cbc3dceaef8a06b4f77f5ad16964a7de93fd89 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 5 Aug 2022 10:17:55 +0900 Subject: [PATCH 093/230] Update mlflow requirement from <1.27.0,>=1.0.0 to >=1.0.0,<1.28.0 in /requirements (#14024) Update mlflow requirement in /requirements Updates the requirements on [mlflow](https://github.com/mlflow/mlflow) to permit the latest version. - [Release notes](https://github.com/mlflow/mlflow/releases) - [Changelog](https://github.com/mlflow/mlflow/blob/master/CHANGELOG.md) - [Commits](https://github.com/mlflow/mlflow/compare/1.0.0...v1.27.0) --- updated-dependencies: - dependency-name: mlflow dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements/pytorch/loggers.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/pytorch/loggers.txt b/requirements/pytorch/loggers.txt index c825275726a9a..48a15c30f842f 100644 --- a/requirements/pytorch/loggers.txt +++ b/requirements/pytorch/loggers.txt @@ -5,6 +5,6 @@ neptune-client>=0.10.0, <0.16.4 comet-ml>=3.1.12, <3.31.8 -mlflow>=1.0.0, <1.27.0 +mlflow>=1.0.0, <1.28.0 test_tube>=0.7.5, <=0.7.5 wandb>=0.10.22, <0.12.20 From a4e0bcc837b3f7536eef84b7cdc14999996e51ea Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 5 Aug 2022 01:47:16 +0000 Subject: [PATCH 094/230] Bump JamesIves/github-pages-deploy-action from 4.1.4 to 4.4.0 (#13953) Bumps [JamesIves/github-pages-deploy-action](https://github.com/JamesIves/github-pages-deploy-action) from 4.1.4 to 4.4.0. - [Release notes](https://github.com/JamesIves/github-pages-deploy-action/releases) - [Commits](https://github.com/JamesIves/github-pages-deploy-action/compare/4.1.4...v4.4.0) --- updated-dependencies: - dependency-name: JamesIves/github-pages-deploy-action dependency-type: direct:production update-type: version-update:semver-minor ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/docs-deploy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs-deploy.yml b/.github/workflows/docs-deploy.yml index 6fdc1f069652c..dd589baf2fa46 100644 --- a/.github/workflows/docs-deploy.yml +++ b/.github/workflows/docs-deploy.yml @@ -58,7 +58,7 @@ jobs: make html --jobs 2 - name: Deploy 🚀 - uses: JamesIves/github-pages-deploy-action@4.1.4 + uses: JamesIves/github-pages-deploy-action@v4.4.0 with: token: ${{ secrets.GITHUB_TOKEN }} branch: gh-pages # The branch the action should deploy to. From fb15ab72d3768fb268e4bb9056e623c190aa5127 Mon Sep 17 00:00:00 2001 From: Luca Medeiros <67411094+luca-medeiros@users.noreply.github.com> Date: Fri, 5 Aug 2022 11:16:34 +0900 Subject: [PATCH 095/230] CI/CD: Update base image `nvidia/cuda` from 11.1 to 11.1.1 (#14019) update cuda_version 11.1 -> 11.1.1 --- .github/workflows/cicd-pytorch_dockers.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cicd-pytorch_dockers.yml b/.github/workflows/cicd-pytorch_dockers.yml index b037c798bc8ee..10bc343ac5302 100644 --- a/.github/workflows/cicd-pytorch_dockers.yml +++ b/.github/workflows/cicd-pytorch_dockers.yml @@ -88,8 +88,8 @@ jobs: # the config used in '.azure-pipelines/gpu-tests.yml' - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1", ubuntu_version: "20.04"} # latest (used in Tutorials) - - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1", ubuntu_version: "20.04"} - - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.1", ubuntu_version: "20.04"} + - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1", ubuntu_version: "20.04"} + - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.1.1", ubuntu_version: "20.04"} - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"} steps: - uses: actions/checkout@v2 @@ -126,8 +126,8 @@ jobs: fail-fast: false matrix: include: - - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1"} - - {python_version: "3.8", pytorch_version: "1.10", cuda_version: "11.1"} + - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"} + - {python_version: "3.8", pytorch_version: "1.10", cuda_version: "11.1.1"} - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} # nightly: add when there's a release candidate # - {python_version: "3.9", pytorch_version: "1.12"} From 12a061f2aaefaa9ed9ccf81ab6f378835b675a7e Mon Sep 17 00:00:00 2001 From: Adam Bobowski <100693297+adam-lightning@users.noreply.github.com> Date: Fri, 5 Aug 2022 09:57:00 +0200 Subject: [PATCH 096/230] [App] Increased DeepDiff's verbose level to properly handle dict changes (#13960) --- src/lightning_app/core/api.py | 4 ++-- src/lightning_app/core/app.py | 4 ++-- src/lightning_app/utilities/proxies.py | 14 +++++++++----- src/lightning_app/utilities/scheduler.py | 4 +++- src/lightning_app/utilities/state.py | 2 +- tests/tests_app/core/test_lightning_api.py | 2 +- tests/tests_app/core/test_lightning_flow.py | 2 +- tests/tests_app/utilities/test_proxies.py | 4 +++- 8 files changed, 22 insertions(+), 14 deletions(-) diff --git a/src/lightning_app/core/api.py b/src/lightning_app/core/api.py index f38c1844e28e0..f19ada5340d57 100644 --- a/src/lightning_app/core/api.py +++ b/src/lightning_app/core/api.py @@ -258,11 +258,11 @@ async 
def post_state( last_state = global_app_state_store.get_served_state(x_lightning_session_uuid) state = deepcopy(last_state) state["app_state"]["stage"] = body["stage"] - deep_diff = DeepDiff(last_state, state) + deep_diff = DeepDiff(last_state, state, verbose_level=2) else: state = body["state"] last_state = global_app_state_store.get_served_state(x_lightning_session_uuid) - deep_diff = DeepDiff(last_state, state) + deep_diff = DeepDiff(last_state, state, verbose_level=2) update_delta = Delta(deep_diff) api_app_delta_queue.put(update_delta) diff --git a/src/lightning_app/core/app.py b/src/lightning_app/core/app.py index ab41fb256ffe6..584f94285c219 100644 --- a/src/lightning_app/core/app.py +++ b/src/lightning_app/core/app.py @@ -202,7 +202,7 @@ def set_last_state(self, state): @staticmethod def populate_changes(last_state, new_state): - diff = DeepDiff(last_state, new_state, view="tree") + diff = DeepDiff(last_state, new_state, view="tree", verbose_level=2) changes_categories = [diff[key] for key in diff.to_dict()] @@ -307,7 +307,7 @@ def maybe_apply_changes(self) -> bool: if not deltas: # When no deltas are received from the Rest API or work queues, # we need to check if the flow modified the state and populate changes. - if Delta(DeepDiff(self.last_state, self.state)).to_dict(): + if Delta(DeepDiff(self.last_state, self.state, verbose_level=2)).to_dict(): # new_state = self.populate_changes(self.last_state, self.state) self.set_state(self.state) self._has_updated = True diff --git a/src/lightning_app/utilities/proxies.py b/src/lightning_app/utilities/proxies.py index 28d436f3e4a23..2c93a6c89f38c 100644 --- a/src/lightning_app/utilities/proxies.py +++ b/src/lightning_app/utilities/proxies.py @@ -221,7 +221,7 @@ def run_once(self) -> None: self._delta_memory.clear() # The remaining delta is the result of state updates triggered outside the setattr, e.g, by a list append - delta = Delta(DeepDiff(self._last_state, self._work.state)) + delta = Delta(DeepDiff(self._last_state, self._work.state, verbose_level=2)) if not delta.to_dict(): return self._last_state = deepcopy(self._work.state) @@ -256,7 +256,7 @@ def __call__(self, name: str, value: Any) -> None: with _state_observer_lock: state = deepcopy(self.work.state) self.work._default_setattr(name, value) - delta = Delta(DeepDiff(state, self.work.state)) + delta = Delta(DeepDiff(state, self.work.state, verbose_level=2)) if not delta.to_dict(): return @@ -408,7 +408,9 @@ def run_once(self): make_status(WorkStageStatus.FAILED, message=str(e), reason=WorkFailureReasons.USER_EXCEPTION) ) self.delta_queue.put( - ComponentDelta(id=self.work_name, delta=Delta(DeepDiff(reference_state, self.work.state))) + ComponentDelta( + id=self.work_name, delta=Delta(DeepDiff(reference_state, self.work.state, verbose_level=2)) + ) ) self.work.on_exception(e) print("########## CAPTURED EXCEPTION ###########") @@ -437,7 +439,9 @@ def run_once(self): reference_state = deepcopy(self.work.state) self.work._calls[call_hash]["statuses"].append(make_status(WorkStageStatus.SUCCEEDED)) self.work._calls[call_hash]["ret"] = ret - self.delta_queue.put(ComponentDelta(id=self.work_name, delta=Delta(DeepDiff(reference_state, self.work.state)))) + self.delta_queue.put( + ComponentDelta(id=self.work_name, delta=Delta(DeepDiff(reference_state, self.work.state, verbose_level=2))) + ) # 18. Update the work for the next delta if any. 
self._proxy_setattr(cleanup=True) @@ -452,7 +456,7 @@ def _sigterm_signal_handler(self, signum, frame, call_hash: str) -> None: self.work._calls[call_hash]["statuses"].append( make_status(WorkStageStatus.STOPPED, reason=WorkStopReasons.SIGTERM_SIGNAL_HANDLER) ) - delta = Delta(DeepDiff(state, self.work.state)) + delta = Delta(DeepDiff(state, self.work.state, verbose_level=2)) self.delta_queue.put(ComponentDelta(id=self.work_name, delta=delta)) # kill the thread as the job is going to be terminated. diff --git a/src/lightning_app/utilities/scheduler.py b/src/lightning_app/utilities/scheduler.py index f4416b6119ebc..012930f017f20 100644 --- a/src/lightning_app/utilities/scheduler.py +++ b/src/lightning_app/utilities/scheduler.py @@ -37,7 +37,9 @@ def run_once(self): flow = self._app.get_component_by_name(metadata["name"]) previous_state = deepcopy(flow.state) flow._enable_schedule(call_hash) - component_delta = ComponentDelta(id=flow.name, delta=Delta(DeepDiff(previous_state, flow.state))) + component_delta = ComponentDelta( + id=flow.name, delta=Delta(DeepDiff(previous_state, flow.state, verbose_level=2)) + ) self._app.delta_queue.put(component_delta) metadata["start_time"] = next_event.isoformat() diff --git a/src/lightning_app/utilities/state.py b/src/lightning_app/utilities/state.py index 5cd7979de09d9..300bca34533df 100644 --- a/src/lightning_app/utilities/state.py +++ b/src/lightning_app/utilities/state.py @@ -130,7 +130,7 @@ def _store_state(self, state: Dict[str, Any]) -> None: def send_delta(self) -> None: app_url = f"{self._url}/api/v1/delta" - deep_diff = DeepDiff(_LAST_STATE, _STATE) + deep_diff = DeepDiff(_LAST_STATE, _STATE, verbose_level=2) assert self._plugin is not None # TODO: Find how to prevent the infinite loop on refresh without storing the DeepDiff if self._plugin.should_update_app(deep_diff): diff --git a/tests/tests_app/core/test_lightning_api.py b/tests/tests_app/core/test_lightning_api.py index 9de7c63051b63..edd2896d1951d 100644 --- a/tests/tests_app/core/test_lightning_api.py +++ b/tests/tests_app/core/test_lightning_api.py @@ -118,7 +118,7 @@ def _change_stage(self, enum): previous_state = deepcopy(self.state) current_state = self.state current_state["app_state"]["stage"] = enum.value - deep_diff = DeepDiff(previous_state, current_state) + deep_diff = DeepDiff(previous_state, current_state, verbose_level=2) self.api_delta_queue.put(Delta(deep_diff)) def maybe_apply_changes(self): diff --git a/tests/tests_app/core/test_lightning_flow.py b/tests/tests_app/core/test_lightning_flow.py index 1966c6d7b23d6..e8ce1222a3186 100644 --- a/tests/tests_app/core/test_lightning_flow.py +++ b/tests/tests_app/core/test_lightning_flow.py @@ -415,7 +415,7 @@ def run(self): work_state = flow_a.work.state flow_a.work.counter = 1 work_state_2 = flow_a.work.state - delta = Delta(DeepDiff(work_state, work_state_2)) + delta = Delta(DeepDiff(work_state, work_state_2, verbose_level=2)) delta = _delta_to_appstate_delta(flow_a, flow_a.work, delta) new_flow_state = LightningApp.populate_changes(flow_state, flow_state + delta) flow_a.set_state(new_flow_state) diff --git a/tests/tests_app/utilities/test_proxies.py b/tests/tests_app/utilities/test_proxies.py index cd0dfd7026e09..2f945ef7bea94 100644 --- a/tests/tests_app/utilities/test_proxies.py +++ b/tests/tests_app/utilities/test_proxies.py @@ -254,7 +254,9 @@ def __call__(self): "message": None, } ) - self.delta_queue.put(ComponentDelta(id=self.work_name, delta=Delta(DeepDiff(state, self.work.state)))) + self.delta_queue.put( + 
ComponentDelta(id=self.work_name, delta=Delta(DeepDiff(state, self.work.state, verbose_level=2))) + ) self.counter += 1 except Exception as e: logger.error(traceback.format_exc()) From a4e4cab7a6095867aafea7d6402dca30c6dec338 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Fri, 5 Aug 2022 14:01:19 +0530 Subject: [PATCH 097/230] Deprecate `amp_level` from `Trainer` (#13898) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- .../common/precision_intermediate.rst | 8 +++-- docs/source-pytorch/common/trainer.rst | 30 +++++-------------- src/pytorch_lightning/CHANGELOG.md | 2 +- .../connectors/accelerator_connector.py | 6 ++++ src/pytorch_lightning/trainer/trainer.py | 4 +++ .../deprecated_api/test_remove_1-10.py | 21 +++++++++++++ .../connectors/test_accelerator_connector.py | 4 ++- .../optimization/test_manual_optimization.py | 6 +++- 8 files changed, 53 insertions(+), 28 deletions(-) create mode 100644 tests/tests_pytorch/deprecated_api/test_remove_1-10.py diff --git a/docs/source-pytorch/common/precision_intermediate.rst b/docs/source-pytorch/common/precision_intermediate.rst index 9ed4c75ecc3f8..231c186baf72f 100644 --- a/docs/source-pytorch/common/precision_intermediate.rst +++ b/docs/source-pytorch/common/precision_intermediate.rst @@ -96,12 +96,16 @@ NVIDIA APEX Trainer(accelerator="gpu", devices=1, amp_backend="apex", precision=16) -Set the `NVIDIA optimization level `__ via the trainer. +Set the `NVIDIA optimization level `__ via the precision plugin. .. testcode:: :skipif: not _APEX_AVAILABLE or not torch.cuda.is_available() - Trainer(accelerator="gpu", devices=1, amp_backend="apex", amp_level="O2", precision=16) + from pytorch_lightning.plugins.apex_amp import ApexMixedPrecisionPlugin + + + apex_plugin = ApexMixedPrecisionPlugin(amp_level="O3") + Trainer(accelerator="gpu", devices=1, precision=16, plugins=[apex_plugin]) ---- diff --git a/docs/source-pytorch/common/trainer.rst b/docs/source-pytorch/common/trainer.rst index c3251917f8d3b..290d3aefb8524 100644 --- a/docs/source-pytorch/common/trainer.rst +++ b/docs/source-pytorch/common/trainer.rst @@ -313,27 +313,6 @@ Use PyTorch AMP ('native'), or NVIDIA apex ('apex'). # using NVIDIA Apex trainer = Trainer(amp_backend="apex") -amp_level -^^^^^^^^^ - -.. raw:: html - - - -| - -The optimization level to use (O1, O2, etc...) -for 16-bit GPU precision (using NVIDIA apex under the hood). - -Check `NVIDIA apex docs `_ for level - -Example:: - - # default used by the Trainer - trainer = Trainer(amp_level='O2') - auto_scale_batch_size ^^^^^^^^^^^^^^^^^^^^^ @@ -1188,13 +1167,18 @@ Half precision, or mixed precision, is the combined use of 32 and 16 bit floatin 1. `Install apex. `__ - 2. Set the ``precision`` trainer flag to 16. You can customize the `Apex optimization level `_ by setting the `amp_level` flag. + 2. Set the ``precision`` trainer flag to 16. You can customize the `Apex optimization level `_ by setting the ``amp_level`` flag + in the precision plugin. .. 
testcode:: :skipif: not _APEX_AVAILABLE or not torch.cuda.is_available() + from pytorch_lightning.plugins.apex_amp import ApexMixedPrecisionPlugin + + + apex_plugin = ApexMixedPrecisionPlugin(amp_level="O2") # turn on 16-bit - trainer = Trainer(amp_backend="apex", amp_level="O2", precision=16, accelerator="gpu", devices=1) + trainer = Trainer(accelerator="gpu", devices=1, precision=16, plugins=[apex_plugin]) profiler ^^^^^^^^ diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 59e9328d7397b..1ae21230c256f 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -24,7 +24,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Deprecated -- +- Deprecated `amp_level` from `Trainer` in favour of passing it explictly via precision plugin ([#13898](https://github.com/Lightning-AI/lightning/pull/13898)) - diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index 0f2947b6ec2ca..ccfdaa3185686 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -187,6 +187,12 @@ def __init__( self._amp_level_flag: Optional[str] = amp_level self._auto_select_gpus: bool = auto_select_gpus + if amp_level is not None: + rank_zero_deprecation( + "Setting `amp_level` inside the `Trainer` is deprecated in v1.8.0 and will be removed" + " in v1.10.0. Please set it inside the specific precision plugin and pass it to the `Trainer`." + ) + self._check_config_and_set_final_flags( strategy=strategy, accelerator=accelerator, diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index b85fc1c1e8b4c..2a1c5082a1ac8 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -209,6 +209,10 @@ def __init__( amp_level: The optimization level to use (O1, O2, etc...). By default it will be set to "O2" if ``amp_backend`` is set to "apex". + .. deprecated:: v1.8 + Setting ``amp_level`` inside the ``Trainer`` is deprecated in v1.8.0 and will be removed + in v1.10.0. Please set it inside the specific precision plugin and pass it to the ``Trainer``. + auto_lr_find: If set to True, will make trainer.tune() run a learning rate finder, trying to optimize initial learning for faster convergence. trainer.tune() method will set the suggested learning rate in self.lr or self.learning_rate in the LightningModule. diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-10.py b/tests/tests_pytorch/deprecated_api/test_remove_1-10.py new file mode 100644 index 0000000000000..6a0a458c6c041 --- /dev/null +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-10.py @@ -0,0 +1,21 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
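# --- Illustrative migration sketch (not part of the patch above) --------------
# With this deprecation, the Apex optimization level moves from a Trainer flag to
# the precision plugin (import path taken from the tests changed in this patch):
#
#     # before -- now emits a deprecation warning
#     Trainer(amp_backend="apex", amp_level="O2", precision=16, accelerator="gpu", devices=1)
#
#     # after
#     from pytorch_lightning.plugins.precision.apex_amp import ApexMixedPrecisionPlugin
#     Trainer(precision=16, accelerator="gpu", devices=1,
#             plugins=[ApexMixedPrecisionPlugin(amp_level="O2")])
# ------------------------------------------------------------------------------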
+import pytest + +from pytorch_lightning import Trainer + + +def test_deprecated_amp_level(): + with pytest.deprecated_call(match="Setting `amp_level` inside the `Trainer` is deprecated in v1.8.0"): + Trainer(amp_level="O3", amp_backend="apex") diff --git a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py index dc53fb5e36588..1296d962eea2c 100644 --- a/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_accelerator_connector.py @@ -413,7 +413,9 @@ def test_validate_precision_type(precision): def test_amp_level_raises_error_with_native(): - with pytest.raises(MisconfigurationException, match="O2'` but it's only supported with `amp_backend='apex'`"): + with pytest.deprecated_call( + match="Setting `amp_level` inside the `Trainer` is deprecated in v1.8.0" + ), pytest.raises(MisconfigurationException, match="O2'` but it's only supported with `amp_backend='apex'`"): _ = Trainer(amp_level="O2", amp_backend="native", precision=16) diff --git a/tests/tests_pytorch/trainer/optimization/test_manual_optimization.py b/tests/tests_pytorch/trainer/optimization/test_manual_optimization.py index a1a4dfca8666d..470262ceb539e 100644 --- a/tests/tests_pytorch/trainer/optimization/test_manual_optimization.py +++ b/tests/tests_pytorch/trainer/optimization/test_manual_optimization.py @@ -23,6 +23,7 @@ from pytorch_lightning import seed_everything, Trainer from pytorch_lightning.demos.boring_classes import BoringModel +from pytorch_lightning.plugins.precision.apex_amp import ApexMixedPrecisionPlugin from pytorch_lightning.strategies import Strategy from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12 from tests_pytorch.helpers.runif import RunIf @@ -77,7 +78,7 @@ def configure_optimizers(self): {"accelerator": "gpu", "devices": 1, "precision": 16, "amp_backend": "native"}, marks=RunIf(min_cuda_gpus=1) ), pytest.param( - {"accelerator": "gpu", "devices": 1, "precision": 16, "amp_backend": "apex", "amp_level": "O2"}, + {"accelerator": "gpu", "devices": 1, "precision": 16, "amp_backend": "apex"}, marks=RunIf(min_cuda_gpus=1, amp_apex=True), ), ], @@ -119,6 +120,8 @@ def on_train_end(self): model.val_dataloader = None limit_train_batches = 2 + plugins = [ApexMixedPrecisionPlugin(amp_level="O2")] if kwargs.get("amp_backend") == "apex" else [] + trainer = Trainer( default_root_dir=tmpdir, limit_train_batches=limit_train_batches, @@ -126,6 +129,7 @@ def on_train_end(self): max_epochs=1, log_every_n_steps=1, enable_model_summary=False, + plugins=plugins, **kwargs, ) From 3d5c3d24f9d644a4de0be4975dc1f96042ca9754 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 5 Aug 2022 10:49:00 +0200 Subject: [PATCH 098/230] Remove unused auto_collect_arguments class method (#14015) --- src/pytorch_lightning/core/module.py | 30 ---------------------- tests/tests_pytorch/models/test_hparams.py | 2 +- 2 files changed, 1 insertion(+), 31 deletions(-) diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py index b8cc1d91cde18..f58503edd88cb 100644 --- a/src/pytorch_lightning/core/module.py +++ b/src/pytorch_lightning/core/module.py @@ -14,7 +14,6 @@ """The LightningModule - an nn.Module with many additional features.""" import collections.abc -import inspect import logging import numbers import os @@ -46,7 +45,6 @@ from pytorch_lightning.utilities.distributed import distributed_available, sync_ddp from 
pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_11, _TORCH_GREATER_EQUAL_1_13 -from pytorch_lightning.utilities.parsing import collect_init_args from pytorch_lightning.utilities.rank_zero import rank_zero_debug, rank_zero_deprecation, rank_zero_warn from pytorch_lightning.utilities.signature_utils import is_param_in_hook_signature from pytorch_lightning.utilities.types import _METRIC_COLLECTION, EPOCH_OUTPUT, LRSchedulerTypeUnion, STEP_OUTPUT @@ -1782,34 +1780,6 @@ def _verify_is_manual_optimization(self, fn_name): " set model property `automatic_optimization` as False" ) - @classmethod - def _auto_collect_arguments(cls, frame=None) -> Tuple[Dict, Dict]: - """Collect all module arguments in the current constructor and all child constructors. The child - constructors are all the ``__init__`` methods that reach the current class through (chained) - ``super().__init__()`` calls. - - Args: - frame: instance frame - - Returns: - self_arguments: arguments dictionary of the first instance - parents_arguments: arguments dictionary of the parent's instances - """ - if not frame: - frame = inspect.currentframe() - - frame_args = collect_init_args(frame.f_back, []) - self_arguments = frame_args[-1] - - # set hyper_parameters in child - self_arguments = self_arguments - parents_arguments = {} - - # add all arguments from parents - for args in frame_args[:-1]: - parents_arguments.update(args) - return self_arguments, parents_arguments - @torch.no_grad() def to_onnx(self, file_path: Union[str, Path], input_sample: Optional[Any] = None, **kwargs): """Saves the model in ONNX format. diff --git a/tests/tests_pytorch/models/test_hparams.py b/tests/tests_pytorch/models/test_hparams.py index 20bdfda5dc224..c130381c7832d 100644 --- a/tests/tests_pytorch/models/test_hparams.py +++ b/tests/tests_pytorch/models/test_hparams.py @@ -410,7 +410,7 @@ def __init__(self, arg1, arg2, *args, **kwargs): class LocalVariableModelSuperFirst(BoringModel): - """This model has the _auto_collect_arguments() call at the end.""" + """This model has the save_hyperparameters() call at the end.""" def __init__(self, arg1, arg2, *args, **kwargs): super().__init__(*args, **kwargs) From 0883971ccb973be3f8d0495f442f80f253fd45bb Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 5 Aug 2022 18:04:45 +0900 Subject: [PATCH 099/230] CI: Update XLA from 1.9 to 1.12 (#14013) --- .circleci/config.yml | 2 +- .github/workflows/README.md | 2 +- .github/workflows/cicd-pytorch_dockers.yml | 2 +- dockers/base-xla/Dockerfile | 6 +++++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 7ac10195c75a9..5c314d4e6e5c1 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -113,7 +113,7 @@ jobs: docker: - image: circleci/python:3.7 environment: - - XLA_VER: 1.9 + - XLA_VER: 1.12 - PYTHON_VER: 3.7 - MAX_CHECKS: 1000 - CHECK_SPEEP: 5 diff --git a/.github/workflows/README.md b/.github/workflows/README.md index d67bf92d6c048..8b9e7d173b03c 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -13,7 +13,7 @@ | pytorch-lightning (HPUs) | .azure-pipelines/hpu-tests.yml | Run only HPU-specific tests. | HPU | (3.8, 1.10) | linux | | pytorch-lightning (GPUs) | .azure-pipelines/gpu-tests.yml | Run all CPU and GPU-specific tests, standalone, and examples. Each standalone test needs to be run in separate processes to avoid unwanted interactions between test cases. 
| GPU | (3.9, 1.12) | linux | | PyTorchLightning.Benchmark | .azure-pipelines/gpu-benchmark.yml | Run speed/memory benchmarks for parity with pure PyTorch. | GPU | (3.9, 1.12) | linux | -| test-on-tpus | .circleci/config.yml | Run only TPU-specific tests. | TPU | (3.7, 1.9) | linux | +| test-on-tpus | .circleci/config.yml | Run only TPU-specific tests. | TPU | (3.7, 1.12) | linux | - \*Accelerators used in CI - GPU: 2 x NVIDIA Tesla V100 diff --git a/.github/workflows/cicd-pytorch_dockers.yml b/.github/workflows/cicd-pytorch_dockers.yml index 10bc343ac5302..a6ba2ac4aa5f4 100644 --- a/.github/workflows/cicd-pytorch_dockers.yml +++ b/.github/workflows/cicd-pytorch_dockers.yml @@ -51,7 +51,7 @@ jobs: matrix: # the config used in '.circleci/config.yml`' python_version: ["3.7"] - xla_version: ["1.11"] + xla_version: ["1.12"] steps: - uses: actions/checkout@v2 - uses: docker/setup-buildx-action@v2 diff --git a/dockers/base-xla/Dockerfile b/dockers/base-xla/Dockerfile index 977aee878ffcd..3cc43e6e1a4f3 100644 --- a/dockers/base-xla/Dockerfile +++ b/dockers/base-xla/Dockerfile @@ -19,7 +19,7 @@ LABEL maintainer="Lightning-AI " # CALL: docker image build -t pytorch-lightning:XLA-image -f dockers/base-xla/Dockerfile . --build-arg PYTHON_VERSION=3.8 ARG PYTHON_VERSION=3.9 ARG CONDA_VERSION=4.9.2 -ARG XLA_VERSION=1.11 +ARG XLA_VERSION=1.12 SHELL ["/bin/bash", "-c"] # for skipping configurations @@ -92,6 +92,10 @@ RUN \ python --version && \ cd pytorch-lightning && \ pip install -q fire && \ + # Pin mkl version to avoid OSError on torch import + # OSError: libmkl_intel_lp64.so.1: cannot open shared object file: No such file or directory + # https://github.com/pytorch/xla/issues/1666 + pip install mkl==2021.4.0 && \ # drop packages installed with XLA python .actions/assistant.py requirements_prune_pkgs torch,torchvision && \ # drop unnecessary packages From 91dd6a68fb596d45914fc5d4fbbf2bad52e8399e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 5 Aug 2022 14:20:27 +0200 Subject: [PATCH 100/230] Remove meta device utilities in favor of torchdistx (#13868) --- pyproject.toml | 1 - .../core/mixins/device_dtype_mixin.py | 8 +- .../strategies/launchers/multiprocessing.py | 11 + src/pytorch_lightning/trainer/trainer.py | 22 +- src/pytorch_lightning/utilities/cli.py | 14 +- src/pytorch_lightning/utilities/data.py | 17 +- src/pytorch_lightning/utilities/meta.py | 340 +++--------------- .../deprecated_api/test_remove_1-9.py | 14 + .../strategies/test_deepspeed_strategy.py | 20 -- tests/tests_pytorch/utilities/test_meta.py | 84 ----- .../utilities/test_torchdistx.py | 92 +++++ 11 files changed, 194 insertions(+), 429 deletions(-) delete mode 100644 tests/tests_pytorch/utilities/test_meta.py create mode 100644 tests/tests_pytorch/utilities/test_torchdistx.py diff --git a/pyproject.toml b/pyproject.toml index 226b109459f24..5473e73c52e19 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,6 +71,5 @@ module = [ "pytorch_lightning.tuner.batch_size_scaling", "pytorch_lightning.utilities.auto_restart", "pytorch_lightning.utilities.data", - "pytorch_lightning.utilities.meta", ] ignore_errors = "True" diff --git a/src/pytorch_lightning/core/mixins/device_dtype_mixin.py b/src/pytorch_lightning/core/mixins/device_dtype_mixin.py index b12e1cf042a1f..62e81e4839da6 100644 --- a/src/pytorch_lightning/core/mixins/device_dtype_mixin.py +++ b/src/pytorch_lightning/core/mixins/device_dtype_mixin.py @@ -18,8 +18,6 @@ from torch.nn import Module from typing_extensions import Self -import 
pytorch_lightning as pl - class DeviceDtypeModuleMixin(Module): __jit_unused_properties__ = ["device", "dtype"] @@ -180,10 +178,8 @@ def half(self) -> Self: # type: ignore[valid-type] def __update_properties( self, device: Optional[torch.device] = None, dtype: Optional[Union[str, torch.dtype]] = None ) -> None: - def apply_fn(module: Union["DeviceDtypeModuleMixin", Module]) -> None: - # TODO: Find why `isinstance(module, DeviceDtypeModuleMixin)` doesn't - # work when using `init_meta_context`. - if not isinstance(module, (DeviceDtypeModuleMixin, pl.LightningModule)): + def apply_fn(module: Union[DeviceDtypeModuleMixin, Module]) -> None: + if not isinstance(module, DeviceDtypeModuleMixin): return if device is not None: module._device = device diff --git a/src/pytorch_lightning/strategies/launchers/multiprocessing.py b/src/pytorch_lightning/strategies/launchers/multiprocessing.py index 91fa92b555ae0..39bba092e9c60 100644 --- a/src/pytorch_lightning/strategies/launchers/multiprocessing.py +++ b/src/pytorch_lightning/strategies/launchers/multiprocessing.py @@ -87,6 +87,7 @@ def launch(self, function: Callable, *args: Any, trainer: Optional["pl.Trainer"] a selected set of attributes get restored in the main process after processes join. **kwargs: Optional keyword arguments to be passed to the given function. """ + self._check_torchdistx_support() # The default cluster environment in Lightning chooses a random free port number # This needs to be done in the main process here before starting processes to ensure each rank will connect # through the same port @@ -178,6 +179,16 @@ def _collect_rank_zero_results(self, trainer: "pl.Trainer", results: Any) -> Opt return _WorkerOutput(best_model_path, weights_path, trainer.state, results, extra) + def _check_torchdistx_support(self) -> None: + if self._start_method == "spawn": + from pytorch_lightning.utilities.meta import _is_deferred + + if _is_deferred(self._strategy.lightning_module): + raise NotImplementedError( + f"The `{type(self._strategy).__name__}` strategy does not support `torchdistx`'s deferred" + f" initialization." + ) + def add_to_queue(self, trainer: "pl.Trainer", queue: "_FakeQueue") -> None: """Appends the :attr:`trainer.callback_metrics` dictionary to the given queue. To avoid issues with memory sharing, we cast the data to numpy. 
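# --- Illustrative sketch (not part of the patch above) ------------------------
# The spawn launcher above explicitly rejects deferred-initialized (torchdistx
# "fake") modules, while the Trainer (see the trainer.py hunk that follows)
# materializes them before calling configure_sharded_model. The intended flow,
# based on the ``test_torchdistx.py`` file added later in this patch (requires
# the optional ``torchdistx`` package):
from torchdistx.deferred_init import deferred_init, materialize_module
from torchdistx.fake import is_fake

from pytorch_lightning.demos.boring_classes import BoringModel

model = deferred_init(BoringModel)  # parameters are created lazily as "fake" tensors
assert is_fake(model.layer.weight)

materialize_module(model)  # the Trainer does this inside _call_configure_sharded_model
assert not is_fake(model.layer.weight)
# ------------------------------------------------------------------------------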
diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index 2a1c5082a1ac8..6853c4328af46 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -70,7 +70,6 @@ XLAProfiler, ) from pytorch_lightning.strategies import ParallelStrategy, Strategy -from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy from pytorch_lightning.trainer.callback_hook import TrainerCallbackHookMixin from pytorch_lightning.trainer.configuration_validator import verify_loop_configurations from pytorch_lightning.trainer.connectors.accelerator_connector import _LITERAL_WARN, AcceleratorConnector @@ -106,8 +105,7 @@ from pytorch_lightning.utilities.data import _auto_add_worker_init_fn, has_len_all_ranks from pytorch_lightning.utilities.distributed import distributed_available from pytorch_lightning.utilities.exceptions import ExitGracefullyException, MisconfigurationException -from pytorch_lightning.utilities.imports import _fault_tolerant_training -from pytorch_lightning.utilities.meta import is_on_meta_device, materialize_module +from pytorch_lightning.utilities.imports import _fault_tolerant_training, _module_available from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_info, rank_zero_warn from pytorch_lightning.utilities.seed import isolate_rng @@ -1469,20 +1467,14 @@ def _call_setup_hook(self) -> None: def _call_configure_sharded_model(self) -> None: with self.strategy.model_sharded_context(): - self._handle_meta_model() - self._call_lightning_module_hook("configure_sharded_model") - self._call_callback_hooks("on_configure_sharded_model") - - def _handle_meta_model(self) -> None: - if not is_on_meta_device(self.lightning_module): - return + # experimental support for torchdistx + if _module_available("torchdistx.deferred_init"): + from torchdistx.deferred_init import materialize_module - if isinstance(self.strategy, DDPSpawnStrategy): - raise MisconfigurationException("LightningModule on meta device isn't supported with spawn.") + materialize_module(self.lightning_module) - materialize_module(self.lightning_module) - # the trainer reference is lost during materialization - self.lightning_module.trainer = proxy(self) + self._call_lightning_module_hook("configure_sharded_model") + self._call_callback_hooks("on_configure_sharded_model") def _call_teardown_hook(self) -> None: fn = self.state.fn._setup_fn diff --git a/src/pytorch_lightning/utilities/cli.py b/src/pytorch_lightning/utilities/cli.py index 285b5361f9cd2..8af919b78ce93 100644 --- a/src/pytorch_lightning/utilities/cli.py +++ b/src/pytorch_lightning/utilities/cli.py @@ -22,7 +22,7 @@ import pytorch_lightning as pl import pytorch_lightning.cli as new_cli -from pytorch_lightning.utilities.meta import get_all_subclasses +from pytorch_lightning.utilities.meta import _get_all_subclasses from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation _deprecate_registry_message = ( @@ -108,17 +108,17 @@ def _populate_registries(subclasses: bool) -> None: # Remove in v1.9 if subclasses: rank_zero_deprecation(_deprecate_auto_registry_message) # this will register any subclasses from all loaded modules including userland - for cls in get_all_subclasses(torch.optim.Optimizer): + for cls in _get_all_subclasses(torch.optim.Optimizer): OPTIMIZER_REGISTRY(cls, show_deprecation=False) - for cls in get_all_subclasses(torch.optim.lr_scheduler._LRScheduler): + for cls in 
_get_all_subclasses(torch.optim.lr_scheduler._LRScheduler): LR_SCHEDULER_REGISTRY(cls, show_deprecation=False) - for cls in get_all_subclasses(pl.Callback): + for cls in _get_all_subclasses(pl.Callback): CALLBACK_REGISTRY(cls, show_deprecation=False) - for cls in get_all_subclasses(pl.LightningModule): + for cls in _get_all_subclasses(pl.LightningModule): MODEL_REGISTRY(cls, show_deprecation=False) - for cls in get_all_subclasses(pl.LightningDataModule): + for cls in _get_all_subclasses(pl.LightningDataModule): DATAMODULE_REGISTRY(cls, show_deprecation=False) - for cls in get_all_subclasses(pl.loggers.Logger): + for cls in _get_all_subclasses(pl.loggers.Logger): LOGGER_REGISTRY(cls, show_deprecation=False) else: # manually register torch's subclasses and our subclasses diff --git a/src/pytorch_lightning/utilities/data.py b/src/pytorch_lightning/utilities/data.py index 862c7f2de905b..00a7cb8486709 100644 --- a/src/pytorch_lightning/utilities/data.py +++ b/src/pytorch_lightning/utilities/data.py @@ -18,7 +18,7 @@ from contextlib import contextmanager from dataclasses import fields from functools import partial -from typing import Any, Callable, Dict, Generator, Iterable, Mapping, Optional, Set, Tuple, Type, Union +from typing import Any, Callable, Dict, Generator, Iterable, Mapping, Optional, Tuple, Type, Union import torch from torch import Tensor @@ -39,6 +39,7 @@ from pytorch_lightning.utilities.auto_restart import CaptureIterableDataset, CaptureMapDataset, FastForwardSampler from pytorch_lightning.utilities.enums import _FaultTolerantMode from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.meta import _get_all_subclasses from pytorch_lightning.utilities.rank_zero import rank_zero_warn from pytorch_lightning.utilities.seed import pl_worker_init_function from pytorch_lightning.utilities.warnings import WarningCache @@ -493,20 +494,6 @@ def wrapper(obj: Any, *args: Any, **kwargs: Any) -> None: return wrapper -# https://stackoverflow.com/a/63851681/9201239 -def _get_all_subclasses(cls: Type[Any]) -> Set[Type[Any]]: - """Returns a list of all classes that inherit directly or indirectly from the given class.""" - subclasses = set() - - def recurse(cl: Type[Any]) -> None: - for subclass in cl.__subclasses__(): - subclasses.add(subclass) - recurse(subclass) - - recurse(cls) - return subclasses - - @contextmanager def _replace_init_method(base_cls: Type, store_explicit_arg: Optional[str] = None) -> Generator[None, None, None]: """This context manager is used to add support for re-instantiation of custom (subclasses) of `base_cls`. diff --git a/src/pytorch_lightning/utilities/meta.py b/src/pytorch_lightning/utilities/meta.py index 77da02f7231d4..9f4cd72bfe65d 100644 --- a/src/pytorch_lightning/utilities/meta.py +++ b/src/pytorch_lightning/utilities/meta.py @@ -11,149 +11,46 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import importlib -import inspect -import operator -import threading from contextlib import contextmanager -from functools import partial -from itertools import chain -from types import ModuleType -from typing import Any, Callable, Dict, Generator, Iterator, List, Optional, Set, Type +from typing import Any, Callable, Generator, Mapping, Optional, Set, Type, Union -import torch -from torch import nn, Tensor -from torch.nn import Module -from torch.nn.modules.container import ModuleDict, ModuleList, Sequential +from torch import Tensor +from torch.nn import Module, Parameter -import pytorch_lightning as pl -from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _compare_version -from pytorch_lightning.utilities.rank_zero import rank_zero_warn +from pytorch_lightning.utilities import rank_zero_deprecation +from pytorch_lightning.utilities.imports import _module_available -_TORCH_GREATER_EQUAL_1_10 = _compare_version("torch", operator.ge, "1.10.0") -if _TORCH_GREATER_EQUAL_1_10: - from torch._C import _DisableTorchDispatch # type: ignore[attr-defined] - - #################################################################### - # BELOW: TAKEN FROM https://github.com/pytorch/pytorch/pull/66317. # - # TODO: Removed once merged and released on PyTorch side # - #################################################################### - - @contextmanager - def enable_python_mode(cls) -> Iterator[None]: - if not hasattr(cls, "__torch_dispatch__"): - raise ValueError("The class passed to enable_python_mode " "must have a __torch_dispatch__ classmethod") - if not isinstance(cls, type) or not issubclass(cls, (Tensor,)): - raise ValueError("The argument passed to enable_python_mode " "must be the type of a Tensor subclass") - torch._C._enter_python_mode(cls) - try: - yield - finally: - torch._C._exit_python_mode() - - _tls = threading.local() - _tls.in_call = False - - @contextmanager - def _no_dispatch() -> Iterator[None]: - """Temporarily disables the Python dispatch mode.""" - guard = _DisableTorchDispatch() # noqa F841 - try: - yield - finally: - del guard - - def _handle_arange(func, args, kwargs): - kwargs["device"] = torch.device("cpu") - return torch.empty_like(func(*args, **kwargs), device="meta") - - def _handle_tril(func, args, kwargs): - if args and isinstance(args[0], Tensor): - return torch.empty_like(args[0], device="meta") - - return NotImplemented - - class _MetaContext(Tensor): - _op_handlers: Dict[Callable, Callable] = {} - - @classmethod - def _ensure_handlers_initialized(cls) -> None: - if cls._op_handlers: - return - - cls._op_handlers.update( - { - torch.ops.aten.arange: _handle_arange, - torch.ops.aten.tril: _handle_tril, - } - ) - - @classmethod - def __torch_dispatch__(cls, func, types, args=(), kwargs=None): - cls._ensure_handlers_initialized() - - op_handler: Optional[Callable] - - try: - op_handler = cls._op_handlers[func] - except KeyError: - op_handler = None - - with _no_dispatch(): - if op_handler: - result = op_handler(func, args, kwargs) - if result is not NotImplemented: - return result - - if "device" in kwargs: - kwargs["device"] = torch.device("meta") - - return func(*args, **kwargs) - - def init_meta(module_fn: Callable[..., Module], *args, **kwargs) -> Module: - def create_instance(module=None) -> Module: - if module: - module.__init__(*args, **kwargs) - return module - return module_fn(*args, **kwargs) - - if _tls.in_call: - module = create_instance() - else: - _tls.in_call = True - try: - with 
enable_python_mode(_MetaContext): - module = create_instance() - finally: - _tls.in_call = False - - module.materialize = partial(create_instance, module=module) # type: ignore[assignment] - - return module +def is_meta_init() -> bool: + rank_zero_deprecation( + "`pytorch_lightning.utilities.meta.is_meta_init` is deprecated in v1.8 and will be removed in v1.9." + " The function has become a no-op." + " Please check out the `torchdistx` project instead: https://github.com/pytorch/torchdistx" + ) + return False - def is_meta_init() -> bool: - """Indicates whether the module is being instantiated by ``init_meta()``.""" - return _tls.in_call - #################################################################### - # ABOVE: TAKEN FROM https://github.com/pytorch/pytorch/pull/66317. # - # TODO: Removed once merged and released on PyTorch side # - #################################################################### +def init_meta(module_fn: Callable[..., Module], *args: Any, **kwargs: Any) -> None: + rank_zero_deprecation( + "`pytorch_lightning.utilities.meta.init_meta` is deprecated in v1.8 and will be removed in v1.9." + " The function has become a no-op." + " Please check out the `torchdistx` project instead: https://github.com/pytorch/torchdistx" + ) -else: - def init_meta(*_, **__): - if not _TORCH_GREATER_EQUAL_1_10: - return MisconfigurationException("`init_meta` is supported from PyTorch 1.10.0") +def get_all_subclasses(cls: Type) -> Set[Type]: + rank_zero_deprecation( + "`pytorch_lightning.utilities.meta.get_all_subclasses` is deprecated in v1.8 and will be removed in v1.9." + " Please copy its implementation if you have a use for it." + ) + return _get_all_subclasses(cls) # https://stackoverflow.com/a/63851681/9201239 -def get_all_subclasses(cls: Type) -> Set[Type]: +def _get_all_subclasses(cls: Type) -> Set[Type]: subclass_list = [] - def recurse(cl): + def recurse(cl: Type) -> None: for subclass in cl.__subclasses__(): subclass_list.append(subclass) recurse(subclass) @@ -163,7 +60,11 @@ def recurse(cl): return set(subclass_list) -def recursively_setattr(root_module: nn.Module, prefix: str, materialized_module: nn.Module) -> None: +def recursively_setattr(root_module: Any, prefix: str, materialized_module: Module) -> None: + rank_zero_deprecation( + "`pytorch_lightning.utilities.meta.recursively_setattr` is deprecated in v1.8 and will be removed in v1.9." + " Please copy its implementation if you have a use for it." + ) *path, name = prefix.split(".") for p in path: root_module = getattr(root_module, p) @@ -175,166 +76,43 @@ def recursively_setattr(root_module: nn.Module, prefix: str, materialized_module setattr(root_module, name, materialized_module) -def materialize_module(root_module: nn.Module) -> nn.Module: - """This utility performs an in-place operation by materialize a module and its children.""" - if not _TORCH_GREATER_EQUAL_1_10: - return root_module - - materialize_fn = getattr(root_module, "materialize", None) - if materialize_fn and not isinstance(root_module, (Sequential, ModuleList, ModuleDict)): - return materialize_fn() - - for name, child in root_module.named_children(): - materialize_fn = getattr(child, "materialize", None) - if not materialize_fn or isinstance(child, (Sequential, ModuleList, ModuleDict)): - materialize_module(child) - else: - setattr(root_module, name, materialize_fn()) - return root_module - - -# cache subclasses to optimize the search when resetting the meta device later on. 
-__STORAGE_META__ = {} -__CREATED_MODULES__ = set() - - -def _unset_meta_device(from_created: bool = False) -> None: - """Replace all meta module by their original version.""" - if not _TORCH_GREATER_EQUAL_1_10: - raise MisconfigurationException("`init_meta` is supported from PyTorch 1.10.0") - - if from_created: - values = [__STORAGE_META__[key] for key in __CREATED_MODULES__] - else: - values = __STORAGE_META__.values() - - for mods, subclass, _ in values: - for mod in mods: - setattr(mod, subclass.__name__, subclass) - - -def _set_meta_device_populated(from_created: bool = False) -> None: - """Replace all meta module by their original version.""" - if not _TORCH_GREATER_EQUAL_1_10: - raise MisconfigurationException("`init_meta` is supported from PyTorch 1.10.0") - - if from_created: - values = [__STORAGE_META__[key] for key in __CREATED_MODULES__] - else: - values = __STORAGE_META__.values() - - for mods, subclass, meta_class in values: - for mod in mods: - setattr(mod, subclass.__name__, meta_class) - - -def _set_meta_device() -> None: - """Replace all torch.nn.Module by their meta replacement.""" - - if not _TORCH_GREATER_EQUAL_1_10: - raise MisconfigurationException("`init_meta` is supported from PyTorch 1.10.0") - - # Author note: This can be optimized further by searching all subclasses at once. - # Its time complexity is O(n*m) where n is the number of all subclasses if there's no multiple inheritance - # and m the number of all subclasses belonging to its subclass module. - - for subclass in get_all_subclasses(torch.nn.modules.module.Module): - - if subclass in (Sequential, ModuleList, ModuleDict, pl.LightningModule): - continue - - # if a subclass has already been stored, we should use the cache - if str(subclass) in __STORAGE_META__: - # reset the class import package to its rightful state. - mods, subclass, meta_class = __STORAGE_META__[subclass] - for mod in mods: - setattr(mod, subclass.__name__, meta_class) - continue - - class _IsinstanceMetaclass(type(subclass)): - def __instancecheck__(self, instance: Any) -> bool: - """Overrides the ``isinstance`` check on ``_MaterializerModule`` objects.""" - return isinstance(instance, self.__bases__[0]) - - # Create a class subclassing current `subclass` overriding its new method. - # this will enable use to use `torch.distributed.nn.utils.init_meta` to create a `meta` - # version of the current subclass module - class _MaterializerModule(subclass, metaclass=_IsinstanceMetaclass): - @classmethod - @contextmanager - def instantiation_context(cls): - _unset_meta_device(from_created=True) - yield - _set_meta_device_populated(from_created=True) - - @classmethod - def materialize(cls, materialize_fn: Callable): - with cls.instantiation_context(): - obj = materialize_fn() - return obj - - @staticmethod - def add_subclasses(subclass): - """This is used to unroll the instantiation tree while creating the modules.""" - # Don't store the LightningModule as skipped from the Meta process. 
- if subclass != pl.LightningModule: - __CREATED_MODULES__.add(subclass) - if subclass.__bases__[0] != torch.nn.modules.module.Module: - _MaterializerModule.add_subclasses(subclass.__bases__[0]) - - def __new__(cls, *args, **kwargs): - subclass = cls.__bases__[0] - cls.add_subclasses(subclass) - with cls.instantiation_context(): - obj = init_meta(subclass, *args, **kwargs) - - obj.materialize = partial(cls.materialize, materialize_fn=obj.materialize) - return obj - - def search(mod: ModuleType) -> List[ModuleType]: - out = [] - for _, obj in inspect.getmembers(mod): - if obj == subclass: - out.append(mod) - return out - - submodules = subclass.__module__.split(".") - mod = importlib.import_module(submodules[0]) - - # nn.Module class can be imported at different level and they all need to be mocked. - # Example: torch.nn.Linear is actually torch.nn.modules.linear.Linear - # Therefore, torch.nn.Linear, torch.nn.modules.Linear, torch.nn.modules.linear.Linear - # needs to be replaced by the torch.nn.linear.modules.Linear _MaterializerModule - out = [search(mod)] - for name in submodules[1:]: - mod = getattr(mod, name) - out.append(search(mod)) - - # drop empty module - mods = [mod for mod in chain(*out) if mod] - - # store the modules search so it doesn't have to be performed again for this class - __STORAGE_META__[subclass] = (mods, subclass, _MaterializerModule) - - # replace all subclass by its meta form - for mod in mods: - setattr(mod, subclass.__name__, _MaterializerModule) +def materialize_module(root_module: Module) -> None: + rank_zero_deprecation( + "`pytorch_lightning.utilities.meta.materialize_module` is deprecated in v1.8 and will be removed in v1.9." + " The function has become a no-op." + " Please check out the `torchdistx` project instead: https://github.com/pytorch/torchdistx" + ) @contextmanager def init_meta_context() -> Generator: - rank_zero_warn( - "Be aware this feature is highly experimental and there are a number of weird edge cases " - "where it can internal assert and/or crash. A more stable version is to be expected from PyTorch 1.11." + rank_zero_deprecation( + "`pytorch_lightning.utilities.meta.init_meta_context` is deprecated in v1.8 and will be removed in v1.9." + " The function has become a no-op." + " Please check out the `torchdistx` project instead: https://github.com/pytorch/torchdistx" ) - _set_meta_device() yield - _unset_meta_device() -def is_on_meta_device(module: nn.Module) -> bool: +def is_on_meta_device(module: Module) -> bool: + rank_zero_deprecation( + "`pytorch_lightning.utilities.meta.is_on_meta_device` is deprecated in v1.8 and will be removed in v1.9." + " Please copy its implementation if you have a use for it." 
+ ) try: param = next(module.parameters()) - return param.device.type == "meta" + return param.is_meta except StopIteration: return False + + +def _is_deferred(module: Optional[Module]) -> bool: + if module is None or not _module_available("torchdistx.fake"): + return False + from torchdistx.fake import is_fake + + def any_fake(tensors: Mapping[str, Optional[Union[Tensor, Parameter]]]) -> bool: + return any(is_fake(t) for t in tensors.values() if t is not None) + + is_deferred = any(_is_deferred(m) for m in module.children()) + return is_deferred or any_fake(module._parameters) or any_fake(module._buffers) diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-9.py b/tests/tests_pytorch/deprecated_api/test_remove_1-9.py index 54c59bec62b5d..dcd8ecfd0169c 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-9.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-9.py @@ -217,3 +217,17 @@ def test_gpu_accelerator_deprecation_warning(): ) ): GPUAccelerator() + + +def test_meta_utility_deprecations(): + import pytorch_lightning.utilities.meta as meta + + pytest.deprecated_call(meta.is_meta_init, match="is_meta_init.*removed in v1.9") + pytest.deprecated_call(meta.init_meta, Mock(), match="init_meta.*removed in v1.9") + pytest.deprecated_call(meta.get_all_subclasses, Mock, match="get_all_subclasses.*removed in v1.9") + pytest.deprecated_call(meta.recursively_setattr, Mock(), "foo", 1, match="recursively_setattr.*removed in v1.9") + pytest.deprecated_call(meta.materialize_module, Mock(), match="materialize_module.*removed in v1.9") + with pytest.deprecated_call(match="init_meta_context.*removed in v1.9"): + with meta.init_meta_context(): + pass + pytest.deprecated_call(meta.is_on_meta_device, LightningModule(), match="is_on_meta_device.*removed in v1.9") diff --git a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py index 1f955a2520faa..14f7ab1e79b08 100644 --- a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py +++ b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py @@ -33,7 +33,6 @@ from pytorch_lightning.strategies import DeepSpeedStrategy from pytorch_lightning.strategies.deepspeed import _DEEPSPEED_AVAILABLE, LightningDeepSpeedModule from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.meta import init_meta_context from tests_pytorch.helpers.datamodules import ClassifDataModule from tests_pytorch.helpers.datasets import RandomIterableDataset from tests_pytorch.helpers.runif import RunIf @@ -1232,25 +1231,6 @@ def on_test_batch_start( trainer.test(model) -@RunIf(min_cuda_gpus=2, min_torch="1.10.0", max_torch="1.12.0", standalone=True, deepspeed=True) -def test_deepspeed_with_meta_device(tmpdir): - with init_meta_context(): - model = BoringModel() - assert model.layer.weight.device.type == "meta" - trainer = Trainer( - default_root_dir=tmpdir, - strategy=DeepSpeedStrategy(stage=3), - accelerator="gpu", - devices=2, - fast_dev_run=True, - precision=16, - enable_progress_bar=False, - enable_model_summary=False, - ) - trainer.fit(model) - assert model.layer.weight.device.type == "cpu" - - @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True) def test_deepspeed_multi_save_same_filepath(tmpdir): """Test that verifies that deepspeed saves only latest checkpoint in the specified path and deletes the old diff --git a/tests/tests_pytorch/utilities/test_meta.py b/tests/tests_pytorch/utilities/test_meta.py deleted file mode 100644 index 
f7fcce4cb835e..0000000000000 --- a/tests/tests_pytorch/utilities/test_meta.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import pytest -from torch import nn - -from pytorch_lightning.core.module import LightningModule -from pytorch_lightning.demos.boring_classes import BoringModel -from pytorch_lightning.utilities.meta import init_meta_context, is_on_meta_device, materialize_module -from tests_pytorch.helpers.runif import RunIf - - -class MLP(nn.Module): - def __init__(self, num_layers: int): - super().__init__() - self.layer = nn.Sequential(*[nn.Linear(1, 1) for _ in range(num_layers)] + [nn.Dropout(), nn.LayerNorm(1)]) - - -class SimpleBoringModel(LightningModule): - def __init__(self, num_layers: int): - super().__init__() - self.save_hyperparameters() - self.layer = nn.Sequential(*[nn.Linear(1, 1) for _ in range(self.hparams.num_layers)]) - - -@RunIf(min_torch="1.10.0", max_torch="1.12.0", standalone=True) -def test_init_meta_context(): - - with init_meta_context(): - m = nn.Linear(in_features=1, out_features=1) - assert isinstance(m, nn.Linear) - assert m.weight.device.type == "meta" - assert is_on_meta_device(m) - mlp = MLP(4) - assert mlp.layer[0].weight.device.type == "meta" - - mlp = materialize_module(mlp) - assert mlp.layer[0].weight.device.type == "cpu" - - assert not is_on_meta_device(mlp) - assert not is_on_meta_device(nn.Module()) - - model = SimpleBoringModel(4) - assert model.layer[0].weight.device.type == "meta" - materialize_module(model) - assert model.layer[0].weight.device.type == "cpu" - - mlp = MLP(4) - assert mlp.layer[0].weight.device.type == "cpu" - # no-op as already materialized. - materialize_module(mlp) - assert mlp.layer[0].weight.device.type == "cpu" - - m = nn.Linear(in_features=1, out_features=1) - assert m.weight.device.type == "cpu" - - with init_meta_context(): - m = nn.Linear(in_features=1, out_features=1) - assert m.weight.device.type == "meta" - - m = nn.Linear(in_features=1, out_features=1) - assert m.weight.device.type == "cpu" - - -@RunIf(min_torch="1.10.0", max_torch="1.12.0", standalone=True) -def test_materialize_module_recursive_child(): - """Test materialize_module doesn't set a child recursively to a model instantiated within init_meta_context.""" - with init_meta_context(): - model = BoringModel() - - materialize_module(model) - - with pytest.raises(AttributeError, match="'Linear' object has no attribute 'layer'"): - model.layer.layer diff --git a/tests/tests_pytorch/utilities/test_torchdistx.py b/tests/tests_pytorch/utilities/test_torchdistx.py new file mode 100644 index 0000000000000..aa3f8e34bfaac --- /dev/null +++ b/tests/tests_pytorch/utilities/test_torchdistx.py @@ -0,0 +1,92 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import pytest +from torch import nn + +from pytorch_lightning import Trainer +from pytorch_lightning.core.module import LightningModule +from pytorch_lightning.demos.boring_classes import BoringModel +from pytorch_lightning.utilities.imports import _RequirementAvailable +from pytorch_lightning.utilities.meta import _is_deferred +from tests_pytorch.helpers.runif import RunIf + +_TORCHDISTX_AVAILABLE = _RequirementAvailable("torchdistx") + + +class SimpleBoringModel(LightningModule): + def __init__(self, num_layers): + super().__init__() + self.layer = nn.Sequential(*[nn.Linear(1, 1) for _ in range(num_layers)]) + + +@pytest.mark.skipif(not _TORCHDISTX_AVAILABLE, reason=_TORCHDISTX_AVAILABLE.message) +def test_deferred_init_with_lightning_module(): + from torchdistx.deferred_init import deferred_init, materialize_module + from torchdistx.fake import is_fake + + model = deferred_init(SimpleBoringModel, 4) + weight = model.layer[0].weight + assert weight.device.type == "cpu" + assert is_fake(weight) + assert _is_deferred(model) + + materialize_module(model) + materialize_module(model) # make sure it's idempotent + assert not _is_deferred(model) + weight = model.layer[0].weight + assert weight.device.type == "cpu" + assert not is_fake(weight) + + +@pytest.mark.skipif(not _TORCHDISTX_AVAILABLE, reason=_TORCHDISTX_AVAILABLE.message) +@pytest.mark.parametrize( + "trainer_kwargs", + ( + {"accelerator": "auto", "devices": 1}, + pytest.param( + {"strategy": "deepspeed_stage_3", "accelerator": "gpu", "devices": 2, "precision": 16}, + marks=RunIf(min_cuda_gpus=2, deepspeed=True), + ), + ), +) +def test_deferred_init_with_trainer(tmpdir, trainer_kwargs): + from torchdistx.deferred_init import deferred_init + + model = deferred_init(BoringModel) + trainer = Trainer( + default_root_dir=tmpdir, + fast_dev_run=True, + enable_progress_bar=False, + enable_model_summary=False, + **trainer_kwargs + ) + trainer.fit(model) + + +@pytest.mark.skipif(not _TORCHDISTX_AVAILABLE, reason=_TORCHDISTX_AVAILABLE.message) +def test_deferred_init_ddp_spawn(tmpdir): + from torchdistx.deferred_init import deferred_init + + model = deferred_init(BoringModel) + trainer = Trainer( + default_root_dir=tmpdir, + fast_dev_run=True, + enable_progress_bar=False, + enable_model_summary=False, + accelerator="auto", + devices="1", + strategy="ddp_spawn", + ) + with pytest.raises(NotImplementedError, match="DDPSpawnStrategy` strategy does not support.*torchdistx"): + trainer.fit(model) From 89b72f74797306710220183d14d3acfd98869180 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Fri, 5 Aug 2022 19:51:30 +0200 Subject: [PATCH 101/230] relax `redis` requirement (#14008) --- requirements/app/cloud.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/app/cloud.txt b/requirements/app/cloud.txt index 8dac80ef23432..5f8bf0c48692f 100644 --- a/requirements/app/cloud.txt +++ b/requirements/app/cloud.txt @@ -1,5 +1,5 @@ starsessions -redis==4.1.0 +redis>=4.0.0, <=4.2.4 docker==5.0.3 setuptools==59.5.0 s3fs==2022.1.0 From 9d02ad761c6c9360cba96e339d340c0cc2e572a8 Mon Sep 17 00:00:00 2001 From: Yurij Mikhalevich 
Date: Fri, 5 Aug 2022 22:34:27 +0400 Subject: [PATCH 102/230] feature(ui): Lightning AI doc theme update, integrates global header and footer with docs (#14053) --- requirements/app/docs.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/app/docs.txt b/requirements/app/docs.txt index 6d08960851c68..b35cc585b40c7 100644 --- a/requirements/app/docs.txt +++ b/requirements/app/docs.txt @@ -8,7 +8,7 @@ docutils>=0.16 sphinxcontrib-fulltoc>=1.0 sphinxcontrib-mockautodoc -https://storage.googleapis.com/grid-packages/lightning-ai-sphinx-theme/build-31-rc1.zip +https://storage.googleapis.com/grid-packages/lightning-ai-sphinx-theme/build-31.3.zip sphinx-autodoc-typehints>=1.0,<1.15 # v1.15 failing on master (#11405) sphinx-paramlinks>=0.5.1 sphinx-togglebutton>=0.2 From 26d69ceada7f4ad1632e70df6414348170e85574 Mon Sep 17 00:00:00 2001 From: Raphael Randschau Date: Fri, 5 Aug 2022 13:42:00 -0700 Subject: [PATCH 103/230] [CLI] add support for listing apps (#13987) * add support for listing apps * update changelog with correct PR number * add tests for pagination * fix wrong mock on test_cli * ensure all enum values are accounted for * make AppManager and AppList protected, add limit to pagination calls * add restarting transition /w tests * add state transition not yet run with tests --- src/lightning_app/CHANGELOG.md | 1 + src/lightning_app/cli/cmd_apps.py | 106 +++++++++++++++++++ src/lightning_app/cli/lightning_cli_list.py | 16 +++ src/lightning_app/utilities/cloud.py | 6 +- tests/tests_app/cli/test_cli.py | 15 ++- tests/tests_app/cli/test_cmd_apps.py | 111 ++++++++++++++++++++ 6 files changed, 249 insertions(+), 6 deletions(-) create mode 100644 src/lightning_app/cli/cmd_apps.py create mode 100644 tests/tests_app/cli/test_cmd_apps.py diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 692a1e62cec50..07927a1b01f87 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Add support for `Lightning App Commands` through the `configure_commands` hook on the Lightning Flow and the `ClientCommand` ([#13602](https://github.com/Lightning-AI/lightning/pull/13602)) - Add support for Lightning AI BYOC cluster management ([#13835](https://github.com/Lightning-AI/lightning/pull/13835)) - Add support to run Lightning apps on Lightning AI BYOC clusters ([#13894](https://github.com/Lightning-AI/lightning/pull/13894)) +- Add support for listing Lightning AI apps ([#13987](https://github.com/Lightning-AI/lightning/pull/13987)) - Adds `LightningTrainingComponent`. 
`LightningTrainingComponent` orchestrates multi-node training in the cloud ([#13830](https://github.com/Lightning-AI/lightning/pull/13830)) ### Changed diff --git a/src/lightning_app/cli/cmd_apps.py b/src/lightning_app/cli/cmd_apps.py new file mode 100644 index 0000000000000..b413a9effbc96 --- /dev/null +++ b/src/lightning_app/cli/cmd_apps.py @@ -0,0 +1,106 @@ +import json +from datetime import datetime + +from lightning_cloud.openapi import ( + Externalv1LightningappInstance, + V1LightningappInstanceState, + V1LightningappInstanceStatus, +) +from rich.console import Console +from rich.table import Table +from rich.text import Text + +from lightning_app.cli.core import Formatable +from lightning_app.utilities.cloud import _get_project +from lightning_app.utilities.network import LightningClient + + +class _AppManager: + """_AppManager implements API calls specific to Lightning AI BYOC apps.""" + + def __init__(self): + self.api_client = LightningClient() + + def list(self, cluster_id: str = None, limit: int = 100): + project = _get_project(self.api_client) + + args = { + "project_id": project.project_id, + "limit": limit, + } + if cluster_id is not None: + args["cluster_id"] = cluster_id + + resp = self.api_client.lightningapp_instance_service_list_lightningapp_instances(**args) + apps = resp.lightningapps + while resp.next_page_token is not None and resp.next_page_token != "": + args["page_token"] = resp.next_page_token + resp = self.api_client.lightningapp_instance_service_list_lightningapp_instances(**args) + apps = apps + resp.lightningapps + console = Console() + console.print(_AppList(resp.lightningapps).as_table()) + + +class _AppList(Formatable): + def __init__(self, apps: [Externalv1LightningappInstance]): + self.apps = apps + + @staticmethod + def _textualize_state_transitions( + desired_state: V1LightningappInstanceState, current_state: V1LightningappInstanceStatus + ): + phases = { + V1LightningappInstanceState.IMAGE_BUILDING: Text("building image", style="bold yellow"), + V1LightningappInstanceState.PENDING: Text("pending", style="bold yellow"), + V1LightningappInstanceState.RUNNING: Text("running", style="bold green"), + V1LightningappInstanceState.FAILED: Text("failed", style="bold red"), + V1LightningappInstanceState.STOPPED: Text("stopped"), + V1LightningappInstanceState.NOT_STARTED: Text("not started"), + V1LightningappInstanceState.DELETED: Text("deleted", style="bold red"), + V1LightningappInstanceState.UNSPECIFIED: Text("unspecified", style="bold red"), + } + + if current_state.phase == V1LightningappInstanceState.UNSPECIFIED and current_state.start_timestamp is None: + return Text("not yet started", style="bold yellow") + + if ( + desired_state == V1LightningappInstanceState.DELETED + and current_state.phase != V1LightningappInstanceState.DELETED + ): + return Text("terminating", style="bold red") + + if ( + any( + phase == current_state.phase + for phase in [V1LightningappInstanceState.PENDING, V1LightningappInstanceState.STOPPED] + ) + and desired_state == V1LightningappInstanceState.RUNNING + ): + return Text("restarting", style="bold yellow") + + return phases[current_state.phase] + + def as_json(self) -> str: + return json.dumps(self.apps) + + def as_table(self) -> Table: + table = Table("id", "name", "status", "cluster", "created", show_header=True, header_style="bold green") + + for app in self.apps: + app: Externalv1LightningappInstance + status = self._textualize_state_transitions(desired_state=app.spec.desired_state, current_state=app.status) + + # this 
guard is necessary only until 0.3.93 releases which includes the `created_at` + # field to the external API + created_at = datetime.now() + if hasattr(app, "created_at"): + created_at = app.created_at + + table.add_row( + app.id, + app.name, + status, + app.spec.cluster_id, + created_at.strftime("%Y-%m-%d") if created_at else "", + ) + return table diff --git a/src/lightning_app/cli/lightning_cli_list.py b/src/lightning_app/cli/lightning_cli_list.py index 31f46537e8c5f..d0d1d34a6dd4d 100644 --- a/src/lightning_app/cli/lightning_cli_list.py +++ b/src/lightning_app/cli/lightning_cli_list.py @@ -1,5 +1,6 @@ import click +from lightning_app.cli.cmd_apps import _AppManager from lightning_app.cli.cmd_clusters import AWSClusterManager @@ -14,3 +15,18 @@ def list_clusters(**kwargs): """List your Lightning AI BYOC compute clusters.""" cluster_manager = AWSClusterManager() cluster_manager.list() + + +@click.option( + "--cluster-id", + "cluster_id", + type=str, + required=False, + default=None, + help="Filter apps by associated Lightning AI compute cluster", +) +@get_list.command("apps") +def list_apps(cluster_id: str, **kwargs): + """List your Lightning AI apps.""" + app_manager = _AppManager() + app_manager.list(cluster_id=cluster_id) diff --git a/src/lightning_app/utilities/cloud.py b/src/lightning_app/utilities/cloud.py index 1706cdac94477..b320979a62028 100644 --- a/src/lightning_app/utilities/cloud.py +++ b/src/lightning_app/utilities/cloud.py @@ -8,12 +8,12 @@ from lightning_app.utilities.network import LightningClient -def _get_project(client: LightningClient) -> V1Membership: +def _get_project(client: LightningClient, project_id: str = LIGHTNING_CLOUD_PROJECT_ID) -> V1Membership: """Get a project membership for the user from the backend.""" projects = client.projects_service_list_memberships() - if LIGHTNING_CLOUD_PROJECT_ID is not None: + if project_id is not None: for membership in projects.memberships: - if membership.project_id == LIGHTNING_CLOUD_PROJECT_ID: + if membership.project_id == project_id: break else: raise ValueError( diff --git a/tests/tests_app/cli/test_cli.py b/tests/tests_app/cli/test_cli.py index 3e003293692a8..16e641ac38f23 100644 --- a/tests/tests_app/cli/test_cli.py +++ b/tests/tests_app/cli/test_cli.py @@ -9,7 +9,7 @@ from lightning_app.cli.lightning_cli import _main, get_app_url, login, logout, run from lightning_app.cli.lightning_cli_create import create, create_cluster from lightning_app.cli.lightning_cli_delete import delete, delete_cluster -from lightning_app.cli.lightning_cli_list import get_list, list_clusters +from lightning_app.cli.lightning_cli_list import get_list, list_apps, list_clusters from lightning_app.runners.runtime_type import RuntimeType @@ -99,13 +99,22 @@ def test_create_cluster(create: mock.MagicMock): ) +@mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) +@mock.patch("lightning_app.cli.cmd_apps._AppManager.list") +def test_list_apps(list_command: mock.MagicMock): + runner = CliRunner() + runner.invoke(list_apps) + + list_command.assert_called_once_with(cluster_id=None) + + @mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) @mock.patch("lightning_app.cli.cmd_clusters.AWSClusterManager.list") -def test_list_clusters(list: mock.MagicMock): +def test_list_clusters(list_command: mock.MagicMock): runner = CliRunner() runner.invoke(list_clusters) - list.assert_called_once_with() + list_command.assert_called_once_with() @mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) diff --git 
a/tests/tests_app/cli/test_cmd_apps.py b/tests/tests_app/cli/test_cmd_apps.py new file mode 100644 index 0000000000000..7d89dbc75b0a5 --- /dev/null +++ b/tests/tests_app/cli/test_cmd_apps.py @@ -0,0 +1,111 @@ +from unittest import mock +from unittest.mock import MagicMock + +import pytest as pytest +from lightning_cloud.openapi import ( + Externalv1LightningappInstance, + V1LightningappInstanceSpec, + V1LightningappInstanceState, + V1LightningappInstanceStatus, + V1ListLightningappInstancesResponse, + V1ListMembershipsResponse, + V1Membership, +) +from rich.text import Text + +from lightning_app.cli.cmd_apps import _AppList, _AppManager + + +@pytest.mark.parametrize( + "current_state,desired_state,expected", + [ + ( + V1LightningappInstanceStatus(phase=V1LightningappInstanceState.RUNNING), + V1LightningappInstanceState.DELETED, + Text("terminating"), + ), + ( + V1LightningappInstanceStatus(phase=V1LightningappInstanceState.STOPPED), + V1LightningappInstanceState.RUNNING, + Text("restarting"), + ), + ( + V1LightningappInstanceStatus(phase=V1LightningappInstanceState.PENDING), + V1LightningappInstanceState.RUNNING, + Text("restarting"), + ), + ( + V1LightningappInstanceStatus(phase=V1LightningappInstanceState.UNSPECIFIED, start_timestamp=None), + V1LightningappInstanceState.RUNNING, + Text("not yet started"), + ), + ], +) +def test_state_transitions(current_state, desired_state, expected): + actual = _AppList._textualize_state_transitions(current_state=current_state, desired_state=desired_state) + assert actual == expected + + +@mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) +@mock.patch("lightning_app.utilities.network.LightningClient.lightningapp_instance_service_list_lightningapp_instances") +@mock.patch("lightning_app.utilities.network.LightningClient.projects_service_list_memberships") +def test_list_all_apps_paginated(list_memberships: mock.MagicMock, list_instances: mock.MagicMock): + list_memberships.return_value = V1ListMembershipsResponse(memberships=[V1Membership(project_id="default-project")]) + list_instances.side_effect = [ + V1ListLightningappInstancesResponse( + lightningapps=[ + Externalv1LightningappInstance( + name="test1", + spec=V1LightningappInstanceSpec(desired_state=V1LightningappInstanceState.RUNNING), + status=V1LightningappInstanceStatus(phase=V1LightningappInstanceState.RUNNING), + ) + ], + next_page_token="page-2", + ), + V1ListLightningappInstancesResponse( + lightningapps=[ + Externalv1LightningappInstance( + name="test2", + spec=V1LightningappInstanceSpec(desired_state=V1LightningappInstanceState.STOPPED), + status=V1LightningappInstanceStatus(phase=V1LightningappInstanceState.RUNNING), + ) + ], + ), + ] + + cluster_manager = _AppManager() + cluster_manager.list() + + list_memberships.assert_called_once() + assert list_instances.mock_calls == [ + mock.call(project_id="default-project", limit=100), + mock.call(project_id="default-project", page_token="page-2", limit=100), + ] + + +@mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) +@mock.patch("lightning_app.utilities.network.LightningClient.lightningapp_instance_service_list_lightningapp_instances") +@mock.patch("lightning_app.utilities.network.LightningClient.projects_service_list_memberships") +def test_list_all_apps(list_memberships: mock.MagicMock, list_instances: mock.MagicMock): + list_memberships.return_value = V1ListMembershipsResponse(memberships=[V1Membership(project_id="default-project")]) + list_instances.return_value = 
V1ListLightningappInstancesResponse(lightningapps=[]) + + cluster_manager = _AppManager() + cluster_manager.list() + + list_memberships.assert_called_once() + list_instances.assert_called_once_with(project_id="default-project", limit=100) + + +@mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) +@mock.patch("lightning_app.utilities.network.LightningClient.lightningapp_instance_service_list_lightningapp_instances") +@mock.patch("lightning_app.utilities.network.LightningClient.projects_service_list_memberships") +def test_list_apps_on_cluster(list_memberships: mock.MagicMock, list_instances: mock.MagicMock): + list_memberships.return_value = V1ListMembershipsResponse(memberships=[V1Membership(project_id="default-project")]) + list_instances.return_value = V1ListLightningappInstancesResponse(lightningapps=[]) + + cluster_manager = _AppManager() + cluster_manager.list(cluster_id="12345") + + list_memberships.assert_called_once() + list_instances.assert_called_once_with(project_id="default-project", cluster_id="12345", limit=100) From b25275ccc27652b91d85d49b7bc220b37c921b54 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Sat, 6 Aug 2022 03:45:15 +0530 Subject: [PATCH 104/230] Cast to fp16 before moving to device with deepspeed (#14000) --- src/pytorch_lightning/CHANGELOG.md | 6 +++++ src/pytorch_lightning/strategies/deepspeed.py | 21 +++++++++++---- src/pytorch_lightning/strategies/ipu.py | 14 +++++----- src/pytorch_lightning/strategies/utils.py | 14 ++++++++++ src/pytorch_lightning/utilities/deepspeed.py | 2 +- .../deprecated_api/test_remove_1-8.py | 4 +-- .../deprecated_api/test_remove_1-9.py | 6 +++++ .../strategies/test_deepspeed_strategy.py | 27 ++++++++++++++++--- 8 files changed, 74 insertions(+), 20 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 1ae21230c256f..5d77a3ad293b9 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -24,6 +24,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Deprecated +- Deprecated `LightningDeepSpeedModule` ([#14000](https://github.com/Lightning-AI/lightning/pull/14000)) + + - Deprecated `amp_level` from `Trainer` in favour of passing it explictly via precision plugin ([#13898](https://github.com/Lightning-AI/lightning/pull/13898)) @@ -46,6 +49,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
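As an aside, the `_AppManager.list` method introduced above uses a standard token-based pagination loop: fetch a page, accumulate its results, and keep sending `next_page_token` back until the backend returns an empty token. A minimal, self-contained sketch of that pattern follows; the stub client, its page contents, and the `list_instances`/`list_all` names are illustrative only and are not part of the Lightning API.

```python
from dataclasses import dataclass
from typing import List


@dataclass
class _Page:
    items: List[str]
    next_page_token: str = ""


class _StubClient:
    # Serves two pages so the loop below has something to paginate over (illustrative data).
    def __init__(self) -> None:
        self._pages = {"": _Page(["app-1"], "page-2"), "page-2": _Page(["app-2"], "")}

    def list_instances(self, page_token: str = "", limit: int = 100) -> _Page:
        return self._pages[page_token]


def list_all(client: _StubClient, limit: int = 100) -> List[str]:
    resp = client.list_instances(limit=limit)
    items = list(resp.items)
    while resp.next_page_token:  # an empty token means the last page was reached
        resp = client.list_instances(page_token=resp.next_page_token, limit=limit)
        items += resp.items
    return items


assert list_all(_StubClient()) == ["app-1", "app-2"]
```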
- Casted only floating point tensors to fp16 with IPUs ([#13983](https://github.com/Lightning-AI/lightning/pull/13983)) +- Casted tensors to fp16 before moving them to device with `DeepSpeedStrategy` ([#14000](https://github.com/Lightning-AI/lightning/pull/14000)) + + - Fixed the `NeptuneLogger` dependency being unrecognized ([#13988](https://github.com/Lightning-AI/lightning/pull/13988)) diff --git a/src/pytorch_lightning/strategies/deepspeed.py b/src/pytorch_lightning/strategies/deepspeed.py index 2d32503dd406a..b0b55374ba1a9 100644 --- a/src/pytorch_lightning/strategies/deepspeed.py +++ b/src/pytorch_lightning/strategies/deepspeed.py @@ -33,6 +33,7 @@ from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.ddp import DDPStrategy +from pytorch_lightning.strategies.utils import _fp_to_half from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities import GradClipAlgorithmType from pytorch_lightning.utilities.apply_func import apply_to_collection @@ -46,10 +47,10 @@ from pytorch_lightning.utilities.imports import _RequirementAvailable from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.optimizer import optimizers_to_device -from pytorch_lightning.utilities.rank_zero import rank_zero_info +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_info, rank_zero_warn from pytorch_lightning.utilities.seed import reset_seed from pytorch_lightning.utilities.types import _LRScheduler, _PATH, LRSchedulerConfig, ReduceLROnPlateau, STEP_OUTPUT -from pytorch_lightning.utilities.warnings import rank_zero_warn, WarningCache +from pytorch_lightning.utilities.warnings import WarningCache warning_cache = WarningCache() @@ -70,9 +71,15 @@ def remove_module_hooks(model: torch.nn.Module) -> None: class LightningDeepSpeedModule(_LightningModuleWrapperBase): + """ + .. deprecated:: v1.7.1 + ``LightningDeepSpeedModule`` has been deprecated in v1.7.1 and will be removed in v1.9.0. 
+ """ + def __init__( self, pl_module: Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase], precision: Union[str, int] ) -> None: + rank_zero_deprecation("`LightningDeepSpeedModule` has been deprecated in v1.7.1 and will be removed in v1.9.0") super().__init__(pl_module) self.precision = precision @@ -478,7 +485,7 @@ def init_deepspeed(self) -> None: ) assert isinstance(self.model, (pl.LightningModule, _LightningPrecisionModuleWrapperBase)) - model = LightningDeepSpeedModule(pl_module=self.model, precision=self.precision_plugin.precision) + model = _LightningModuleWrapperBase(pl_module=self.model) if self.lightning_module.trainer and self.lightning_module.trainer.training: self._initialize_deepspeed_train(model) @@ -606,9 +613,9 @@ def _initialize_deepspeed_inference(self, model: Module) -> None: @property def lightning_module(self) -> Optional["pl.LightningModule"]: - # the model may not be wrapped with DeepEngine & LightningDeepSpeedModule if calling this too early + # the model may not be wrapped with DeepEngine & _LightningModuleWrapperBase if calling this too early module = getattr(self.model, "module", self.model) - module = module.module if isinstance(module, LightningDeepSpeedModule) else module + module = module.module if isinstance(module, _LightningModuleWrapperBase) else module assert isinstance(module, pl.LightningModule) or module is None return module @@ -944,6 +951,10 @@ def register_strategies(cls, strategy_registry: Dict) -> None: offload_optimizer_device="nvme", ) + def batch_to_device(self, batch: Any, device: Optional[torch.device] = None, dataloader_idx: int = 0) -> Any: + batch = apply_to_collection(batch, Tensor, function=_fp_to_half, precision=self.precision_plugin.precision) + return super().batch_to_device(batch, device, dataloader_idx) + def validation_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: assert self.model is not None with self.precision_plugin.val_step_context(): diff --git a/src/pytorch_lightning/strategies/ipu.py b/src/pytorch_lightning/strategies/ipu.py index 67a9e14e32d56..c40addd4244b2 100644 --- a/src/pytorch_lightning/strategies/ipu.py +++ b/src/pytorch_lightning/strategies/ipu.py @@ -26,6 +26,7 @@ from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.parallel import ParallelStrategy from pytorch_lightning.strategies.strategy import TBroadcast +from pytorch_lightning.strategies.utils import _fp_to_half from pytorch_lightning.trainer.states import RunningStage, TrainerFn from pytorch_lightning.utilities import _IPU_AVAILABLE, _POPTORCH_AVAILABLE, rank_zero_warn from pytorch_lightning.utilities.apply_func import apply_to_collection @@ -44,6 +45,11 @@ class LightningIPUModule(_LightningModuleWrapperBase): + """ + .. deprecated:: v1.7.0 + ``LightningIPUModule`` has been deprecated in v1.7.0 and will be removed in v1.9.0. + """ + def __init__( self, pl_module: Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase], precision: Union[str, int] ) -> None: @@ -281,13 +287,7 @@ def to_tensor(x: Any) -> Tensor: def batch_to_device(self, batch: Any, device: Optional[torch.device] = None, dataloader_idx: int = 0) -> Any: # This override is necessary because the cast must occur before the data # is moved to the device to prevent wasteful host->device copies. 
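For context, the cast-before-transfer logic referenced in the comment above amounts to mapping a small casting function over every floating-point tensor in the batch while it still lives on the host, leaving integer and boolean tensors untouched. A rough standalone sketch of that behaviour follows; plain recursion stands in for Lightning's `apply_to_collection`, and the unconditional 16-bit cast replaces the precision check, both of which are simplifications.

```python
import torch


def _to_half_if_float(t: torch.Tensor) -> torch.Tensor:
    # Only floating-point tensors are cast; int/bool tensors (e.g. labels, masks) are left alone.
    return t.half() if torch.is_floating_point(t) else t


def cast_batch(batch):
    # Simplified stand-in for apply_to_collection: walk dicts/lists/tuples and cast the tensors.
    if isinstance(batch, torch.Tensor):
        return _to_half_if_float(batch)
    if isinstance(batch, dict):
        return {k: cast_batch(v) for k, v in batch.items()}
    if isinstance(batch, (list, tuple)):
        return type(batch)(cast_batch(v) for v in batch)
    return batch


batch = {"x": torch.zeros(2, 3), "y": torch.tensor([1, 2])}
casted = cast_batch(batch)
assert casted["x"].dtype is torch.float16 and casted["y"].dtype is torch.int64
```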
- def fp_to_half(tensor: Tensor) -> Tensor: - if torch.is_floating_point(tensor): - return tensor.half() - return tensor - - if self.precision_plugin.precision in (PrecisionType.MIXED, PrecisionType.HALF): - batch = apply_to_collection(batch, Tensor, function=fp_to_half) + batch = apply_to_collection(batch, Tensor, function=_fp_to_half, precision=self.precision_plugin.precision) # We don't call `super().batch_to_device` because `data.to(device)` is not # currently necessary for IPUs. The movement of data from host<->IPU is # currently handled by PopTorch. diff --git a/src/pytorch_lightning/strategies/utils.py b/src/pytorch_lightning/strategies/utils.py index eddb9077116dc..b71458bfc30d3 100644 --- a/src/pytorch_lightning/strategies/utils.py +++ b/src/pytorch_lightning/strategies/utils.py @@ -13,6 +13,20 @@ # limitations under the License. import os +import torch + +from pytorch_lightning.utilities.enums import PrecisionType + def on_colab_kaggle() -> bool: return bool(os.getenv("COLAB_GPU") or os.getenv("KAGGLE_URL_BASE")) + + +def _fp_to_half(tensor: torch.Tensor, precision: PrecisionType) -> torch.Tensor: + if torch.is_floating_point(tensor): + if precision in (PrecisionType.MIXED, PrecisionType.HALF): + return tensor.half() + if precision == PrecisionType.BFLOAT: + return tensor.bfloat16() + + return tensor diff --git a/src/pytorch_lightning/utilities/deepspeed.py b/src/pytorch_lightning/utilities/deepspeed.py index f1c83176ccccf..cfa4e6a2f4d2b 100644 --- a/src/pytorch_lightning/utilities/deepspeed.py +++ b/src/pytorch_lightning/utilities/deepspeed.py @@ -98,7 +98,7 @@ def convert_zero_checkpoint_to_fp32_state_dict( model_file = get_model_state_file(checkpoint_dir, zero_stage) client_state = torch.load(model_file, map_location=CPU_DEVICE) client_state = {key: value for key, value in client_state.items() if key not in deepspeed_states} - # State dict keys will include reference to wrapper LightningDeepSpeedModule + # State dict keys will include reference to wrapper _LightningModuleWrapperBase # Delete `module` prefix before saving. 
state_dict = {k.partition("module.")[2]: state_dict[k] for k in state_dict.keys()} client_state["state_dict"] = state_dict diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index a371b2b3d04bf..aa6c1a615f9d2 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -948,9 +948,7 @@ def test_trainer_config_ipus(monkeypatch, trainer_kwargs, expected_ipus): trainer.ipus == expected_ipus -@mock.patch("pytorch_lightning.accelerators.ipu.IPUAccelerator.is_available", return_value=True) -def test_v1_8_0_deprecated_lightning_ipu_module(_, monkeypatch): - monkeypatch.setattr(pytorch_lightning.strategies.ipu, "_IPU_AVAILABLE", True) +def test_v1_8_0_deprecated_lightning_ipu_module(): with pytest.deprecated_call(match=r"has been deprecated in v1.7.0 and will be removed in v1.8."): _ = LightningIPUModule(BoringModel(), 32) diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-9.py b/tests/tests_pytorch/deprecated_api/test_remove_1-9.py index dcd8ecfd0169c..baccbebb658bc 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-9.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-9.py @@ -30,6 +30,7 @@ from pytorch_lightning.profiler.pytorch import PyTorchProfiler, RegisterRecordFunction, ScheduleWrapper from pytorch_lightning.profiler.simple import SimpleProfiler from pytorch_lightning.profiler.xla import XLAProfiler +from pytorch_lightning.strategies.deepspeed import LightningDeepSpeedModule from pytorch_lightning.utilities.imports import _KINETO_AVAILABLE from pytorch_lightning.utilities.rank_zero import rank_zero_only from tests_pytorch.helpers.runif import RunIf @@ -219,6 +220,11 @@ def test_gpu_accelerator_deprecation_warning(): GPUAccelerator() +def test_v1_9_0_deprecated_lightning_deepspeed_module(): + with pytest.deprecated_call(match=r"has been deprecated in v1.7.1 and will be removed in v1.9."): + _ = LightningDeepSpeedModule(BoringModel(), 32) + + def test_meta_utility_deprecations(): import pytorch_lightning.utilities.meta as meta diff --git a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py index 14f7ab1e79b08..4f2cc14b6c62d 100644 --- a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py +++ b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py @@ -84,11 +84,12 @@ def automatic_optimization(self) -> bool: return False -def test_deepspeed_lightning_module(tmpdir): +def test_deepspeed_lightning_module(): """Test to ensure that a model wrapped in `LightningDeepSpeedModule` moves types and device correctly.""" model = BoringModel() - module = LightningDeepSpeedModule(model, precision=16) + with pytest.deprecated_call(match="`LightningDeepSpeedModule` has been deprecated in v1.7.1"): + module = LightningDeepSpeedModule(model, precision=16) module.half() assert module.dtype == torch.half @@ -100,12 +101,13 @@ def test_deepspeed_lightning_module(tmpdir): @RunIf(min_cuda_gpus=1) -def test_deepspeed_lightning_module_precision(tmpdir): +def test_deepspeed_lightning_module_precision(): """Test to ensure that a model wrapped in `LightningDeepSpeedModule` moves tensors to half when precision 16.""" model = BoringModel() - module = LightningDeepSpeedModule(model, precision=16) + with pytest.deprecated_call(match="`LightningDeepSpeedModule` has been deprecated in v1.7.1"): + module = LightningDeepSpeedModule(model, precision=16) module.cuda().half() assert 
module.dtype == torch.half @@ -1286,6 +1288,7 @@ def test_deepspeed_with_bfloat16_precision(tmpdir): assert isinstance(trainer.strategy.precision_plugin, DeepSpeedPrecisionPlugin) assert trainer.strategy.precision_plugin.precision == "bf16" assert trainer.strategy.config["zero_optimization"]["stage"] == 3 + assert trainer.strategy.config["bf16"]["enabled"] assert model.layer.weight.dtype == torch.bfloat16 @@ -1324,3 +1327,19 @@ def configure_optimizers(self): ) with pytest.raises(SystemExit): trainer.fit(model) + + +@RunIf(min_cuda_gpus=1, deepspeed=True) +def test_deepspeed_tensors_cast_to_fp16_before_hosted_on_device(): + class CustomBoringModel(BoringModel): + def transfer_batch_to_device(self, batch, *args, **kwargs): + assert batch.dtype is torch.float16 + return super().transfer_batch_to_device(batch, *args, **kwargs) + + model = CustomBoringModel() + trainer = Trainer(strategy="deepspeed", devices=1, accelerator="cuda", precision=16) + trainer.strategy.connect(model) + batch = torch.zeros((1), dtype=torch.float32) + batch = trainer.strategy.batch_to_device(batch) + assert batch.is_cuda + assert batch.dtype is torch.float16 From 5c05719f27b160a7f6db9345542182986d4feb27 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 8 Aug 2022 08:15:54 +0200 Subject: [PATCH 105/230] Freeze requirements for CI (#14007) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * free requirements * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * typo * typo * ui * mypy * todo * mypy Co-authored-by: Carlos Mocholí * mypy Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Carlos Mocholí Co-authored-by: Akihiro Nitta --- .github/workflows/code-checks.yml | 1 + requirements/app/base.txt | 7 +++---- requirements/app/cloud.txt | 3 +-- requirements/app/docs.txt | 17 ++++++++--------- requirements/app/test.txt | 15 +++++---------- requirements/app/ui.txt | 2 +- requirements/pytorch/docs.txt | 16 ++++++++-------- requirements/pytorch/test.txt | 27 +++++++++++++-------------- 8 files changed, 40 insertions(+), 48 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index ed9cd46adbe44..7b5f3f26602e8 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -34,6 +34,7 @@ jobs: run: | pip install torch==1.11 --find-links https://download.pytorch.org/whl/cpu/torch_stable.html python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt + # todo: adjust requirements for both code-bases pip install -r requirements/pytorch/devel.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html pip list diff --git a/requirements/app/base.txt b/requirements/app/base.txt index 0a0b9cdb4719d..02eeb04bfa218 100644 --- a/requirements/app/base.txt +++ b/requirements/app/base.txt @@ -1,9 +1,8 @@ -py lightning-cloud==0.5.0 packaging -deepdiff>=5.7.0 +deepdiff>=5.7.0, <=5.8.1 starsessions -fsspec>=2022.01.0 -s3fs>=2022.1.0 +fsspec>=2022.01.0, <=2022.7.1 +s3fs>=2022.1.0, <=2022.7.1 croniter # for now until we found something more robust. 
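As a side note, the deprecation tests in this patch follow the usual pairing of a warning emitted from `__init__` with `pytest.deprecated_call` in the test. A minimal sketch of that pattern outside of Lightning; the `OldWrapper` class and its message are made up for illustration.

```python
import warnings

import pytest


class OldWrapper:
    def __init__(self, module):
        warnings.warn(
            "`OldWrapper` has been deprecated and will be removed in a future release.",
            DeprecationWarning,
        )
        self.module = module


def test_old_wrapper_warns():
    # The test fails if no DeprecationWarning matching the pattern is emitted.
    with pytest.deprecated_call(match="has been deprecated"):
        OldWrapper(object())
```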
traitlets<5.2.0 # Traitlets 5.2.X fails: https://github.com/ipython/traitlets/issues/741 diff --git a/requirements/app/cloud.txt b/requirements/app/cloud.txt index 5f8bf0c48692f..ff18d47b44565 100644 --- a/requirements/app/cloud.txt +++ b/requirements/app/cloud.txt @@ -1,5 +1,4 @@ starsessions redis>=4.0.0, <=4.2.4 docker==5.0.3 -setuptools==59.5.0 -s3fs==2022.1.0 +# setuptools==59.5.0 diff --git a/requirements/app/docs.txt b/requirements/app/docs.txt index b35cc585b40c7..bf22aef2c2d92 100644 --- a/requirements/app/docs.txt +++ b/requirements/app/docs.txt @@ -1,18 +1,17 @@ sphinx>=4.0,<5.0 -myst-parser>=0.15 -nbsphinx>=0.8.5 +myst-parser>=0.15,<0.17 +nbsphinx>=0.8.5, <=0.8.9 ipython[notebook] ipython_genutils -pandoc>=1.0 -docutils>=0.16 -sphinxcontrib-fulltoc>=1.0 +pandoc>=1.0, <=2.2 +docutils>=0.16, <0.19 +sphinxcontrib-fulltoc>=1.0, <=1.2.0 sphinxcontrib-mockautodoc https://storage.googleapis.com/grid-packages/lightning-ai-sphinx-theme/build-31.3.zip sphinx-autodoc-typehints>=1.0,<1.15 # v1.15 failing on master (#11405) -sphinx-paramlinks>=0.5.1 -sphinx-togglebutton>=0.2 -sphinx-copybutton>=0.3 +sphinx-paramlinks>=0.5.1, <=0.5.4 +sphinx-togglebutton>=0.2, <=0.3.2 +sphinx-copybutton>=0.3, <=0.5.0 sphinx-autobuild -typing-extensions # already in `requirements.txt` but the docs CI job does not install it jinja2>=3.0.0,<3.1.0 diff --git a/requirements/app/test.txt b/requirements/app/test.txt index 9d2ed0af910ca..ab5ef8f1e85ac 100644 --- a/requirements/app/test.txt +++ b/requirements/app/test.txt @@ -1,15 +1,10 @@ -coverage>=5.0 -codecov>=2.1 -pytest>=5.0 -pytest-timeout -pytest-cov +coverage>=6.4, <=6.4.2 +codecov>=2.1, <=2.1.12 +pytest>=7.0, <=7.1.2 +pytest-timeout <=2.1.0 +pytest-cov <=3.0.0 playwright==1.22.0 # pytest-flake8 -flake8>=3.0 -check-manifest -twine>=3.2 -isort>=5.0 -mypy>=0.720 httpx trio pympler diff --git a/requirements/app/ui.txt b/requirements/app/ui.txt index 28df7f9c2ffe0..f0e4b2cdef471 100644 --- a/requirements/app/ui.txt +++ b/requirements/app/ui.txt @@ -1 +1 @@ -streamlit>=1.3.1 +streamlit>=1.3.1, <=1.11.1 diff --git a/requirements/pytorch/docs.txt b/requirements/pytorch/docs.txt index e6fbbe322b6bf..50e7c2049f6f6 100644 --- a/requirements/pytorch/docs.txt +++ b/requirements/pytorch/docs.txt @@ -1,16 +1,16 @@ sphinx>=4.0,<5.0 myst-parser>=0.15,<0.17 -nbsphinx>=0.8.5 +nbsphinx>=0.8.5, <=0.8.9 ipython[notebook] -pandoc>=1.0 -docutils>=0.16 -sphinxcontrib-fulltoc>=1.0 +pandoc>=1.0, <=2.2 +docutils>=0.16, <0.19 +sphinxcontrib-fulltoc>=1.0, <=1.2.0 sphinxcontrib-mockautodoc pt-lightning-sphinx-theme @ https://github.com/Lightning-AI/lightning_sphinx_theme/archive/master.zip -sphinx-autodoc-typehints>=1.11,<1.15 # v1.15 failing on master (#11405) -sphinx-paramlinks>=0.5.1 -sphinx-togglebutton>=0.2 -sphinx-copybutton>=0.3 +sphinx-autodoc-typehints>=1.11,<1.15 # strict; v1.15 failing on master (#11405) +sphinx-paramlinks>=0.5.1, <=0.5.4 +sphinx-togglebutton>=0.2, <=0.3.2 +sphinx-copybutton>=0.3, <=0.5.0 typing-extensions # already in `requirements.txt` but the docs CI job does not install it jinja2>=3.0.0,<3.1.0 diff --git a/requirements/pytorch/test.txt b/requirements/pytorch/test.txt index ce54cd087b1de..c155400a3d35f 100644 --- a/requirements/pytorch/test.txt +++ b/requirements/pytorch/test.txt @@ -1,18 +1,17 @@ -coverage>=6.4 -codecov>=2.1 -pytest>=7.0 -pytest-cov -pytest-forked +coverage>=6.4, <=6.4.2 +codecov>=2.1, <=2.1.12 +pytest>=7.0, <=7.1.2 +pytest-cov <=3.0.0 +pytest-forked <=1.4.0 pytest-rerunfailures>=10.2 -mypy>=0.920 -flake8>=3.9.2 pre-commit>=1.0 
+mypy==0.971 # needed in tests -cloudpickle>=1.3 -scikit-learn>0.22.1 -onnxruntime -psutil # for `DeviceStatsMonitor` -pandas # needed in benchmarks -fastapi -uvicorn +cloudpickle>=1.3, <=2.1.0 +scikit-learn>0.22.1, <=1.1.1 +onnxruntime<=1.12.0 +psutil<=5.9.1 # for `DeviceStatsMonitor` +pandas>1.0, <=1.4.3 # needed in benchmarks +fastapi<=0.79.0 +uvicorn<=0.18.2 From 76836a33cdfa63e2c85c6f4ea9b2a1f174c973e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 8 Aug 2022 10:06:41 +0200 Subject: [PATCH 106/230] Run mypy with PyTorch 1.12 (#14044) --- .github/workflows/code-checks.yml | 2 +- pyproject.toml | 1 - .../plugins/precision/fully_sharded_native_amp.py | 2 +- .../strategies/fully_sharded_native.py | 2 +- .../strategies/launchers/multiprocessing.py | 2 +- src/pytorch_lightning/utilities/cloud_io.py | 11 ++++++----- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/.github/workflows/code-checks.yml b/.github/workflows/code-checks.yml index 7b5f3f26602e8..15bd5e9911740 100644 --- a/.github/workflows/code-checks.yml +++ b/.github/workflows/code-checks.yml @@ -32,7 +32,7 @@ jobs: - name: Install dependencies run: | - pip install torch==1.11 --find-links https://download.pytorch.org/whl/cpu/torch_stable.html + pip install torch==1.12 --find-links https://download.pytorch.org/whl/cpu/torch_stable.html python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt # todo: adjust requirements for both code-bases pip install -r requirements/pytorch/devel.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html diff --git a/pyproject.toml b/pyproject.toml index 5473e73c52e19..9b8400ba27577 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,6 @@ module = [ "pytorch_lightning.callbacks.progress.rich_progress", "pytorch_lightning.callbacks.quantization", "pytorch_lightning.core.datamodule", - "pytorch_lightning.core.decorators", "pytorch_lightning.core.module", "pytorch_lightning.core.saving", "pytorch_lightning.demos.boring_classes", diff --git a/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py b/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py index 8c693f2975bbd..60e53b880c84d 100644 --- a/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py +++ b/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py @@ -23,7 +23,7 @@ if _TORCH_GREATER_EQUAL_1_12: from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision else: - MixedPrecision = None + MixedPrecision = None # type: ignore[misc,assignment] class FullyShardedNativeMixedPrecisionPlugin(ShardedNativeMixedPrecisionPlugin): diff --git a/src/pytorch_lightning/strategies/fully_sharded_native.py b/src/pytorch_lightning/strategies/fully_sharded_native.py index 4c351f26fa3b9..d92931fb5cdb2 100644 --- a/src/pytorch_lightning/strategies/fully_sharded_native.py +++ b/src/pytorch_lightning/strategies/fully_sharded_native.py @@ -51,7 +51,7 @@ ) from torch.distributed.fsdp.wrap import enable_wrap else: - MixedPrecision = None + MixedPrecision = None # type: ignore[misc,assignment] BackwardPrefetch = None # type: ignore[misc,assignment] CPUOffload = None # type: ignore[misc,assignment] diff --git a/src/pytorch_lightning/strategies/launchers/multiprocessing.py b/src/pytorch_lightning/strategies/launchers/multiprocessing.py index 39bba092e9c60..2617e5fe27b10 100644 --- a/src/pytorch_lightning/strategies/launchers/multiprocessing.py +++ b/src/pytorch_lightning/strategies/launchers/multiprocessing.py 
@@ -144,7 +144,7 @@ def _recover_results_in_main_process(self, worker_output: "_WorkerOutput", train # load last weights if worker_output.weights_path is not None: ckpt = self._strategy.checkpoint_io.load_checkpoint(worker_output.weights_path) - trainer.lightning_module.load_state_dict(ckpt) # type: ignore[arg-type] + trainer.lightning_module.load_state_dict(ckpt) self._strategy.checkpoint_io.remove_checkpoint(worker_output.weights_path) trainer.state = worker_output.trainer_state diff --git a/src/pytorch_lightning/utilities/cloud_io.py b/src/pytorch_lightning/utilities/cloud_io.py index 81482a8ab24f9..ee3358be59541 100644 --- a/src/pytorch_lightning/utilities/cloud_io.py +++ b/src/pytorch_lightning/utilities/cloud_io.py @@ -22,14 +22,12 @@ from fsspec.core import url_to_fs from fsspec.implementations.local import AbstractFileSystem -from pytorch_lightning.utilities.types import _PATH +from pytorch_lightning.utilities.types import _DEVICE, _PATH def load( path_or_url: Union[IO, _PATH], - map_location: Optional[ - Union[str, Callable, torch.device, Dict[Union[str, torch.device], Union[str, torch.device]]] - ] = None, + map_location: Optional[Union[_DEVICE, Callable[[_DEVICE], _DEVICE], Dict[_DEVICE, _DEVICE]]] = None, ) -> Any: """Loads a checkpoint. @@ -41,7 +39,10 @@ def load( # any sort of BytesIO or similar return torch.load(path_or_url, map_location=map_location) if str(path_or_url).startswith("http"): - return torch.hub.load_state_dict_from_url(str(path_or_url), map_location=map_location) + return torch.hub.load_state_dict_from_url( + str(path_or_url), + map_location=map_location, # type: ignore[arg-type] # upstream annotation is not correct + ) fs = get_filesystem(path_or_url) with fs.open(path_or_url, "rb") as f: return torch.load(f, map_location=map_location) From aaeff90254aa0a1b91aaed759d15e66123533618 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 8 Aug 2022 10:07:54 +0200 Subject: [PATCH 107/230] Remove deprecated `DistributedType` and `DeviceType` enum classes (#14045) --- src/pytorch_lightning/CHANGELOG.md | 8 +- src/pytorch_lightning/utilities/__init__.py | 1 - src/pytorch_lightning/utilities/enums.py | 95 +------------------ .../deprecated_api/test_remove_1-8.py | 13 --- 4 files changed, 8 insertions(+), 109 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 5d77a3ad293b9..565ef0e8438b5 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -30,7 +30,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Deprecated `amp_level` from `Trainer` in favour of passing it explictly via precision plugin ([#13898](https://github.com/Lightning-AI/lightning/pull/13898)) -- +- Deprecated the calls to `pytorch_lightning.utiltiies.meta` functions in favor of built-in https://github.com/pytorch/torchdistx support ([#13868](https://github.com/Lightning-AI/lightning/pull/13868)) ### Removed @@ -44,6 +44,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
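In passing, the `MixedPrecision` import guard touched in this commit is an instance of a common pattern for version-dependent dependencies: import the symbol when the requirement is met, otherwise bind the name to `None` (with a mypy ignore) so call sites can feature-check at runtime. A generic sketch, where `_NEW_API_AVAILABLE`, `some_backend`, and `ShinyFeature` are placeholders rather than real packages.

```python
from typing import Optional

# Placeholder flag: in the real code this comes from a version/requirement check,
# e.g. a helper like `_TORCH_GREATER_EQUAL_1_12`.
_NEW_API_AVAILABLE = False

if _NEW_API_AVAILABLE:
    from some_backend import ShinyFeature  # hypothetical dependency, imported only when available
else:
    ShinyFeature = None  # type: ignore[misc, assignment]


def build_feature() -> Optional[object]:
    # Call sites feature-check at runtime instead of importing unconditionally.
    if ShinyFeature is None:
        return None
    return ShinyFeature()


assert build_feature() is None  # the flag above is False, so the fallback path is taken
```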
- Removed the deprecated `DDP2Strategy` ([#14026](https://github.com/Lightning-AI/lightning/pull/14026)) +- Removed the deprecated `DistributedType` and `DeviceType` enum classes ([#14045](https://github.com/Lightning-AI/lightning/pull/14045)) + + +- Removed the experimental `pytorch_lightning.utiltiies.meta` functions in favor of built-in https://github.com/pytorch/torchdistx support ([#13868](https://github.com/Lightning-AI/lightning/pull/13868)) + + ### Fixed - Casted only floating point tensors to fp16 with IPUs ([#13983](https://github.com/Lightning-AI/lightning/pull/13983)) diff --git a/src/pytorch_lightning/utilities/__init__.py b/src/pytorch_lightning/utilities/__init__.py index df5084dd85490..c849ba0a05d68 100644 --- a/src/pytorch_lightning/utilities/__init__.py +++ b/src/pytorch_lightning/utilities/__init__.py @@ -21,7 +21,6 @@ _AcceleratorType, _StrategyType, AMPType, - DistributedType, GradClipAlgorithmType, LightningEnum, ) diff --git a/src/pytorch_lightning/utilities/enums.py b/src/pytorch_lightning/utilities/enums.py index e687d3f9f046b..06d616f87259f 100644 --- a/src/pytorch_lightning/utilities/enums.py +++ b/src/pytorch_lightning/utilities/enums.py @@ -15,11 +15,9 @@ from __future__ import annotations import os -from enum import Enum, EnumMeta -from typing import Any +from enum import Enum from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.warnings import rank_zero_deprecation class LightningEnum(str, Enum): @@ -43,37 +41,6 @@ def __hash__(self) -> int: return hash(self.value.lower()) -class _DeprecatedEnumMeta(EnumMeta): - """Enum that calls `deprecate()` whenever a member is accessed. - - Adapted from: https://stackoverflow.com/a/62309159/208880 - """ - - def __getattribute__(cls, name: str) -> Any: - obj = super().__getattribute__(name) - # ignore __dunder__ names -- prevents potential recursion errors - if not (name.startswith("__") and name.endswith("__")) and isinstance(obj, Enum): - obj.deprecate() - return obj - - def __getitem__(cls, name: str) -> Any: - member: _DeprecatedEnumMeta = super().__getitem__(name) - member.deprecate() - return member - - def __call__(cls, *args: Any, **kwargs: Any) -> Any: - obj = super().__call__(*args, **kwargs) - if isinstance(obj, Enum): - obj.deprecate() - return obj - - -class _DeprecatedEnum(LightningEnum, metaclass=_DeprecatedEnumMeta): - """_DeprecatedEnum calls an enum's `deprecate()` method on member access.""" - - pass - - class AMPType(LightningEnum): """Type of Automatic Mixed Precission used for training. @@ -110,66 +77,6 @@ def supported_types() -> list[str]: return [x.value for x in PrecisionType] -class DistributedType(_DeprecatedEnum): - """Define type of training strategy. - - Deprecated since v1.6.0 and will be removed in v1.8.0. - - Use `_StrategyType` instead. 
- """ - - DP = "dp" - DDP = "ddp" - DDP_SPAWN = "ddp_spawn" - TPU_SPAWN = "tpu_spawn" - DEEPSPEED = "deepspeed" - HOROVOD = "horovod" - DDP_SHARDED = "ddp_sharded" - DDP_SHARDED_SPAWN = "ddp_sharded_spawn" - DDP_FULLY_SHARDED = "ddp_fully_sharded" - HPU_PARALLEL = "hpu_parallel" - - @staticmethod - def interactive_compatible_types() -> list[DistributedType]: - """Returns a list containing interactive compatible DistributeTypes.""" - return [ - DistributedType.DP, - DistributedType.DDP_SPAWN, - DistributedType.DDP_SHARDED_SPAWN, - DistributedType.TPU_SPAWN, - ] - - def is_interactive_compatible(self) -> bool: - """Returns whether self is interactive compatible.""" - return self in DistributedType.interactive_compatible_types() - - def deprecate(self) -> None: - rank_zero_deprecation( - "`DistributedType` Enum has been deprecated in v1.6 and will be removed in v1.8." - f" Use the string value `{self.value!r}` instead." - ) - - -class DeviceType(_DeprecatedEnum): - """Define Device type by its nature - accelerators. - - Deprecated since v1.6.0 and will be removed in v1.8.0. - - Use `_AcceleratorType` instead. - """ - - CPU = "CPU" - GPU = "GPU" - IPU = "IPU" - TPU = "TPU" - - def deprecate(self) -> None: - rank_zero_deprecation( - "`DeviceType` Enum has been deprecated in v1.6 and will be removed in v1.8." - f" Use the string value `{self.value!r}` instead." - ) - - class GradClipAlgorithmType(LightningEnum): """Define gradient_clip_algorithm types - training-tricks. NORM type means "clipping gradients by norm". This computed over all model parameters together. diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index aa6c1a615f9d2..91be34c55078f 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -36,7 +36,6 @@ from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.apply_func import move_data_to_device -from pytorch_lightning.utilities.enums import DeviceType, DistributedType from pytorch_lightning.utilities.imports import _TORCHTEXT_LEGACY from pytorch_lightning.utilities.rank_zero import rank_zero_only, rank_zero_warn from tests_pytorch.deprecated_api import no_deprecated_call @@ -44,18 +43,6 @@ from tests_pytorch.helpers.torchtext_utils import get_dummy_torchtext_data_iterator -def test_v1_8_0_deprecated_distributed_type_enum(): - - with pytest.deprecated_call(match="has been deprecated in v1.6 and will be removed in v1.8."): - _ = DistributedType.DDP - - -def test_v1_8_0_deprecated_device_type_enum(): - - with pytest.deprecated_call(match="has been deprecated in v1.6 and will be removed in v1.8."): - _ = DeviceType.CPU - - @pytest.mark.skipif(not _TORCHTEXT_LEGACY, reason="torchtext.legacy is deprecated.") def test_v1_8_0_deprecated_torchtext_batch(): From 355fda3702d640330fddbe25ad127879b0a7cbfa Mon Sep 17 00:00:00 2001 From: Dan Dale Date: Mon, 8 Aug 2022 01:16:53 -0700 Subject: [PATCH 108/230] Add Promoted CLI to API Reference Section (#14072) --- docs/source-pytorch/api_references.rst | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/docs/source-pytorch/api_references.rst b/docs/source-pytorch/api_references.rst index db4fc1e2c4cf8..8daed5ddcaf41 100644 --- a/docs/source-pytorch/api_references.rst +++ b/docs/source-pytorch/api_references.rst @@ -47,6 +47,20 @@ callbacks Timer TQDMProgressBar +cli +----- + +.. 
currentmodule:: pytorch_lightning.cli + +.. autosummary:: + :toctree: api + :nosignatures: + :template: classtemplate.rst + + LightningCLI + LightningArgumentParser + SaveConfigCallback + core ---- From 5271ed93e6823178d1698d150b7146fe7a288695 Mon Sep 17 00:00:00 2001 From: Krishna Kalyan Date: Mon, 8 Aug 2022 10:03:52 +0100 Subject: [PATCH 109/230] Fix mypy errors attributed to `pytorch_lightning.trainer.connectors.callback_connector.py` (#13750) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Apply suggestions from code review Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: otaj <6065855+otaj@users.noreply.github.com> Co-authored-by: Adrian Wälchli Co-authored-by: Rohit Gupta --- pyproject.toml | 1 - .../trainer/connectors/callback_connector.py | 26 +++++++++++-------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9b8400ba27577..2f0e290440f44 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -63,7 +63,6 @@ module = [ "pytorch_lightning.strategies.sharded", "pytorch_lightning.strategies.sharded_spawn", "pytorch_lightning.trainer.callback_hook", - "pytorch_lightning.trainer.connectors.callback_connector", "pytorch_lightning.trainer.connectors.data_connector", "pytorch_lightning.trainer.supporters", "pytorch_lightning.trainer.trainer", diff --git a/src/pytorch_lightning/trainer/connectors/callback_connector.py b/src/pytorch_lightning/trainer/connectors/callback_connector.py index 83881905beeb1..bb7f912420256 100644 --- a/src/pytorch_lightning/trainer/connectors/callback_connector.py +++ b/src/pytorch_lightning/trainer/connectors/callback_connector.py @@ -17,6 +17,7 @@ from datetime import timedelta from typing import Dict, List, Optional, Sequence, Union +import pytorch_lightning as pl from pytorch_lightning.callbacks import ( Callback, Checkpoint, @@ -37,7 +38,7 @@ class CallbackConnector: - def __init__(self, trainer): + def __init__(self, trainer: "pl.Trainer"): self.trainer = trainer def on_trainer_init( @@ -50,7 +51,7 @@ def on_trainer_init( enable_model_summary: bool, max_time: Optional[Union[str, timedelta, Dict[str, int]]] = None, accumulate_grad_batches: Optional[Union[int, Dict[int, int]]] = None, - ): + ) -> None: # init folder paths for checkpoint + weights save callbacks self.trainer._default_root_dir = default_root_dir or os.getcwd() if weights_save_path: @@ -95,16 +96,18 @@ def on_trainer_init( def _configure_accumulated_gradients( self, accumulate_grad_batches: Optional[Union[int, Dict[int, int]]] = None ) -> None: - grad_accum_callback = [cb for cb in self.trainer.callbacks if isinstance(cb, GradientAccumulationScheduler)] + grad_accum_callbacks: List[GradientAccumulationScheduler] = [ + cb for cb in self.trainer.callbacks if isinstance(cb, GradientAccumulationScheduler) + ] - if grad_accum_callback: + if grad_accum_callbacks: if accumulate_grad_batches is not None: raise MisconfigurationException( "You have set both `accumulate_grad_batches` and passed an instance of " "`GradientAccumulationScheduler` inside callbacks. Either remove `accumulate_grad_batches` " "from trainer or remove `GradientAccumulationScheduler` from callbacks list." 
) - grad_accum_callback = grad_accum_callback[0] + grad_accum_callback = grad_accum_callbacks[0] else: if accumulate_grad_batches is None: accumulate_grad_batches = 1 @@ -148,6 +151,7 @@ def _configure_model_summary_callback(self, enable_model_summary: bool) -> None: progress_bar_callback = self.trainer.progress_bar_callback is_progress_bar_rich = isinstance(progress_bar_callback, RichProgressBar) + model_summary: ModelSummary if progress_bar_callback is not None and is_progress_bar_rich: model_summary = RichModelSummary() else: @@ -188,7 +192,7 @@ def _configure_timer_callback(self, max_time: Optional[Union[str, timedelta, Dic timer = Timer(duration=max_time, interval="step") self.trainer.callbacks.append(timer) - def _configure_fault_tolerance_callbacks(self): + def _configure_fault_tolerance_callbacks(self) -> None: from pytorch_lightning.callbacks.fault_tolerance import _FaultToleranceCheckpoint if any(isinstance(cb, _FaultToleranceCheckpoint) for cb in self.trainer.callbacks): @@ -196,7 +200,7 @@ def _configure_fault_tolerance_callbacks(self): # don't use `log_dir` to minimize the chances of failure self.trainer.callbacks.append(_FaultToleranceCheckpoint(dirpath=self.trainer.default_root_dir)) - def _attach_model_logging_functions(self): + def _attach_model_logging_functions(self) -> None: lightning_module = self.trainer.lightning_module for callback in self.trainer.callbacks: callback.log = lightning_module.log @@ -243,7 +247,7 @@ def _reorder_callbacks(callbacks: List[Callback]) -> List[Callback]: A new list in which the last elements are Checkpoint if there were any present in the input. """ - checkpoints = [c for c in callbacks if isinstance(c, Checkpoint)] + checkpoints: List[Callback] = [c for c in callbacks if isinstance(c, Checkpoint)] not_checkpoints = [c for c in callbacks if not isinstance(c, Checkpoint)] return not_checkpoints + checkpoints @@ -263,12 +267,12 @@ def _configure_external_callbacks() -> List[Callback]: else: from pkg_resources import iter_entry_points - factories = iter_entry_points("pytorch_lightning.callbacks_factory") + factories = iter_entry_points("pytorch_lightning.callbacks_factory") # type: ignore[assignment] - external_callbacks = [] + external_callbacks: List[Callback] = [] for factory in factories: callback_factory = factory.load() - callbacks_list: List[Callback] = callback_factory() + callbacks_list: Union[List[Callback], Callback] = callback_factory() callbacks_list = [callbacks_list] if isinstance(callbacks_list, Callback) else callbacks_list _log.info( f"Adding {len(callbacks_list)} callbacks from entry point '{factory.name}':" From 5c9b352eea38d39360324f9740e119dc42b2078e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 8 Aug 2022 09:25:15 +0000 Subject: [PATCH 110/230] Update wandb requirement from <0.12.20,>=0.10.22 to >=0.10.22,<0.13.2 in /requirements (#14080) --- requirements/pytorch/loggers.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/pytorch/loggers.txt b/requirements/pytorch/loggers.txt index 48a15c30f842f..df83a077f8457 100644 --- a/requirements/pytorch/loggers.txt +++ b/requirements/pytorch/loggers.txt @@ -7,4 +7,4 @@ neptune-client>=0.10.0, <0.16.4 comet-ml>=3.1.12, <3.31.8 mlflow>=1.0.0, <1.28.0 test_tube>=0.7.5, <=0.7.5 -wandb>=0.10.22, <0.12.20 +wandb>=0.10.22, <0.13.2 From b4ade232c8d8889fcadbf9b7b49380a3690f8acd Mon Sep 17 00:00:00 2001 From: Rick Izzo Date: Mon, 8 Aug 2022 07:13:25 -0400 Subject: [PATCH 111/230] Fix: Start 
Lightning App on Cloud if Repo Begins With Name "Lightning" (#14025) --- .../utilities/packaging/lightning_utils.py | 7 ++++++- .../utilities/packaging/test_lightning_utils.py | 16 ++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/lightning_app/utilities/packaging/lightning_utils.py b/src/lightning_app/utilities/packaging/lightning_utils.py index 37f4ff22988eb..073d4d7ab613a 100644 --- a/src/lightning_app/utilities/packaging/lightning_utils.py +++ b/src/lightning_app/utilities/packaging/lightning_utils.py @@ -89,8 +89,13 @@ def get_dist_path_if_editable_install(project_name) -> str: def _prepare_lightning_wheels_and_requirements(root: Path) -> Optional[Callable]: + """This function determines if lightning is installed in editable mode (for developers) and packages the + current lightning source along with the app. - if "site-packages" in _PROJECT_ROOT: + For normal users who install via PyPi or Conda, then this function does not do anything. + """ + + if not get_dist_path_if_editable_install("lightning"): return # Packaging the Lightning codebase happens only inside the `lightning` repo. diff --git a/tests/tests_app/utilities/packaging/test_lightning_utils.py b/tests/tests_app/utilities/packaging/test_lightning_utils.py index b34e3162d5a0c..8f30aa21dd396 100644 --- a/tests/tests_app/utilities/packaging/test_lightning_utils.py +++ b/tests/tests_app/utilities/packaging/test_lightning_utils.py @@ -1,4 +1,5 @@ import os +from unittest import mock import pytest @@ -21,6 +22,21 @@ def test_prepare_lightning_wheels_and_requirement(tmpdir): assert os.listdir(tmpdir) == [] +def _mocked_get_dist_path_if_editable_install(*args, **kwargs): + return None + + +@mock.patch( + "lightning_app.utilities.packaging.lightning_utils.get_dist_path_if_editable_install", + new=_mocked_get_dist_path_if_editable_install, +) +def test_prepare_lightning_wheels_and_requirement_for_packages_installed_in_editable_mode(tmpdir): + """This test ensures the source does not get packaged inside the lightning repo if not installed in editable + mode.""" + cleanup_handle = _prepare_lightning_wheels_and_requirements(tmpdir) + assert cleanup_handle is None + + @pytest.mark.skip(reason="TODO: Find a way to check for the latest version") @RunIf(skip_windows=True) def test_verify_lightning_version(monkeypatch): From d072e4451a73f8fc2d7886086a220fbaf614b49e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 8 Aug 2022 13:35:06 +0200 Subject: [PATCH 112/230] Fix dtype inference during gradient norm computation (#14051) --- src/pytorch_lightning/CHANGELOG.md | 3 +++ src/pytorch_lightning/utilities/grads.py | 6 +++--- tests/tests_pytorch/utilities/test_grads.py | 14 ++++++++++++++ 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 565ef0e8438b5..915436e5a0bcf 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -67,6 +67,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
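Related to the editable-install test above: the branch that skips packaging is typically exercised by patching the lookup helper rather than changing how the package is installed. A small illustrative sketch with `mock.patch.object`; the `Packager` class and its method names are hypothetical stand-ins, not the real Lightning helpers.

```python
from unittest import mock


class Packager:
    """Tiny stand-in for the packaging helper; names are illustrative only."""

    def get_editable_path(self):
        return "/path/to/checkout"  # pretend the package is installed in editable mode

    def prepare_wheels(self):
        # Mirrors the early return: do nothing unless running from an editable install.
        if not self.get_editable_path():
            return None
        return "built-wheel"


def test_prepare_wheels_skips_non_editable_installs():
    packager = Packager()
    with mock.patch.object(Packager, "get_editable_path", return_value=None):
        assert packager.prepare_wheels() is None
    assert packager.prepare_wheels() == "built-wheel"
```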
- Fixed MPS device being unrecognized ([#13992](https://github.com/Lightning-AI/lightning/pull/13992)) +- Fixed dtype inference during gradient norm computation ([#14051](https://github.com/Lightning-AI/lightning/pull/14051)) + + ## [1.7.0] - 2022-08-02 ### Added diff --git a/src/pytorch_lightning/utilities/grads.py b/src/pytorch_lightning/utilities/grads.py index 66c1b7d988522..76c3f39bdc013 100644 --- a/src/pytorch_lightning/utilities/grads.py +++ b/src/pytorch_lightning/utilities/grads.py @@ -41,12 +41,12 @@ def grad_norm(module: Module, norm_type: Union[float, int, str], group_separator raise ValueError(f"`norm_type` must be a positive number or 'inf' (infinity norm). Got {norm_type}") norms = { - f"grad_{norm_type}_norm{group_separator}{name}": p.grad.data.norm(norm_type).item() + f"grad_{norm_type}_norm{group_separator}{name}": p.grad.data.norm(norm_type) for name, p in module.named_parameters() if p.grad is not None } if norms: - total_norm = torch.tensor(list(norms.values())).norm(norm_type).item() + total_norm = torch.tensor(list(norms.values())).norm(norm_type) norms[f"grad_{norm_type}_norm_total"] = total_norm - norms = {k: round(v, 4) for k, v in norms.items()} + norms = {k: round(v.item(), 4) for k, v in norms.items()} return norms diff --git a/tests/tests_pytorch/utilities/test_grads.py b/tests/tests_pytorch/utilities/test_grads.py index a548de66ab85d..49aab76403847 100644 --- a/tests/tests_pytorch/utilities/test_grads.py +++ b/tests/tests_pytorch/utilities/test_grads.py @@ -76,3 +76,17 @@ def __init__(self): def test_grad_norm_invalid_norm_type(norm_type): with pytest.raises(ValueError, match="`norm_type` must be a positive number or 'inf'"): grad_norm(Mock(), norm_type) + + +def test_grad_norm_with_double_dtype(): + class Model(nn.Module): + def __init__(self): + super().__init__() + dtype = torch.double + self.param = nn.Parameter(torch.tensor(1.0, dtype=dtype)) + # grad norm of this would become infinite + self.param.grad = torch.tensor(1e23, dtype=dtype) + + model = Model() + norms = grad_norm(model, 2) + assert all(torch.isfinite(torch.tensor(v)) for v in norms.values()), norms From 61a9f3a9bc19272ed8117a9e4dd25bd9c0608105 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 8 Aug 2022 12:53:57 +0000 Subject: [PATCH 113/230] Update tqdm requirement from <=4.63.0,>=4.57.0 to >=4.57.0,<4.65.0 in /requirements (#13875) Update tqdm requirement in /requirements Updates the requirements on [tqdm](https://github.com/tqdm/tqdm) to permit the latest version. - [Release notes](https://github.com/tqdm/tqdm/releases) - [Commits](https://github.com/tqdm/tqdm/compare/v4.57.0...v4.64.0) --- updated-dependencies: - dependency-name: tqdm dependency-type: direct:production ... 
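For intuition, the `grad_norm` fix above works because the per-parameter norms stay tensors, so `torch.tensor(...)` over them infers the widest dtype (for example `float64`) and only the final values are converted to Python floats. A condensed before/after sketch using a single double-precision value in place of a real module, mirroring the overflow that the new unit test guards against.

```python
import torch

huge = torch.tensor(1e23, dtype=torch.double)  # stand-in for a double-precision gradient

# Old behaviour: calling .item() too early squeezes the value through a default float32 tensor,
# and the squared sum inside the 2-norm overflows to infinity.
old_total = torch.tensor([huge.norm(2).item()]).norm(2)
assert old_total.dtype is torch.float32 and torch.isinf(old_total)

# New behaviour: keeping tensors lets torch.tensor infer float64, so the total norm stays finite.
new_total = torch.tensor([huge.norm(2)]).norm(2)
assert new_total.dtype is torch.float64 and torch.isfinite(new_total)
```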
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Jirka Borovec --- requirements/pytorch/base.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index e8743b18c73b0..49e2243319206 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -3,7 +3,7 @@ numpy>=1.17.2, <1.23.1 torch>=1.9.*, <=1.12.0 -tqdm>=4.57.0, <=4.63.0 +tqdm>=4.57.0, <4.65.0 PyYAML>=5.4, <=6.0 fsspec[http]>=2021.05.0, !=2021.06.0, <2022.6.0 tensorboard>=2.9.1, <2.10.0 From 890156a0163668149a47943907694c40cad153d3 Mon Sep 17 00:00:00 2001 From: JongMok Lee Date: Mon, 8 Aug 2022 22:16:56 +0900 Subject: [PATCH 114/230] Fix mypy errors in `pytorch_lightning/strategies/ddp.py` (#13885) Co-authored-by: awaelchli --- pyproject.toml | 1 - .../overrides/distributed.py | 2 - src/pytorch_lightning/strategies/ddp.py | 69 +++++++++++++------ src/pytorch_lightning/strategies/ddp_spawn.py | 3 +- src/pytorch_lightning/strategies/deepspeed.py | 4 +- 5 files changed, 51 insertions(+), 28 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 2f0e290440f44..761c7be04cc0e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,7 +59,6 @@ module = [ "pytorch_lightning.profilers.base", "pytorch_lightning.profilers.pytorch", "pytorch_lightning.profilers.simple", - "pytorch_lightning.strategies.ddp", "pytorch_lightning.strategies.sharded", "pytorch_lightning.strategies.sharded_spawn", "pytorch_lightning.trainer.callback_hook", diff --git a/src/pytorch_lightning/overrides/distributed.py b/src/pytorch_lightning/overrides/distributed.py index f09a7b9e3ae08..929d1ed486f4a 100644 --- a/src/pytorch_lightning/overrides/distributed.py +++ b/src/pytorch_lightning/overrides/distributed.py @@ -45,8 +45,6 @@ def _find_tensors( # https://github.com/pytorch/pytorch/blob/v1.7.1/torch/nn/parallel/distributed.py#L626-L638 def prepare_for_backward(model: DistributedDataParallel, output: Any) -> None: # `prepare_for_backward` is `DistributedDataParallel` specific. 
- if not isinstance(model, DistributedDataParallel): - return if torch.is_grad_enabled() and model.require_backward_grad_sync: model.require_forward_param_sync = True # type: ignore[assignment] # We'll return the output object verbatim since it is a freeform diff --git a/src/pytorch_lightning/strategies/ddp.py b/src/pytorch_lightning/strategies/ddp.py index 922730df35269..57ab3a151b011 100644 --- a/src/pytorch_lightning/strategies/ddp.py +++ b/src/pytorch_lightning/strategies/ddp.py @@ -32,6 +32,7 @@ import pytorch_lightning as pl from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.overrides import LightningDistributedModule +from pytorch_lightning.overrides.base import _LightningPrecisionModuleWrapperBase from pytorch_lightning.overrides.distributed import prepare_for_backward from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment @@ -39,6 +40,7 @@ from pytorch_lightning.plugins.precision import PrecisionPlugin from pytorch_lightning.strategies.launchers.subprocess_script import _SubprocessScriptLauncher from pytorch_lightning.strategies.parallel import ParallelStrategy +from pytorch_lightning.strategies.strategy import TBroadcast from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.distributed import ( _get_process_group_backend_from_env, @@ -57,7 +59,7 @@ from pytorch_lightning.utilities.optimizer import optimizers_to_device from pytorch_lightning.utilities.rank_zero import rank_zero_info, rank_zero_only, rank_zero_warn from pytorch_lightning.utilities.seed import reset_seed -from pytorch_lightning.utilities.types import STEP_OUTPUT +from pytorch_lightning.utilities.types import PredictStep, STEP_OUTPUT, TestStep, ValidationStep if _FAIRSCALE_AVAILABLE: from fairscale.optim import OSS @@ -83,12 +85,12 @@ def __init__( checkpoint_io: Optional[CheckpointIO] = None, precision_plugin: Optional[PrecisionPlugin] = None, ddp_comm_state: Optional[object] = None, - ddp_comm_hook: Optional[callable] = None, - ddp_comm_wrapper: Optional[callable] = None, + ddp_comm_hook: Optional[Callable] = None, + ddp_comm_wrapper: Optional[Callable] = None, model_averaging_period: Optional[int] = None, process_group_backend: Optional[str] = None, timeout: Optional[timedelta] = default_pg_timeout, - **kwargs: Union[Any, Dict[str, Any]], + **kwargs: Any, ) -> None: super().__init__( accelerator=accelerator, @@ -105,7 +107,7 @@ def __init__( self._ddp_comm_wrapper = ddp_comm_wrapper self._model_averaging_period = model_averaging_period self._model_averager: Optional[ModelAverager] = None - self._pids: Optional[List[int]] = None + self._pids: List[int] = [] self._sync_dir: Optional[str] = None self._rank_0_will_call_children_scripts: bool = False self._process_group_backend: Optional[str] = process_group_backend @@ -117,6 +119,7 @@ def is_distributed(self) -> bool: @property def root_device(self) -> torch.device: + assert self.parallel_devices is not None return self.parallel_devices[self.local_rank] @property @@ -129,11 +132,11 @@ def num_nodes(self, num_nodes: int) -> None: self._num_nodes = num_nodes @property - def num_processes(self): + def num_processes(self) -> int: return len(self.parallel_devices) if self.parallel_devices is not None else 0 @property - def distributed_sampler_kwargs(self): + def distributed_sampler_kwargs(self) -> Dict[str, Any]: distributed_sampler_kwargs = dict(num_replicas=(self.num_nodes * self.num_processes), 
rank=self.global_rank) return distributed_sampler_kwargs @@ -146,6 +149,7 @@ def process_group_backend(self) -> Optional[str]: return self._process_group_backend def _configure_launcher(self) -> None: + assert self.cluster_environment is not None if not self.cluster_environment.creates_processes_externally: self._launcher = _SubprocessScriptLauncher(self.cluster_environment, self.num_processes, self.num_nodes) self._rank_0_will_call_children_scripts = True @@ -156,10 +160,11 @@ def setup_environment(self) -> None: def setup(self, trainer: "pl.Trainer") -> None: # share ddp pids to all processes - self._rank_0_will_call_children_scripts = self.broadcast(self._rank_0_will_call_children_scripts) + self._rank_0_will_call_children_scripts = bool(self.broadcast(self._rank_0_will_call_children_scripts)) if self._should_run_deadlock_detection(): self._share_information_to_prevent_deadlock() + assert self.accelerator is not None self.accelerator.setup(trainer) # move the model to the correct device @@ -170,6 +175,7 @@ def setup(self, trainer: "pl.Trainer") -> None: if trainer_fn == TrainerFn.FITTING: if self._layer_sync: + assert self.model is not None self.model = self._layer_sync.apply(self.model) self.setup_precision_plugin() @@ -193,7 +199,7 @@ def _setup_model(self, model: Module) -> DistributedDataParallel: log.detail(f"setting up DDP model with device ids: {device_ids}, kwargs: {self._ddp_kwargs}") return DistributedDataParallel(module=model, device_ids=device_ids, **self._ddp_kwargs) - def setup_distributed(self): + def setup_distributed(self) -> None: log.detail(f"{self.__class__.__name__}: setting up distributed...") reset_seed() @@ -204,6 +210,7 @@ def setup_distributed(self): rank_zero_only.rank = self.global_rank self._process_group_backend = self._get_process_group_backend() + assert self.cluster_environment is not None init_dist_connection(self.cluster_environment, self._process_group_backend, timeout=self._timeout) def _get_process_group_backend(self) -> str: @@ -230,6 +237,7 @@ def pre_configure_ddp(self) -> None: def _register_ddp_hooks(self) -> None: log.detail(f"{self.__class__.__name__}: registering ddp hooks") if self.root_device.type == "cuda" and self._is_single_process_single_device: + assert isinstance(self.model, DistributedDataParallel) register_ddp_comm_hook( model=self.model, ddp_comm_state=self._ddp_comm_state, @@ -262,6 +270,7 @@ def _enable_model_averaging(self) -> None: f"{optimizer.__class__.__name__}." 
) + assert self._ddp_comm_state is not None self._model_averager = torch.distributed.algorithms.model_averaging.averagers.PeriodicModelAverager( period=self._model_averaging_period, warmup_steps=self._ddp_comm_state.start_localSGD_iter ) @@ -296,15 +305,16 @@ def optimizer_step( def configure_ddp(self) -> None: log.detail(f"{self.__class__.__name__}: configuring DistributedDataParallel") self.pre_configure_ddp() + assert isinstance(self.model, (pl.LightningModule, _LightningPrecisionModuleWrapperBase)) self.model = self._setup_model(LightningDistributedModule(self.model)) self._register_ddp_hooks() - def determine_ddp_device_ids(self): + def determine_ddp_device_ids(self) -> Optional[List[int]]: if self.root_device.type == "cpu": return None return [self.root_device.index] - def barrier(self, *args, **kwargs) -> None: + def barrier(self, *args: Any, **kwargs: Any) -> None: if not distributed_available(): return if torch.distributed.get_backend() == "nccl": @@ -312,23 +322,29 @@ def barrier(self, *args, **kwargs) -> None: else: torch.distributed.barrier() - def broadcast(self, obj: object, src: int = 0) -> object: + def broadcast(self, obj: TBroadcast, src: int = 0) -> TBroadcast: obj = [obj] if self.global_rank != src: - obj = [None] + obj = [None] # type: ignore[list-item] torch.distributed.broadcast_object_list(obj, src, group=_group.WORLD) return obj[0] def pre_backward(self, closure_loss: Tensor) -> None: """Run before precision plugin executes backward.""" + if not isinstance(self.model, DistributedDataParallel): + return + assert self.lightning_module is not None if not self.lightning_module.automatic_optimization: prepare_for_backward(self.model, closure_loss) - def model_to_device(self): + def model_to_device(self) -> None: log.detail(f"{self.__class__.__name__}: moving model to device [{self.root_device}]...") + assert self.model is not None self.model.to(self.root_device) - def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Union[ReduceOp, str] = "mean") -> Tensor: + def reduce( + self, tensor: Tensor, group: Optional[Any] = None, reduce_op: Optional[Union[ReduceOp, str]] = "mean" + ) -> Tensor: """Reduces a tensor from several distributed processes to one aggregated tensor. 
Args: @@ -344,30 +360,38 @@ def reduce(self, tensor, group: Optional[Any] = None, reduce_op: Union[ReduceOp, tensor = sync_ddp_if_available(tensor, group, reduce_op=reduce_op) return tensor - def training_step(self, *args, **kwargs) -> STEP_OUTPUT: + def training_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: + assert self.model is not None with self.precision_plugin.train_step_context(): return self.model(*args, **kwargs) - def validation_step(self, *args, **kwargs) -> Optional[STEP_OUTPUT]: + def validation_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: with self.precision_plugin.val_step_context(): + assert self.lightning_module is not None + assert self.model is not None if self.lightning_module.trainer.state.fn == TrainerFn.FITTING: # used when calling `trainer.fit` return self.model(*args, **kwargs) else: # used when calling `trainer.validate` + assert isinstance(self.model, ValidationStep) return self.model.validation_step(*args, **kwargs) - def test_step(self, *args, **kwargs) -> Optional[STEP_OUTPUT]: + def test_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: with self.precision_plugin.test_step_context(): + assert isinstance(self.model, TestStep) return self.model.test_step(*args, **kwargs) - def predict_step(self, *args, **kwargs) -> STEP_OUTPUT: + def predict_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: with self.precision_plugin.predict_step_context(): + assert isinstance(self.model, PredictStep) return self.model.predict_step(*args, **kwargs) - def post_training_step(self): + def post_training_step(self) -> None: + assert self.lightning_module is not None if not self.lightning_module.automatic_optimization: - self.model.require_backward_grad_sync = True + assert self.model is not None + self.model.require_backward_grad_sync = True # type: ignore[assignment] @classmethod def register_strategies(cls, strategy_registry: Dict) -> None: @@ -458,7 +482,7 @@ def teardown(self) -> None: if ( _TORCH_GREATER_EQUAL_1_11 and not self.model.static_graph - and self.model._get_ddp_logging_data().get("can_set_static_graph") + and self.model._get_ddp_logging_data().get("can_set_static_graph") # type: ignore[operator] ): rank_zero_info( "Your model can run with static graph optimizations. 
For future training runs, we suggest you" @@ -475,6 +499,7 @@ def teardown(self) -> None: and pl_module._trainer.state.fn == TrainerFn.FITTING and self._layer_sync ): + assert self.model is not None self.model = self._layer_sync.revert(self.model) super().teardown() diff --git a/src/pytorch_lightning/strategies/ddp_spawn.py b/src/pytorch_lightning/strategies/ddp_spawn.py index 30bcef457c44a..21602e60a5754 100644 --- a/src/pytorch_lightning/strategies/ddp_spawn.py +++ b/src/pytorch_lightning/strategies/ddp_spawn.py @@ -254,9 +254,10 @@ def model_to_device(self) -> None: def pre_backward(self, closure_loss: Tensor) -> None: """Run before precision plugin executes backward.""" + if not isinstance(self.model, DistributedDataParallel): + return assert self.lightning_module is not None if not self.lightning_module.automatic_optimization: - assert isinstance(self.model, DistributedDataParallel) prepare_for_backward(self.model, closure_loss) def reduce( diff --git a/src/pytorch_lightning/strategies/deepspeed.py b/src/pytorch_lightning/strategies/deepspeed.py index b0b55374ba1a9..3c31aeb7a7657 100644 --- a/src/pytorch_lightning/strategies/deepspeed.py +++ b/src/pytorch_lightning/strategies/deepspeed.py @@ -19,7 +19,7 @@ import platform from collections import OrderedDict from pathlib import Path -from typing import Any, cast, Dict, Generator, List, Mapping, Optional, Tuple, Union +from typing import Any, Dict, Generator, List, Mapping, Optional, Tuple, Union import torch from torch import Tensor @@ -831,7 +831,7 @@ def load_checkpoint(self, checkpoint_path: _PATH) -> Dict[str, Any]: if self.load_full_weights and self.zero_stage_3: # Broadcast to ensure we load from the rank 0 checkpoint # This doesn't have to be the case when using deepspeed sharded checkpointing - checkpoint_path = cast(_PATH, self.broadcast(checkpoint_path)) + checkpoint_path = self.broadcast(checkpoint_path) return super().load_checkpoint(checkpoint_path) # Rely on deepspeed to load the checkpoint and necessary information From 7439f5d7491a87fbbb33f47ac18fa4ff8c7eeb23 Mon Sep 17 00:00:00 2001 From: Sean Naren Date: Mon, 8 Aug 2022 14:23:07 +0100 Subject: [PATCH 115/230] Update CODEOWNERS (remove myself from defaults + some specifics) (#14084) Update CODEOWNERS --- .github/CODEOWNERS | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index e40828557c2cf..05f7e91104589 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -5,7 +5,7 @@ # the repo. Unless a later match takes precedence, # @global-owner1 and @global-owner2 will be requested for # review when someone opens a pull request. 
-* @williamfalcon @borda @tchaton @SeanNaren @carmocca @awaelchli @justusschock @kaushikb11 @rohitgr7 +* @williamfalcon @borda @tchaton @carmocca @awaelchli @justusschock @kaushikb11 @rohitgr7 # CI/CD and configs /.github/ @borda @carmocca @akihironitta @tchaton @@ -28,22 +28,22 @@ # Packages /src/pytorch_lightning/accelerators @williamfalcon @tchaton @SeanNaren @awaelchli @justusschock @kaushikb11 /src/pytorch_lightning/callbacks @williamfalcon @tchaton @carmocca @borda @kaushikb11 -/src/pytorch_lightning/core @tchaton @SeanNaren @borda @carmocca @justusschock @kaushikb11 +/src/pytorch_lightning/core @tchaton @borda @carmocca @justusschock @kaushikb11 /src/pytorch_lightning/distributed @williamfalcon @tchaton @awaelchli @kaushikb11 /src/pytorch_lightning/lite @tchaton @awaelchli @carmocca /src/pytorch_lightning/loggers @tchaton @awaelchli @borda /src/pytorch_lightning/loggers/wandb.py @borisdayma /src/pytorch_lightning/loggers/neptune.py @shnela @HubertJaworski @pkasprzyk @pitercl @Raalsky @aniezurawski @kamil-kaczmarek /src/pytorch_lightning/loops @tchaton @awaelchli @justusschock @carmocca -/src/pytorch_lightning/overrides @tchaton @SeanNaren @borda -/src/pytorch_lightning/plugins @tchaton @SeanNaren @awaelchli @justusschock +/src/pytorch_lightning/overrides @tchaton @borda +/src/pytorch_lightning/plugins @tchaton @awaelchli @justusschock /src/pytorch_lightning/profilers @williamfalcon @tchaton @borda @carmocca /src/pytorch_lightning/profilers/pytorch.py @nbcsm @guotuofeng /src/pytorch_lightning/strategies @tchaton @SeanNaren @awaelchli @justusschock @kaushikb11 -/src/pytorch_lightning/trainer @williamfalcon @borda @tchaton @SeanNaren @carmocca @awaelchli @justusschock @kaushikb11 -/src/pytorch_lightning/trainer/connectors @tchaton @SeanNaren @carmocca @borda +/src/pytorch_lightning/trainer @williamfalcon @borda @tchaton @carmocca @awaelchli @justusschock @kaushikb11 +/src/pytorch_lightning/trainer/connectors @tchaton @carmocca @borda /src/pytorch_lightning/tuner @SkafteNicki @borda @awaelchli -/src/pytorch_lightning/utilities @borda @tchaton @SeanNaren @carmocca +/src/pytorch_lightning/utilities @borda @tchaton @carmocca /src/lightning_app @tchaton @awaelchli @manskx @hhsecond From 55ae812dbf11f6568c73d5743aef0745715fb9fd Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Mon, 8 Aug 2022 15:48:50 +0200 Subject: [PATCH 116/230] Resolve increased time. (#14074) --- src/lightning_app/CHANGELOG.md | 2 ++ src/lightning_app/utilities/proxies.py | 2 +- tests/tests_app/core/test_lightning_app.py | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 07927a1b01f87..78a4e370e76ee 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -27,3 +27,5 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Fixed - Resolved a bug where the work statuses will grow quickly and be duplicated ([#13970](https://github.com/Lightning-AI/lightning/pull/13970)) + +- Resolved a bug about a race condition when sending the work state through the caller_queue ([#14074](https://github.com/Lightning-AI/lightning/pull/14074)) diff --git a/src/lightning_app/utilities/proxies.py b/src/lightning_app/utilities/proxies.py index 2c93a6c89f38c..99ad6e2aad0cf 100644 --- a/src/lightning_app/utilities/proxies.py +++ b/src/lightning_app/utilities/proxies.py @@ -74,7 +74,7 @@ def _send_data_to_caller_queue(work: "LightningWork", caller_queue: "BaseQueue", data.update({"state": work_state}) logger.debug(f"Sending to {work.name}: {data}") - caller_queue.put(data) + caller_queue.put(deepcopy(data)) # Reset the calls entry. work_state["calls"] = calls diff --git a/tests/tests_app/core/test_lightning_app.py b/tests/tests_app/core/test_lightning_app.py index a3a15085b98e3..e6c715f87ef03 100644 --- a/tests/tests_app/core/test_lightning_app.py +++ b/tests/tests_app/core/test_lightning_app.py @@ -896,6 +896,7 @@ def __init__(self, **kwargs): def run(self, signal: int): self.counter += 1 + assert len(self._calls) == 2 class SizeFlow(LightningFlow): From 34afde742ebe3acb56f5d0f14cd79d589d9771e0 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Tue, 9 Aug 2022 00:00:46 +0900 Subject: [PATCH 117/230] CI: Enable Python 3.10 in full CPU testing (#13829) * Update docker images to build --- .github/workflows/README.md | 2 +- .github/workflows/ci-pytorch_test-full.yml | 8 ++++++-- .github/workflows/ci-pytorch_test-slow.yml | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 8b9e7d173b03c..f559551e1237f 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -6,7 +6,7 @@ | workflow name | workflow file | action | accelerator\* | (Python, PyTorch) | OS | | -------------------------- | ----------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | ------------------------------------------------ | ------------------- | -| Test full | .github/workflows/ci_test-full.yml | Run all tests except for accelerator-specific, standalone and slow tests. | CPU | (3.7, 1.8), (3.7, 1.11), (3.9, 1.8), (3.9, 1.12) | linux, mac, windows | +| Test full | .github/workflows/ci_test-full.yml | Run all tests except for accelerator-specific, standalone and slow tests. | CPU | (3.7, 1.9), (3.7, 1.12), (3.10, 1.12) | linux, mac, windows | | Test with Conda | .github/workflows/ci_test-conda.yml | Same as ci_test-full.yml but with dependencies installed with conda. | CPU | (3.8, 1.8), (3.8, 1.9), (3.8, 1.10), (3.9, 1.12) | linux | | Test slow | .github/workflows/ci_test-slow.yml | Run only slow tests. Slow tests usually need to spawn threads and cannot be speed up or simplified. | CPU | (3.7, 1.8) | linux, mac, windows | | pytorch-lightning (IPUs) | .azure-pipelines/ipu-tests.yml | Run only IPU-specific tests. 
| IPU | (3.8, 1.9) | linux | diff --git a/.github/workflows/ci-pytorch_test-full.yml b/.github/workflows/ci-pytorch_test-full.yml index fb6916d1414fe..445707d340c4b 100644 --- a/.github/workflows/ci-pytorch_test-full.yml +++ b/.github/workflows/ci-pytorch_test-full.yml @@ -21,9 +21,13 @@ jobs: fail-fast: false matrix: os: [ubuntu-20.04, windows-2019, macOS-11] - python-version: ["3.7", "3.9"] # minimum, maximum + python-version: ["3.7", "3.10"] # minimum, maximum requires: ["oldest", "latest"] release: ["stable"] + exclude: + # There's no distribution of the oldest PyTorch 1.9 for Python 3.10. + # TODO: Remove the exclusion when dropping PyTorch 1.9 support. + - {python-version: "3.10", requires: "oldest"} # TODO: re-enable RC testing # include: # - {os: ubuntu-20.04, python-version: "3.10", requires: "latest", release: "pre"} @@ -41,7 +45,7 @@ jobs: id: skip shell: bash -l {0} run: | - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' + FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.github/workflows/ci-pytorch_test-full.yml' echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt MATCHES=$(cat changed_files.txt | grep -E $FILTER) echo $MATCHES diff --git a/.github/workflows/ci-pytorch_test-slow.yml b/.github/workflows/ci-pytorch_test-slow.yml index 905f60aa85699..b3756bbe8c2f7 100644 --- a/.github/workflows/ci-pytorch_test-slow.yml +++ b/.github/workflows/ci-pytorch_test-slow.yml @@ -36,7 +36,7 @@ jobs: id: skip shell: bash -l {0} run: | - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' + FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.github/workflows/ci-pytorch_test-slow.yml' echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt MATCHES=$(cat changed_files.txt | grep -E $FILTER) echo $MATCHES From 82d2d1d85746c7743cca47e760422d3e13af6a6f Mon Sep 17 00:00:00 2001 From: Justin Goheen <26209687+JustinGoheen@users.noreply.github.com> Date: Mon, 8 Aug 2022 16:21:26 -0400 Subject: [PATCH 118/230] Fix mypy errors attributed to `pytorch_lightning.core.saving` (#13932) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli Co-authored-by: otaj <6065855+otaj@users.noreply.github.com> Co-authored-by: Rohit Gupta --- pyproject.toml | 1 - src/pytorch_lightning/core/saving.py | 39 +++++++++++---------- src/pytorch_lightning/utilities/cloud_io.py | 6 ++-- src/pytorch_lightning/utilities/parsing.py | 4 ++- src/pytorch_lightning/utilities/types.py | 3 +- 5 files changed, 29 insertions(+), 24 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 761c7be04cc0e..8db782df357d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,7 +53,6 @@ module = [ "pytorch_lightning.callbacks.quantization", "pytorch_lightning.core.datamodule", "pytorch_lightning.core.module", - "pytorch_lightning.core.saving", "pytorch_lightning.demos.boring_classes", "pytorch_lightning.demos.mnist_datamodule", "pytorch_lightning.profilers.base", diff --git a/src/pytorch_lightning/core/saving.py b/src/pytorch_lightning/core/saving.py index da81e4c212560..ffdc0988a1a6e 100644 --- a/src/pytorch_lightning/core/saving.py +++ b/src/pytorch_lightning/core/saving.py @@ -20,10 +20,9 @@ from argparse import Namespace from copy import deepcopy from enum import Enum -from typing import Any, Callable, Dict, IO, MutableMapping, Optional, Union 
+from typing import Any, Callable, cast, Dict, IO, MutableMapping, Optional, Type, Union from warnings import warn -import torch import yaml import pytorch_lightning as pl @@ -34,7 +33,7 @@ from pytorch_lightning.utilities.migration import pl_legacy_patch from pytorch_lightning.utilities.parsing import parse_class_init_keys from pytorch_lightning.utilities.rank_zero import rank_zero_warn -from pytorch_lightning.utilities.types import _PATH +from pytorch_lightning.utilities.types import _MAP_LOCATION_TYPE, _PATH log = logging.getLogger(__name__) PRIMITIVE_TYPES = (bool, int, float, str) @@ -58,11 +57,11 @@ class ModelIO: def load_from_checkpoint( cls, checkpoint_path: Union[str, IO], - map_location: Optional[Union[Dict[str, str], str, torch.device, int, Callable]] = None, + map_location: _MAP_LOCATION_TYPE = None, hparams_file: Optional[str] = None, strict: bool = True, - **kwargs, - ): + **kwargs: Any, + ) -> Union["pl.LightningModule", "pl.LightningDataModule"]: r""" Primary way of loading a model from a checkpoint. When Lightning saves a checkpoint it stores the arguments passed to ``__init__`` in the checkpoint under ``"hyper_parameters"``. @@ -171,15 +170,15 @@ def on_hpc_load(self, checkpoint: Dict[str, Any]) -> None: def _load_from_checkpoint( - cls: Union["pl.LightningModule", "pl.LightningDataModule"], + cls: Union[Type["ModelIO"], Type["pl.LightningModule"], Type["pl.LightningDataModule"]], checkpoint_path: Union[str, IO], - map_location: Optional[Union[Dict[str, str], str, torch.device, int, Callable]] = None, + map_location: _MAP_LOCATION_TYPE = None, hparams_file: Optional[str] = None, - strict: Optional[bool] = None, + strict: bool = True, **kwargs: Any, -) -> Any: +) -> Union["pl.LightningModule", "pl.LightningDataModule"]: if map_location is None: - map_location = lambda storage, loc: storage + map_location = cast(_MAP_LOCATION_TYPE, lambda storage, loc: storage) with pl_legacy_patch(): checkpoint = pl_load(checkpoint_path, map_location=map_location) @@ -202,15 +201,18 @@ def _load_from_checkpoint( if issubclass(cls, pl.LightningDataModule): return _load_state(cls, checkpoint, **kwargs) - return _load_state(cls, checkpoint, strict=strict, **kwargs) + # allow cls to be evaluated as subclassed LightningModule or, + # as LightningModule for internal tests + if issubclass(cls, pl.LightningModule): + return _load_state(cls, checkpoint, strict=strict, **kwargs) def _load_state( - cls: Union["pl.LightningModule", "pl.LightningDataModule"], + cls: Union[Type["pl.LightningModule"], Type["pl.LightningDataModule"]], checkpoint: Dict[str, Any], - strict: Optional[bool] = None, + strict: bool = True, **cls_kwargs_new: Any, -) -> Any: +) -> Union["pl.LightningModule", "pl.LightningDataModule"]: cls_spec = inspect.getfullargspec(cls.__init__) cls_init_args_name = inspect.signature(cls.__init__).parameters.keys() @@ -228,8 +230,7 @@ def _load_state( cls_kwargs_loaded.update(checkpoint.get(_old_hparam_key, {})) # 2. Try to restore model hparams from checkpoint using the new key - _new_hparam_key = cls.CHECKPOINT_HYPER_PARAMS_KEY - cls_kwargs_loaded.update(checkpoint.get(_new_hparam_key)) + cls_kwargs_loaded.update(checkpoint.get(cls.CHECKPOINT_HYPER_PARAMS_KEY, {})) # 3. 
Ensure that `cls_kwargs_old` has the right type, back compatibility between dict and Namespace cls_kwargs_loaded = _convert_loaded_hparams(cls_kwargs_loaded, checkpoint.get(cls.CHECKPOINT_HYPER_PARAMS_TYPE)) @@ -271,7 +272,9 @@ def _load_state( return obj -def _convert_loaded_hparams(model_args: dict, hparams_type: Optional[Union[Callable, str]] = None) -> object: +def _convert_loaded_hparams( + model_args: Dict[str, Any], hparams_type: Optional[Union[Callable, str]] = None +) -> Dict[str, Any]: """Convert hparams according given type in callable or string (past) format.""" # if not hparams type define if not hparams_type: diff --git a/src/pytorch_lightning/utilities/cloud_io.py b/src/pytorch_lightning/utilities/cloud_io.py index ee3358be59541..99629bcda8980 100644 --- a/src/pytorch_lightning/utilities/cloud_io.py +++ b/src/pytorch_lightning/utilities/cloud_io.py @@ -15,19 +15,19 @@ import io from pathlib import Path -from typing import Any, Callable, Dict, IO, Optional, Union +from typing import Any, Dict, IO, Union import fsspec import torch from fsspec.core import url_to_fs from fsspec.implementations.local import AbstractFileSystem -from pytorch_lightning.utilities.types import _DEVICE, _PATH +from pytorch_lightning.utilities.types import _MAP_LOCATION_TYPE, _PATH def load( path_or_url: Union[IO, _PATH], - map_location: Optional[Union[_DEVICE, Callable[[_DEVICE], _DEVICE], Dict[_DEVICE, _DEVICE]]] = None, + map_location: _MAP_LOCATION_TYPE = None, ) -> Any: """Loads a checkpoint. diff --git a/src/pytorch_lightning/utilities/parsing.py b/src/pytorch_lightning/utilities/parsing.py index 9f5fe2d6b6841..81877f1dffba7 100644 --- a/src/pytorch_lightning/utilities/parsing.py +++ b/src/pytorch_lightning/utilities/parsing.py @@ -108,7 +108,9 @@ def clean_namespace(hparams: Union[Dict[str, Any], Namespace]) -> None: del hparams_dict[k] -def parse_class_init_keys(cls: Type["pl.LightningModule"]) -> Tuple[str, Optional[str], Optional[str]]: +def parse_class_init_keys( + cls: Union[Type["pl.LightningModule"], Type["pl.LightningDataModule"]] +) -> Tuple[str, Optional[str], Optional[str]]: """Parse key words for standard ``self``, ``*args`` and ``**kwargs``. 
Examples: diff --git a/src/pytorch_lightning/utilities/types.py b/src/pytorch_lightning/utilities/types.py index f6c14d366805f..18e2db6feb6c6 100644 --- a/src/pytorch_lightning/utilities/types.py +++ b/src/pytorch_lightning/utilities/types.py @@ -19,7 +19,7 @@ from contextlib import contextmanager from dataclasses import dataclass from pathlib import Path -from typing import Any, Dict, Generator, Iterator, List, Mapping, Optional, Sequence, Type, Union +from typing import Any, Callable, Dict, Generator, Iterator, List, Mapping, Optional, Sequence, Type, Union import torch from torch import Tensor @@ -49,6 +49,7 @@ ] EVAL_DATALOADERS = Union[DataLoader, Sequence[DataLoader]] _DEVICE = Union[torch.device, str, int] +_MAP_LOCATION_TYPE = Optional[Union[_DEVICE, Callable[[_DEVICE], _DEVICE], Dict[_DEVICE, _DEVICE]]] @runtime_checkable From 0cfc53d6b423531f598c9cfb386b3febbb8eb333 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 9 Aug 2022 10:26:02 +0200 Subject: [PATCH 119/230] Fix regression on default value for `find_unused_parameters` (#14095) --- src/pytorch_lightning/CHANGELOG.md | 3 +++ src/pytorch_lightning/strategies/ddp_spawn.py | 14 ++++++++++++-- tests/tests_pytorch/strategies/test_ddp.py | 12 ++++++++++++ .../strategies/test_ddp_spawn_strategy.py | 16 ++++++++++++++++ .../strategies/test_sharded_strategy.py | 14 ++++++++++++++ 5 files changed, 57 insertions(+), 2 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 915436e5a0bcf..04eddf2c735f4 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -70,6 +70,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed dtype inference during gradient norm computation ([#14051](https://github.com/Lightning-AI/lightning/pull/14051)) +- Fixed a bug that caused `ddp_find_unused_parameters` to be set `False`, whereas the intended default is `True` ([#14095](https://github.com/Lightning-AI/lightning/pull/14095)) + + ## [1.7.0] - 2022-08-02 ### Added diff --git a/src/pytorch_lightning/strategies/ddp_spawn.py b/src/pytorch_lightning/strategies/ddp_spawn.py index 21602e60a5754..de34320f54093 100644 --- a/src/pytorch_lightning/strategies/ddp_spawn.py +++ b/src/pytorch_lightning/strategies/ddp_spawn.py @@ -315,10 +315,20 @@ def post_training_step(self) -> None: def register_strategies(cls, strategy_registry: Dict) -> None: entries = ( ("ddp_spawn", "spawn"), - ("ddp_spawn_find_unused_parameters_false", "spawn"), ("ddp_fork", "fork"), - ("ddp_fork_find_unused_parameters_false", "fork"), ("ddp_notebook", "fork"), + ) + for name, start_method in entries: + strategy_registry.register( + name, + cls, + description=f"DDP strategy with `start_method` '{start_method}'", + start_method=start_method, + ) + + entries = ( + ("ddp_spawn_find_unused_parameters_false", "spawn"), + ("ddp_fork_find_unused_parameters_false", "fork"), ("ddp_notebook_find_unused_parameters_false", "fork"), ) for name, start_method in entries: diff --git a/tests/tests_pytorch/strategies/test_ddp.py b/tests/tests_pytorch/strategies/test_ddp.py index 4610f6153386b..1a2a0475e7ed6 100644 --- a/tests/tests_pytorch/strategies/test_ddp.py +++ b/tests/tests_pytorch/strategies/test_ddp.py @@ -194,3 +194,15 @@ def root_device(self): assert strategy._get_process_group_backend() == expected_process_group_backend else: assert strategy._get_process_group_backend() == expected_process_group_backend + + +@pytest.mark.parametrize( + 
"strategy_name,expected_ddp_kwargs", + [ + ("ddp", {}), + ("ddp_find_unused_parameters_false", {"find_unused_parameters": False}), + ], +) +def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs): + trainer = Trainer(strategy=strategy_name) + assert trainer.strategy._ddp_kwargs == expected_ddp_kwargs diff --git a/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py b/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py index 52427c2c8cc3a..f485060833320 100644 --- a/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py +++ b/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py @@ -178,3 +178,19 @@ def test_ddp_spawn_strategy_set_timeout(mock_init_process_group): mock_init_process_group.assert_called_with( process_group_backend, rank=global_rank, world_size=world_size, timeout=test_timedelta ) + + +@pytest.mark.parametrize( + "strategy_name,expected_ddp_kwargs", + [ + ("ddp_spawn", {}), + ("ddp_fork", {}), + ("ddp_notebook", {}), + ("ddp_spawn_find_unused_parameters_false", {"find_unused_parameters": False}), + ("ddp_fork_find_unused_parameters_false", {"find_unused_parameters": False}), + ("ddp_notebook_find_unused_parameters_false", {"find_unused_parameters": False}), + ], +) +def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs): + trainer = Trainer(strategy=strategy_name) + assert trainer.strategy._ddp_kwargs == expected_ddp_kwargs diff --git a/tests/tests_pytorch/strategies/test_sharded_strategy.py b/tests/tests_pytorch/strategies/test_sharded_strategy.py index a047a10df32e3..ad0673ed1a5fa 100644 --- a/tests/tests_pytorch/strategies/test_sharded_strategy.py +++ b/tests/tests_pytorch/strategies/test_sharded_strategy.py @@ -300,3 +300,17 @@ def test_block_backward_sync(): with strategy.block_backward_sync(): pass model.no_sync.assert_called_once() + + +@pytest.mark.parametrize( + "strategy_name,expected_ddp_kwargs", + [ + ("ddp_sharded", {}), + ("ddp_sharded_find_unused_parameters_false", {"find_unused_parameters": False}), + ("ddp_sharded_spawn", {}), + ("ddp_sharded_spawn_find_unused_parameters_false", {"find_unused_parameters": False}), + ], +) +def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs): + trainer = Trainer(strategy=strategy_name) + assert trainer.strategy._ddp_kwargs == expected_ddp_kwargs From d29a552b3c701ebc14d608347c1dbf55c3dfaa6a Mon Sep 17 00:00:00 2001 From: Robert S Lee Date: Tue, 9 Aug 2022 04:27:08 -0400 Subject: [PATCH 120/230] Fix import in doctest example (#14067) --- src/lightning_app/structures/dict.py | 2 +- src/lightning_app/structures/list.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lightning_app/structures/dict.py b/src/lightning_app/structures/dict.py index 93e2b161b2e7a..b414269b93eec 100644 --- a/src/lightning_app/structures/dict.py +++ b/src/lightning_app/structures/dict.py @@ -22,7 +22,7 @@ def __init__(self, **kwargs: T): .. doctest:: >>> from lightning_app import LightningFlow, LightningWork - >>> from lightning_app.core import Dict + >>> from lightning_app.structures import Dict >>> class CounterWork(LightningWork): ... def __init__(self): ... super().__init__() diff --git a/src/lightning_app/structures/list.py b/src/lightning_app/structures/list.py index f5a7c5c9913ad..cf691c98a8c38 100644 --- a/src/lightning_app/structures/list.py +++ b/src/lightning_app/structures/list.py @@ -24,7 +24,7 @@ def __init__(self, *items: T): .. 
doctest:: >>> from lightning_app import LightningFlow, LightningWork - >>> from lightning_app.core import List + >>> from lightning_app.structures import List >>> class CounterWork(LightningWork): ... def __init__(self): ... super().__init__() From c55fe7105b4d00735d22147612434ae9aebee4ab Mon Sep 17 00:00:00 2001 From: Anton Shevtsov <32237302+MrShevan@users.noreply.github.com> Date: Tue, 9 Aug 2022 16:40:30 +0300 Subject: [PATCH 121/230] Prefix seed_everything log messages with rank info (#14031) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Anton Shevtsov Co-authored-by: Rohit Gupta Co-authored-by: Carlos Mocholí --- src/pytorch_lightning/CHANGELOG.md | 2 +- src/pytorch_lightning/utilities/seed.py | 6 ++---- tests/tests_pytorch/utilities/test_seed.py | 18 ++++++++++++++++++ 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 04eddf2c735f4..4cea5685cac6f 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -8,7 +8,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added -- +- Added prefix to log message in `seed_everything` with rank info ([#13290](https://github.com/Lightning-AI/lightning/issues/13290)) - diff --git a/src/pytorch_lightning/utilities/seed.py b/src/pytorch_lightning/utilities/seed.py index 6648b5a56b2b1..8fce6a1debfcf 100644 --- a/src/pytorch_lightning/utilities/seed.py +++ b/src/pytorch_lightning/utilities/seed.py @@ -24,7 +24,7 @@ import numpy as np import torch -from pytorch_lightning.utilities.rank_zero import rank_zero_only, rank_zero_warn +from pytorch_lightning.utilities.rank_zero import _get_rank, rank_zero_only, rank_zero_warn log = logging.getLogger(__name__) @@ -66,9 +66,7 @@ def seed_everything(seed: Optional[int] = None, workers: bool = False) -> int: rank_zero_warn(f"{seed} is not in bounds, numpy accepts from {min_seed_value} to {max_seed_value}") seed = _select_seed_randomly(min_seed_value, max_seed_value) - # using `log.info` instead of `rank_zero_info`, - # so users can verify the seed is properly set in distributed training. 
- log.info(f"Global seed set to {seed}") + log.info(f"[rank: {_get_rank()}] Global seed set to {seed}") os.environ["PL_GLOBAL_SEED"] = str(seed) random.seed(seed) np.random.seed(seed) diff --git a/tests/tests_pytorch/utilities/test_seed.py b/tests/tests_pytorch/utilities/test_seed.py index 7f162bd605640..6908badf1a037 100644 --- a/tests/tests_pytorch/utilities/test_seed.py +++ b/tests/tests_pytorch/utilities/test_seed.py @@ -1,6 +1,8 @@ import os import random +from typing import Mapping from unittest import mock +from unittest.mock import MagicMock import numpy as np import pytest @@ -96,3 +98,19 @@ def test_isolate_rng(): with isolate_rng(): generated = [random.random() for _ in range(3)] assert random.random() == generated[0] + + +@mock.patch("pytorch_lightning.utilities.seed.log.info") +@pytest.mark.parametrize("env_vars", [{"RANK": "0"}, {"RANK": "1"}, {"RANK": "4"}]) +def test_seed_everything_log_info(log_mock: MagicMock, env_vars: Mapping[str, str]): + """Test that log message prefix with correct rank info.""" + with mock.patch.dict(os.environ, env_vars, clear=True): + from pytorch_lightning.utilities.rank_zero import _get_rank + + rank = _get_rank() + + seed_utils.seed_everything(123) + + expected_log = f"[rank: {rank}] Global seed set to 123" + + log_mock.assert_called_once_with(expected_log) From 9722127a741e9e108e49d9cffbc10a2842302c9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Tue, 9 Aug 2022 16:03:36 +0200 Subject: [PATCH 122/230] Add missing codeowners for app package (#13542) --- .github/CODEOWNERS | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 05f7e91104589..f83924b9566ce 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -45,7 +45,13 @@ /src/pytorch_lightning/tuner @SkafteNicki @borda @awaelchli /src/pytorch_lightning/utilities @borda @tchaton @carmocca -/src/lightning_app @tchaton @awaelchli @manskx @hhsecond +/src/lightning_app @tchaton @manskx +/src/lightning_app/cli/pl-app-template @awaelchli @tchaton @Borda +/src/lightning_app/core @tchaton @awaelchli @manskx +/src/lightning_app/core/queues.py @tchaton @hhsecond @manskx +/src/lightning_app/runners/cloud.py @tchaton @hhsecond +/src/lightning_app/testing @tchaton @manskx +/src/lightning_app/__about__.py @nohalon @edenlightning @lantiga # Examples /examples/app_* @tchaton @awaelchli @manskx @hhsecond From ac369f5570d0a492d08ac9c2ba6622e451d7e131 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Tue, 9 Aug 2022 21:25:23 +0530 Subject: [PATCH 123/230] Fix incorrect `precision="mixed"` being used with `DeepSpeedStrategy` and `IPUStrategy` (#14041) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- src/pytorch_lightning/CHANGELOG.md | 3 +++ .../plugins/precision/deepspeed.py | 2 +- src/pytorch_lightning/plugins/precision/ipu.py | 3 ++- src/pytorch_lightning/strategies/deepspeed.py | 2 +- src/pytorch_lightning/strategies/ipu.py | 2 +- src/pytorch_lightning/strategies/utils.py | 2 +- tests/tests_pytorch/accelerators/test_ipu.py | 2 +- .../strategies/test_deepspeed_strategy.py | 13 ++++++------- 8 files changed, 16 insertions(+), 13 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 4cea5685cac6f..dac5533a6cb17 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -67,6 +67,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed MPS device being unrecognized ([#13992](https://github.com/Lightning-AI/lightning/pull/13992)) +- Fixed incorrect `precision="mixed"` being used with `DeepSpeedStrategy` and `IPUStrategy` ([#14041](https://github.com/Lightning-AI/lightning/pull/14041)) + + - Fixed dtype inference during gradient norm computation ([#14051](https://github.com/Lightning-AI/lightning/pull/14051)) diff --git a/src/pytorch_lightning/plugins/precision/deepspeed.py b/src/pytorch_lightning/plugins/precision/deepspeed.py index 01d3017760b0e..456bba1e77823 100644 --- a/src/pytorch_lightning/plugins/precision/deepspeed.py +++ b/src/pytorch_lightning/plugins/precision/deepspeed.py @@ -60,7 +60,7 @@ def __init__(self, precision: Union[str, int], amp_type: str, amp_level: Optiona amp_level = amp_level or "O2" - supported_precision = (PrecisionType.HALF, PrecisionType.FLOAT, PrecisionType.BFLOAT, PrecisionType.MIXED) + supported_precision = (PrecisionType.HALF, PrecisionType.FLOAT, PrecisionType.BFLOAT) if precision not in supported_precision: raise ValueError( f"`Trainer(strategy='deepspeed', precision={precision!r})` is not supported." diff --git a/src/pytorch_lightning/plugins/precision/ipu.py b/src/pytorch_lightning/plugins/precision/ipu.py index 89f544575f63f..67e5e373e9f52 100644 --- a/src/pytorch_lightning/plugins/precision/ipu.py +++ b/src/pytorch_lightning/plugins/precision/ipu.py @@ -19,6 +19,7 @@ import pytorch_lightning as pl from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin from pytorch_lightning.utilities import GradClipAlgorithmType +from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.model_helpers import is_overridden from pytorch_lightning.utilities.warnings import WarningCache @@ -35,7 +36,7 @@ class IPUPrecisionPlugin(PrecisionPlugin): """ def __init__(self, precision: int) -> None: - supported_precision_values = (16, 32) + supported_precision_values = (PrecisionType.HALF, PrecisionType.FLOAT) if precision not in supported_precision_values: raise ValueError( f"`Trainer(accelerator='ipu', precision={precision!r})` is not supported." 
diff --git a/src/pytorch_lightning/strategies/deepspeed.py b/src/pytorch_lightning/strategies/deepspeed.py index 3c31aeb7a7657..8acbc80257bd1 100644 --- a/src/pytorch_lightning/strategies/deepspeed.py +++ b/src/pytorch_lightning/strategies/deepspeed.py @@ -696,7 +696,7 @@ def _auto_select_batch_size(self) -> int: def _format_precision_config(self) -> None: assert isinstance(self.config, dict) - if self.precision_plugin.precision in (PrecisionType.HALF, PrecisionType.MIXED): + if self.precision_plugin.precision == PrecisionType.HALF: if "fp16" not in self.config and self.precision_plugin.amp_type == AMPType.NATIVE: # FP16 is a DeepSpeed standalone AMP implementation rank_zero_info("Enabling DeepSpeed FP16.") diff --git a/src/pytorch_lightning/strategies/ipu.py b/src/pytorch_lightning/strategies/ipu.py index c40addd4244b2..4bedbfd6d70fc 100644 --- a/src/pytorch_lightning/strategies/ipu.py +++ b/src/pytorch_lightning/strategies/ipu.py @@ -58,7 +58,7 @@ def __init__( self.precision = precision def forward(self, *inputs: Any, **kwargs: Any) -> Any: - if self.precision in (PrecisionType.MIXED, PrecisionType.HALF): + if self.precision == PrecisionType.HALF: inputs = self._move_float_tensors_to_half(inputs) return super().forward(*inputs, **kwargs) diff --git a/src/pytorch_lightning/strategies/utils.py b/src/pytorch_lightning/strategies/utils.py index b71458bfc30d3..cdae7bf434eca 100644 --- a/src/pytorch_lightning/strategies/utils.py +++ b/src/pytorch_lightning/strategies/utils.py @@ -24,7 +24,7 @@ def on_colab_kaggle() -> bool: def _fp_to_half(tensor: torch.Tensor, precision: PrecisionType) -> torch.Tensor: if torch.is_floating_point(tensor): - if precision in (PrecisionType.MIXED, PrecisionType.HALF): + if precision == PrecisionType.HALF: return tensor.half() if precision == PrecisionType.BFLOAT: return tensor.bfloat16() diff --git a/tests/tests_pytorch/accelerators/test_ipu.py b/tests/tests_pytorch/accelerators/test_ipu.py index 33d59d9a835ca..db3b9d1f91952 100644 --- a/tests/tests_pytorch/accelerators/test_ipu.py +++ b/tests/tests_pytorch/accelerators/test_ipu.py @@ -185,7 +185,7 @@ def test_optimization(tmpdir): @RunIf(ipu=True) -def test_mixed_precision(tmpdir): +def test_half_precision(tmpdir): class TestCallback(Callback): def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[str] = None) -> None: assert trainer.strategy.model.precision == 16 diff --git a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py index 4f2cc14b6c62d..272b03a846688 100644 --- a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py +++ b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py @@ -171,12 +171,11 @@ def test_deepspeed_strategy_env(tmpdir, monkeypatch, deepspeed_config): @RunIf(deepspeed=True) @mock.patch("pytorch_lightning.utilities.device_parser.num_cuda_devices", return_value=1) -@pytest.mark.parametrize("precision", [16, "mixed"]) @pytest.mark.parametrize( "amp_backend", ["native", pytest.param("apex", marks=RunIf(amp_apex=True))], ) -def test_deepspeed_precision_choice(_, amp_backend, precision, tmpdir): +def test_deepspeed_precision_choice(_, amp_backend, tmpdir): """Test to ensure precision plugin is also correctly chosen. 
DeepSpeed handles precision via Custom DeepSpeedPrecisionPlugin @@ -188,16 +187,16 @@ def test_deepspeed_precision_choice(_, amp_backend, precision, tmpdir): accelerator="gpu", strategy="deepspeed", amp_backend=amp_backend, - precision=precision, + precision=16, ) assert isinstance(trainer.strategy, DeepSpeedStrategy) assert isinstance(trainer.strategy.precision_plugin, DeepSpeedPrecisionPlugin) - assert trainer.strategy.precision_plugin.precision == precision + assert trainer.strategy.precision_plugin.precision == 16 @RunIf(deepspeed=True) -def test_deepspeed_with_invalid_config_path(tmpdir): +def test_deepspeed_with_invalid_config_path(): """Test to ensure if we pass an invalid config path we throw an exception.""" with pytest.raises( @@ -218,7 +217,7 @@ def test_deepspeed_with_env_path(tmpdir, monkeypatch, deepspeed_config): @RunIf(deepspeed=True) -def test_deepspeed_defaults(tmpdir): +def test_deepspeed_defaults(): """Ensure that defaults are correctly set as a config for DeepSpeed if no arguments are passed.""" strategy = DeepSpeedStrategy() assert strategy.config is not None @@ -663,7 +662,7 @@ def training_step(self, batch, batch_idx): @RunIf(min_cuda_gpus=2, standalone=True, deepspeed=True) -def test_deepspeed_multigpu_stage_3(tmpdir, deepspeed_config): +def test_deepspeed_multigpu_stage_3(tmpdir): """Test to ensure ZeRO Stage 3 works with a parallel model.""" model = ModelParallelBoringModel() trainer = Trainer( From 56abd60f048f43a7abd036380a6b5297baaa3854 Mon Sep 17 00:00:00 2001 From: Gautier Dagan Date: Tue, 9 Aug 2022 17:32:18 +0100 Subject: [PATCH 124/230] Fix assert wandb Run when mode="disabled" (#14112) --- src/pytorch_lightning/loggers/wandb.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/pytorch_lightning/loggers/wandb.py b/src/pytorch_lightning/loggers/wandb.py index 8e30827759b99..530fb58fabe5e 100644 --- a/src/pytorch_lightning/loggers/wandb.py +++ b/src/pytorch_lightning/loggers/wandb.py @@ -328,7 +328,7 @@ def __getstate__(self) -> Dict[str, Any]: @property # type: ignore[misc] @rank_zero_experiment - def experiment(self) -> Run: + def experiment(self) -> Union[Run, RunDisabled]: r""" Actual wandb object. 
To use wandb features in your @@ -361,11 +361,13 @@ def experiment(self) -> Run: self._experiment = wandb.init(**self._wandb_init) # define default x-axis - if isinstance(self._experiment, Run) and getattr(self._experiment, "define_metric", None): + if isinstance(self._experiment, (Run, RunDisabled)) and getattr( + self._experiment, "define_metric", None + ): self._experiment.define_metric("trainer/global_step") self._experiment.define_metric("*", step_metric="trainer/global_step", step_sync=True) - assert isinstance(self._experiment, Run) + assert isinstance(self._experiment, (Run, RunDisabled)) return self._experiment def watch(self, model: nn.Module, log: str = "gradients", log_freq: int = 100, log_graph: bool = True) -> None: From d85085479d9248d2f6a06821a0cb41c4a2eb02fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Tue, 9 Aug 2022 19:31:11 +0200 Subject: [PATCH 125/230] Reset all results on epoch end (#14061) --- src/pytorch_lightning/CHANGELOG.md | 3 ++ .../logger_connector/logger_connector.py | 3 +- .../logging_/test_train_loop_logging.py | 29 +++++++++++++++++-- 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index dac5533a6cb17..5dfd871f933f5 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -61,6 +61,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed the `NeptuneLogger` dependency being unrecognized ([#13988](https://github.com/Lightning-AI/lightning/pull/13988)) +- Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061)) + + - Fixed an issue where users would be warned about unset `max_epochs` even when `fast_dev_run` was set ([#13262](https://github.com/Lightning-AI/lightning/pull/13262)) diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py b/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py index ff882912625d0..02e17a8d93494 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/logger_connector.py @@ -163,8 +163,7 @@ def update_train_epoch_metrics(self) -> None: self.log_metrics(self.metrics["log"]) # reset result collection for next epoch - assert self.trainer._results is not None - self.trainer._results.reset(metrics=True) + self.reset_results() """ Utilities and properties diff --git a/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py b/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py index 5855eba4c86af..d16be306b9365 100644 --- a/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py +++ b/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py @@ -569,11 +569,12 @@ def on_train_epoch_end(self, trainer, pl_module): "accelerator", [ pytest.param("gpu", marks=RunIf(min_cuda_gpus=1)), + "cpu", ], ) def test_metric_are_properly_reduced(tmpdir, accelerator): class TestingModel(BoringModel): - def __init__(self, *args, **kwargs) -> None: + def __init__(self) -> None: super().__init__() self.val_acc = Accuracy() @@ -592,7 +593,6 @@ def validation_step(self, batch, batch_idx): return super().validation_step(batch, batch_idx) early_stop = EarlyStopping(monitor="val_acc", mode="max") - checkpoint = ModelCheckpoint(monitor="val_acc", save_last=True, save_top_k=2, mode="max") model = TestingModel() @@ 
-812,3 +812,28 @@ def training_step(self, batch, batch_idx): call(metrics={"foo_epoch": 0.0, "epoch": 1}, step=3), ] ) + + +@mock.patch("pytorch_lightning.loggers.TensorBoardLogger.log_metrics") +def test_log_on_train_start(mock_log_metrics, tmpdir): + """Tests that logged metrics on_train_start get reset after the first epoch.""" + + class MyModel(BoringModel): + def on_train_start(self): + self.log("foo", 123) + + model = MyModel() + trainer = Trainer( + default_root_dir=tmpdir, + limit_train_batches=1, + limit_val_batches=0, + max_epochs=2, + log_every_n_steps=1, + enable_model_summary=False, + enable_checkpointing=False, + enable_progress_bar=False, + ) + trainer.fit(model) + + assert mock_log_metrics.mock_calls == [call(metrics={"foo": 123.0, "epoch": 0}, step=0)] + assert trainer.max_epochs > 1 From 619c2ff05827872973b2eed18d06651f7cd8bd4e Mon Sep 17 00:00:00 2001 From: Raphael Randschau Date: Tue, 9 Aug 2022 12:17:57 -0700 Subject: [PATCH 126/230] [CLI] fix cluster creation CLI requiring instance-type selection (#14056) fix cluster creation CLI requiring instace-type selection we've marked `instance_types` as `required=False`, but the CLI calls `split` on the value. So if nothing is provided, we'll actually receive a runtime error, effectively rendering the flag as required. Co-authored-by: thomas chaton --- src/lightning_app/cli/lightning_cli_create.py | 2 +- tests/tests_app/cli/test_cli.py | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/lightning_app/cli/lightning_cli_create.py b/src/lightning_app/cli/lightning_cli_create.py index 7e45fe7e7c078..d400db4b6f337 100644 --- a/src/lightning_app/cli/lightning_cli_create.py +++ b/src/lightning_app/cli/lightning_cli_create.py @@ -79,7 +79,7 @@ def create_cluster( region=region, role_arn=role_arn, external_id=external_id, - instance_types=instance_types.split(","), + instance_types=instance_types.split(",") if instance_types is not None else None, edit_before_creation=edit_before_creation, cost_savings=cost_savings, wait=wait, diff --git a/tests/tests_app/cli/test_cli.py b/tests/tests_app/cli/test_cli.py index 16e641ac38f23..8cc5dd50f836e 100644 --- a/tests/tests_app/cli/test_cli.py +++ b/tests/tests_app/cli/test_cli.py @@ -70,7 +70,15 @@ def test_main_lightning_cli_help(): @mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) @mock.patch("lightning_app.cli.cmd_clusters.AWSClusterManager.create") -def test_create_cluster(create: mock.MagicMock): +@pytest.mark.parametrize( + "instance_types,expected_instance_types", + [ + (["--instance-types", "t3.xlarge"], ["t3.xlarge"]), + (["--instance-types", "t3.xlarge,t3.2xlarge"], ["t3.xlarge", "t3.2xlarge"]), + ([], None), + ], +) +def test_create_cluster(create_command: mock.MagicMock, instance_types, expected_instance_types): runner = CliRunner() runner.invoke( create_cluster, @@ -82,17 +90,16 @@ def test_create_cluster(create: mock.MagicMock): "dummy", "--role-arn", "arn:aws:iam::1234567890:role/lai-byoc", - "--instance-types", - "t2.small", - ], + ] + + instance_types, ) - create.assert_called_once_with( + create_command.assert_called_once_with( cluster_name="test-7", region="us-east-1", role_arn="arn:aws:iam::1234567890:role/lai-byoc", external_id="dummy", - instance_types=["t2.small"], + instance_types=expected_instance_types, edit_before_creation=False, cost_savings=False, wait=False, From 06c255c5c1889e6ecc640dc82a24193ce388511a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 10 Aug 2022 00:54:10 +0200 
Subject: [PATCH 127/230] Skip ddp fork tests on windows (#14121) --- .../strategies/test_ddp_spawn_strategy.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py b/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py index f485060833320..7fb22206c45c6 100644 --- a/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py +++ b/tests/tests_pytorch/strategies/test_ddp_spawn_strategy.py @@ -184,11 +184,17 @@ def test_ddp_spawn_strategy_set_timeout(mock_init_process_group): "strategy_name,expected_ddp_kwargs", [ ("ddp_spawn", {}), - ("ddp_fork", {}), - ("ddp_notebook", {}), + pytest.param("ddp_fork", {}, marks=RunIf(skip_windows=True)), + pytest.param("ddp_notebook", {}, marks=RunIf(skip_windows=True)), ("ddp_spawn_find_unused_parameters_false", {"find_unused_parameters": False}), - ("ddp_fork_find_unused_parameters_false", {"find_unused_parameters": False}), - ("ddp_notebook_find_unused_parameters_false", {"find_unused_parameters": False}), + pytest.param( + "ddp_fork_find_unused_parameters_false", {"find_unused_parameters": False}, marks=RunIf(skip_windows=True) + ), + pytest.param( + "ddp_notebook_find_unused_parameters_false", + {"find_unused_parameters": False}, + marks=RunIf(skip_windows=True), + ), ], ) def test_ddp_kwargs_from_registry(strategy_name, expected_ddp_kwargs): From 975a4fc2f1daf5a1662a0d1f47212e7dcdae8b2b Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Tue, 9 Aug 2022 16:18:21 -0700 Subject: [PATCH 128/230] Support checkpoint save and load with Stochastic Weight Averaging (#9938) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: thomas chaton Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Adrian Wälchli Co-authored-by: Carlos Mocholi Co-authored-by: Kushashwa Ravi Shrimali Co-authored-by: Jirka Co-authored-by: Rohit Gupta --- src/pytorch_lightning/CHANGELOG.md | 3 + .../callbacks/stochastic_weight_avg.py | 78 ++++++++++- .../callbacks/test_stochastic_weight_avg.py | 128 +++++++++++++++++- 3 files changed, 195 insertions(+), 14 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 5dfd871f933f5..8852367a116f6 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -73,6 +73,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed incorrect `precision="mixed"` being used with `DeepSpeedStrategy` and `IPUStrategy` ([#14041](https://github.com/Lightning-AI/lightning/pull/14041)) +- Fixed resuming from a checkpoint when using Stochastic Weight Averaging (SWA) ([#9938](https://github.com/Lightning-AI/lightning/pull/9938)) + + - Fixed dtype inference during gradient norm computation ([#14051](https://github.com/Lightning-AI/lightning/pull/14051)) diff --git a/src/pytorch_lightning/callbacks/stochastic_weight_avg.py b/src/pytorch_lightning/callbacks/stochastic_weight_avg.py index 20a3dcc3f0f26..6650bb3f0c479 100644 --- a/src/pytorch_lightning/callbacks/stochastic_weight_avg.py +++ b/src/pytorch_lightning/callbacks/stochastic_weight_avg.py @@ -16,7 +16,7 @@ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ """ from copy import deepcopy -from typing import Any, Callable, cast, List, Optional, Union +from typing import Any, Callable, cast, Dict, List, Optional, Union import torch from torch import nn, Tensor @@ -24,6 +24,7 @@ import pytorch_lightning as pl from pytorch_lightning.callbacks.callback import Callback +from pytorch_lightning.strategies import DDPFullyShardedStrategy, DeepSpeedStrategy from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.rank_zero import rank_zero_info, rank_zero_warn from pytorch_lightning.utilities.types import _LRScheduler, LRSchedulerConfig @@ -112,15 +113,22 @@ def __init__( if device is not None and not isinstance(device, (torch.device, str)): raise MisconfigurationException(f"device is expected to be a torch.device or a str. Found {device}") + self.n_averaged: Optional[torch.Tensor] = None self._swa_epoch_start = swa_epoch_start self._swa_lrs = swa_lrs self._annealing_epochs = annealing_epochs self._annealing_strategy = annealing_strategy self._avg_fn = avg_fn or self.avg_fn self._device = device - self._max_epochs: int - self._model_contains_batch_norm: bool + self._model_contains_batch_norm: Optional[bool] = None self._average_model: "pl.LightningModule" + self._initialized = False + self._swa_scheduler: Optional[_LRScheduler] = None + self._scheduler_state: Optional[Dict] = None + self._init_n_averaged = 0 + self._latest_update_epoch = -1 + self.momenta: Optional[Dict[nn.modules.batchnorm._BatchNorm, float]] = None + self._max_epochs: int @property def swa_start(self) -> int: @@ -147,6 +155,9 @@ def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") - if len(trainer.lr_scheduler_configs) > 1: raise MisconfigurationException("SWA currently not supported for more than 1 `lr_scheduler`.") + if isinstance(trainer.strategy, (DDPFullyShardedStrategy, DeepSpeedStrategy)): + raise MisconfigurationException("SWA does not currently support sharded models.") + if isinstance(self._swa_epoch_start, float): self._swa_epoch_start = int(trainer.max_epochs * self._swa_epoch_start) @@ -158,8 +169,13 @@ def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") - assert trainer.fit_loop.max_epochs is not None trainer.fit_loop.max_epochs += 1 + if self._scheduler_state is not None: + self._clear_schedulers(trainer) + def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: - if trainer.current_epoch == self.swa_start: + if (not self._initialized) and (self.swa_start <= trainer.current_epoch <= self.swa_end): + self._initialized = True + # move average model to request device. 
self._average_model = self._average_model.to(self._device or pl_module.device) @@ -180,6 +196,17 @@ def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningMo last_epoch=trainer.max_epochs if self._annealing_strategy == "cos" else -1, ), ) + if self._scheduler_state is not None: + # Restore scheduler state from checkpoint + self._swa_scheduler.load_state_dict(self._scheduler_state) + elif trainer.current_epoch != self.swa_start: + # Log a warning if we're initializing after start without any checkpoint data, + # as behaviour will be different compared to having checkpoint data. + rank_zero_warn( + "SWA is initializing after swa_start without any checkpoint data. " + "This may be caused by loading a checkpoint from an older version of PyTorch Lightning." + ) + # We assert that there is only one optimizer on fit start, so know opt_idx is always 0 default_scheduler_cfg = LRSchedulerConfig(self._swa_scheduler, opt_idx=0) assert default_scheduler_cfg.interval == "epoch" and default_scheduler_cfg.frequency == 1 @@ -196,14 +223,18 @@ def on_train_epoch_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningMo else: trainer.lr_scheduler_configs.append(default_scheduler_cfg) - self.n_averaged = torch.tensor(0, dtype=torch.long, device=pl_module.device) + if self.n_averaged is None: + self.n_averaged = torch.tensor(self._init_n_averaged, dtype=torch.long, device=pl_module.device) - if self.swa_start <= trainer.current_epoch <= self.swa_end: + if (self.swa_start <= trainer.current_epoch <= self.swa_end) and ( + trainer.current_epoch > self._latest_update_epoch + ): + assert self.n_averaged is not None self.update_parameters(self._average_model, pl_module, self.n_averaged, self._avg_fn) + self._latest_update_epoch = trainer.current_epoch # Note: No > here in case the callback is saved with the model and training continues if trainer.current_epoch == self.swa_end + 1: - # Transfer weights from average model to pl_module self.transfer_weights(self._average_model, pl_module) @@ -265,6 +296,7 @@ def reset_batch_norm_and_save_state(self, pl_module: "pl.LightningModule") -> No def reset_momenta(self) -> None: """Adapted from https://github.com/pytorch/pytorch/blob/v1.7.1/torch/optim/swa_utils.py#L164-L165.""" + assert self.momenta is not None for bn_module in self.momenta: bn_module.momentum = self.momenta[bn_module] @@ -285,3 +317,35 @@ def update_parameters( def avg_fn(averaged_model_parameter: Tensor, model_parameter: Tensor, num_averaged: Tensor) -> Tensor: """Adapted from https://github.com/pytorch/pytorch/blob/v1.7.1/torch/optim/swa_utils.py#L95-L97.""" return averaged_model_parameter + (model_parameter - averaged_model_parameter) / (num_averaged + 1) + + def state_dict(self) -> Dict[str, Any]: + return { + "n_averaged": 0 if self.n_averaged is None else self.n_averaged.item(), + "latest_update_epoch": self._latest_update_epoch, + "scheduler_state": None if self._swa_scheduler is None else self._swa_scheduler.state_dict(), + "average_model_state": None if self._average_model is None else self._average_model.state_dict(), + } + + def load_state_dict(self, state_dict: Dict[str, Any]) -> None: + self._init_n_averaged = state_dict["n_averaged"] + self._latest_update_epoch = state_dict["latest_update_epoch"] + self._scheduler_state = state_dict["scheduler_state"] + self._load_average_model_state(state_dict["average_model_state"]) + + @staticmethod + def _clear_schedulers(trainer: "pl.Trainer") -> None: + # If we have scheduler state saved, clear the scheduler configs so that we 
don't try to + # load state into the wrong type of schedulers when restoring scheduler checkpoint state. + # We'll configure the scheduler and re-load its state in on_train_epoch_start. + # Note that this relies on the callback state being restored before the scheduler state is + # restored, and doesn't work if restore_checkpoint_after_setup is True, but at the time of + # writing that is only True for deepspeed which is already not supported by SWA. + # See https://github.com/PyTorchLightning/pytorch-lightning/issues/11665 for background. + if trainer.lr_scheduler_configs: + assert len(trainer.lr_scheduler_configs) == 1 + trainer.lr_scheduler_configs.clear() + + def _load_average_model_state(self, model_state: Any) -> None: + if self._average_model is None: + return + self._average_model.load_state_dict(model_state) diff --git a/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py b/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py index 859cf2fa98c0c..65a0fea2fb4a5 100644 --- a/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py +++ b/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py @@ -12,11 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. import logging +import os +from pathlib import Path +from typing import ContextManager, Optional from unittest import mock import pytest import torch from torch import nn +from torch.optim.lr_scheduler import LambdaLR from torch.optim.swa_utils import SWALR from torch.utils.data import DataLoader @@ -30,7 +34,9 @@ class SwaTestModel(BoringModel): - def __init__(self, batchnorm: bool = True, interval: str = "epoch", iterable_dataset: bool = False): + def __init__( + self, batchnorm: bool = True, interval: str = "epoch", iterable_dataset: bool = False, crash_on_epoch=None + ): super().__init__() layers = [nn.Linear(32, 32)] if batchnorm: @@ -39,17 +45,18 @@ def __init__(self, batchnorm: bool = True, interval: str = "epoch", iterable_dat self.layer = nn.Sequential(*layers) self.interval = interval self.iterable_dataset = iterable_dataset + self.crash_on_epoch = crash_on_epoch def training_step(self, batch, batch_idx): + if self.crash_on_epoch and self.trainer.current_epoch >= self.crash_on_epoch: + raise Exception("SWA crash test") output = self.forward(batch) loss = self.loss(batch, output) return {"loss": loss} def train_dataloader(self): - dset_cls = RandomIterableDataset if self.iterable_dataset else RandomDataset dset = dset_cls(32, 64) - return DataLoader(dset, batch_size=2) def configure_optimizers(self): @@ -66,6 +73,8 @@ def configure_optimizers(self): class SwaTestCallback(StochasticWeightAveraging): update_parameters_calls: int = 0 transfer_weights_calls: int = 0 + # Record the first epoch, as if we are resuming from a checkpoint this may not be equal to 0 + first_epoch: Optional[int] = None def update_parameters(self, *args, **kwargs): self.update_parameters_calls += 1 @@ -77,6 +86,11 @@ def transfer_weights(self, *args, **kwargs): def on_train_epoch_start(self, trainer, *args): super().on_train_epoch_start(trainer, *args) + if self.first_epoch is None and not trainer.fit_loop.restarting: + # since the checkpoint loaded was saved `on_train_epoch_end`, the first `FitLoop` iteration will + # not update the model and just call the epoch-level hooks, for that reason, we check that we are not + # restarting before choosing the first epoch + self.first_epoch = trainer.current_epoch assert trainer.fit_loop._skip_backward == (trainer.current_epoch > 
self.swa_end) if self.swa_start <= trainer.current_epoch: assert isinstance(trainer.lr_scheduler_configs[0].scheduler, SWALR) @@ -88,6 +102,7 @@ def on_train_epoch_end(self, trainer, *args): if self.swa_start <= trainer.current_epoch <= self.swa_end: swa_epoch = trainer.current_epoch - self.swa_start assert self.n_averaged == swa_epoch + 1 + assert self._swa_scheduler is not None # Scheduler is stepped once on initialization and then at the end of each epoch assert self._swa_scheduler._step_count == swa_epoch + 2 elif trainer.current_epoch > self.swa_end: @@ -103,10 +118,13 @@ def on_train_end(self, trainer, pl_module): if not isinstance(trainer.strategy, DDPSpawnStrategy): # check backward call count. the batchnorm update epoch should not backward - assert trainer.strategy.backward.call_count == trainer.max_epochs * trainer.limit_train_batches + assert trainer.strategy.backward.call_count == ( + (trainer.max_epochs - self.first_epoch) * trainer.limit_train_batches + ) # check call counts - assert self.update_parameters_calls == trainer.max_epochs - (self._swa_epoch_start - 1) + first_swa_epoch = max(self.first_epoch, self.swa_start) + assert self.update_parameters_calls == trainer.max_epochs - first_swa_epoch assert self.transfer_weights_calls == 1 @@ -140,7 +158,7 @@ def train_with_swa( devices=devices, ) - with mock.patch.object(Strategy, "backward", wraps=trainer.strategy.backward): + with _backward_patch(trainer): trainer.fit(model) # check the model is the expected @@ -226,9 +244,10 @@ def test_swa_multiple_lrs(tmpdir): class TestModel(BoringModel): def __init__(self): - super(BoringModel, self).__init__() + super().__init__() self.layer1 = torch.nn.Linear(32, 32) self.layer2 = torch.nn.Linear(32, 2) + self.on_train_epoch_start_called = False def forward(self, x): x = self.layer1(x) @@ -255,3 +274,98 @@ def on_train_epoch_start(self): ) trainer.fit(model) assert model.on_train_epoch_start_called + + +def _swa_resume_training_from_checkpoint(tmpdir, model, resume_model, ddp=False): + swa_start = 3 + trainer_kwargs = { + "default_root_dir": tmpdir, + "max_epochs": 5, + "accelerator": "cpu", + "strategy": "ddp_spawn_find_unused_parameters_false" if ddp else None, + "devices": 2 if ddp else 1, + "limit_train_batches": 5, + "limit_val_batches": 0, + "accumulate_grad_batches": 2, + "enable_progress_bar": False, + } + trainer = Trainer(callbacks=SwaTestCallback(swa_epoch_start=swa_start, swa_lrs=0.1), **trainer_kwargs) + + with _backward_patch(trainer), pytest.raises(Exception, match="SWA crash test"): + trainer.fit(model) + + checkpoint_dir = Path(tmpdir) / "lightning_logs" / "version_0" / "checkpoints" + checkpoint_files = os.listdir(checkpoint_dir) + assert len(checkpoint_files) == 1 + ckpt_path = str(checkpoint_dir / checkpoint_files[0]) + + trainer = Trainer(callbacks=SwaTestCallback(swa_epoch_start=swa_start, swa_lrs=0.1), **trainer_kwargs) + + with _backward_patch(trainer): + trainer.fit(resume_model, ckpt_path=ckpt_path) + + +class CustomSchedulerModel(SwaTestModel): + def configure_optimizers(self): + optimizer = torch.optim.SGD(self.layer.parameters(), lr=0.1) + + def lr_lambda(current_step: int): + return 0.1 + + scheduler = LambdaLR(optimizer, lr_lambda, -1) + return { + "optimizer": optimizer, + "lr_scheduler": { + "scheduler": scheduler, + "interval": self.interval, + }, + } + + +@pytest.mark.parametrize("crash_on_epoch", [1, 3]) +def test_swa_resume_training_from_checkpoint(tmpdir, crash_on_epoch): + model = SwaTestModel(crash_on_epoch=crash_on_epoch) + resume_model = 
SwaTestModel() + _swa_resume_training_from_checkpoint(tmpdir, model, resume_model) + + +@pytest.mark.parametrize("crash_on_epoch", [1, 3]) +def test_swa_resume_training_from_checkpoint_custom_scheduler(tmpdir, crash_on_epoch): + # Reproduces the bug reported in https://github.com/PyTorchLightning/pytorch-lightning/issues/11665 + model = CustomSchedulerModel(crash_on_epoch=crash_on_epoch) + resume_model = CustomSchedulerModel() + _swa_resume_training_from_checkpoint(tmpdir, model, resume_model) + + +@RunIf(skip_windows=True) +def test_swa_resume_training_from_checkpoint_ddp(tmpdir): + model = SwaTestModel(crash_on_epoch=3) + resume_model = SwaTestModel() + _swa_resume_training_from_checkpoint(tmpdir, model, resume_model, ddp=True) + + +@pytest.mark.parametrize( + "strategy", + [ + pytest.param("fsdp", marks=RunIf(fairscale_fully_sharded=True, min_cuda_gpus=1)), + pytest.param("deepspeed", marks=RunIf(deepspeed=True, min_cuda_gpus=1)), + ], +) +def test_misconfiguration_error_with_sharded_model(tmpdir, strategy: str): + model = SwaTestModel() + swa_callback = SwaTestCallback(swa_epoch_start=2, swa_lrs=0.1) + trainer = Trainer( + default_root_dir=tmpdir, + enable_progress_bar=False, + max_epochs=5, + callbacks=[swa_callback], + strategy=strategy, + accelerator="gpu", + devices=1, + ) + with pytest.raises(MisconfigurationException, match="SWA does not currently support sharded models"): + trainer.fit(model) + + +def _backward_patch(trainer: Trainer) -> ContextManager: + return mock.patch.object(Strategy, "backward", wraps=trainer.strategy.backward) From 8fa9e8651d044e6122a4380f570fd2451a665f8c Mon Sep 17 00:00:00 2001 From: Luca Medeiros <67411094+luca-medeiros@users.noreply.github.com> Date: Wed, 10 Aug 2022 09:38:18 +0900 Subject: [PATCH 129/230] Update collect env details and issue template (#14017) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- .github/ISSUE_TEMPLATE/bug_report.md | 8 ++++++ requirements/collect_env_details.py | 41 ++++++++++------------------ 2 files changed, 22 insertions(+), 27 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index f08865180ba1d..de4eacde1f39e 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -41,8 +41,16 @@ You can get the script and run it with: ```bash wget https://raw.githubusercontent.com/Lightning-AI/lightning/master/requirements/collect_env_details.py python collect_env_details.py + ``` + +
+<details>
+  <summary>Details</summary>
+    Paste the output here and move this toggle outside of the comment block.
+</details>
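Stepping back to the `StochasticWeightAveraging` patch earlier in this series: a minimal usage sketch of what the new `state_dict`/`load_state_dict` support enables, namely resuming a run that uses SWA from a checkpoint. The checkpoint path, the `swa_epoch_start`/`swa_lrs` values and the demo `BoringModel` are illustrative assumptions, not part of the patch.

```python
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import StochasticWeightAveraging
from pytorch_lightning.demos.boring_classes import BoringModel

model = BoringModel()
# Illustrative values; any SWA configuration matching the original run works.
swa = StochasticWeightAveraging(swa_epoch_start=3, swa_lrs=0.1)
trainer = Trainer(max_epochs=5, callbacks=[swa])

# With the callback's new state_dict/load_state_dict, n_averaged, the SWA scheduler
# state and the averaged model weights are restored from the checkpoint on resume.
trainer.fit(model, ckpt_path="path/to/last.ckpt")  # hypothetical checkpoint path
```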
+ + You can also fill out the list below manually. --> diff --git a/requirements/collect_env_details.py b/requirements/collect_env_details.py index 1d65753a55553..b0c47efc43859 100644 --- a/requirements/collect_env_details.py +++ b/requirements/collect_env_details.py @@ -20,27 +20,17 @@ import platform import sys -import numpy +import pkg_resources import torch -import tqdm sys.path += [os.path.abspath(".."), os.path.abspath("")] -import pytorch_lightning # noqa: E402 -try: - import lightning -except ModuleNotFoundError: - pass -try: - import lightning_app -except ModuleNotFoundError: - pass LEVEL_OFFSET = "\t" KEY_PADDING = 20 -def info_system(): +def info_system() -> dict: return { "OS": platform.system(), "architecture": platform.architecture(), @@ -50,28 +40,24 @@ def info_system(): } -def info_cuda(): +def info_cuda() -> dict: return { - "GPU": [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())], - # 'nvidia_driver': get_nvidia_driver_version(run_lambda), + "GPU": [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())] or None, "available": torch.cuda.is_available(), "version": torch.version.cuda, } -def info_packages(): - return { - "numpy": numpy.__version__, - "pyTorch_version": torch.__version__, - "pyTorch_debug": torch.version.debug, - "pytorch-lightning": pytorch_lightning.__version__, - "lightning": lightning.__version__ if "lightning" in sys.modules else None, - "lightning_app": lightning_app.__version__ if "lightning_app" in sys.modules else None, - "tqdm": tqdm.__version__, - } +def info_packages() -> dict: + """Get name and version of all installed packages.""" + packages = {} + for dist in pkg_resources.working_set: + package = dist.as_requirement() + packages[package.key] = package.specs[0][1] + return packages -def nice_print(details, level=0): +def nice_print(details: dict, level: int = 0) -> list: lines = [] for k in sorted(details): key = f"* {k}:" if level == 0 else f"- {k}:" @@ -88,8 +74,9 @@ def nice_print(details, level=0): return lines -def main(): +def main() -> None: details = {"System": info_system(), "CUDA": info_cuda(), "Packages": info_packages()} + details["Lightning"] = {k: v for k, v in details["Packages"].items() if "torch" in k or "lightning" in k} lines = nice_print(details) text = os.linesep.join(lines) print(text) From d211d46e1db3ca4c4c938cafbfe51704f51b8ab4 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 10 Aug 2022 11:35:41 +0900 Subject: [PATCH 130/230] CI: Replace `_` of in GHA workflow filenames with `-` (#13917) * Rename workflow files * Update docs * Fix azure badges * Update the main readme * bad rebase * Update doc --- .actions/setup_tools.py | 5 ++- .github/workflows/README.md | 34 +++++++++---------- ...e2e_test.yml => ci-app-cloud-e2e-test.yml} | 0 ...i-app_examples.yml => ci-app-examples.yml} | 0 .../{ci-app_tests.yml => ci-app-tests.yml} | 0 ...{ci_pkg-install.yml => ci-pkg-install.yml} | 0 ...pr-gatekeeper.yml => ci-pr-gatekeeper.yml} | 0 ...st-conda.yml => ci-pytorch-test-conda.yml} | 0 ...test-full.yml => ci-pytorch-test-full.yml} | 0 ...test-slow.yml => ci-pytorch-test-slow.yml} | 0 .../{ci_schema.yml => ci-schema.yml} | 0 ...h_dockers.yml => cicd-pytorch-dockers.yml} | 0 README.md | 25 ++++++++------ src/pytorch_lightning/README.md | 26 +++++++------- 14 files changed, 46 insertions(+), 44 deletions(-) rename .github/workflows/{ci-app_cloud_e2e_test.yml => ci-app-cloud-e2e-test.yml} (100%) rename .github/workflows/{ci-app_examples.yml => ci-app-examples.yml} (100%) rename 
.github/workflows/{ci-app_tests.yml => ci-app-tests.yml} (100%) rename .github/workflows/{ci_pkg-install.yml => ci-pkg-install.yml} (100%) rename .github/workflows/{ci_pr-gatekeeper.yml => ci-pr-gatekeeper.yml} (100%) rename .github/workflows/{ci-pytorch_test-conda.yml => ci-pytorch-test-conda.yml} (100%) rename .github/workflows/{ci-pytorch_test-full.yml => ci-pytorch-test-full.yml} (100%) rename .github/workflows/{ci-pytorch_test-slow.yml => ci-pytorch-test-slow.yml} (100%) rename .github/workflows/{ci_schema.yml => ci-schema.yml} (100%) rename .github/workflows/{cicd-pytorch_dockers.yml => cicd-pytorch-dockers.yml} (100%) diff --git a/.actions/setup_tools.py b/.actions/setup_tools.py index 5088be2020738..a76e81246798c 100644 --- a/.actions/setup_tools.py +++ b/.actions/setup_tools.py @@ -94,11 +94,10 @@ def load_readme_description(path_dir: str, homepage: str, version: str) -> str: text = text.replace("pytorch-lightning.readthedocs.io/en/stable/", f"pytorch-lightning.readthedocs.io/en/{version}") # codecov badge text = text.replace("/branch/master/graph/badge.svg", f"/release/{version}/graph/badge.svg") - # replace github badges for release ones + # github actions badge text = text.replace("badge.svg?branch=master&event=push", f"badge.svg?tag={version}") - # Azure... + # azure pipelines badge text = text.replace("?branchName=master", f"?branchName=refs%2Ftags%2F{version}") - text = re.sub(r"\?definitionId=\d+&branchName=master", f"?definitionId=2&branchName=refs%2Ftags%2F{version}", text) skip_begin = r"" skip_end = r"" diff --git a/.github/workflows/README.md b/.github/workflows/README.md index f559551e1237f..4ed903c0f3a93 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -4,16 +4,16 @@ ## Unit and Integration Testing -| workflow name | workflow file | action | accelerator\* | (Python, PyTorch) | OS | -| -------------------------- | ----------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | ------------------------------------------------ | ------------------- | -| Test full | .github/workflows/ci_test-full.yml | Run all tests except for accelerator-specific, standalone and slow tests. | CPU | (3.7, 1.9), (3.7, 1.12), (3.10, 1.12) | linux, mac, windows | -| Test with Conda | .github/workflows/ci_test-conda.yml | Same as ci_test-full.yml but with dependencies installed with conda. | CPU | (3.8, 1.8), (3.8, 1.9), (3.8, 1.10), (3.9, 1.12) | linux | -| Test slow | .github/workflows/ci_test-slow.yml | Run only slow tests. Slow tests usually need to spawn threads and cannot be speed up or simplified. | CPU | (3.7, 1.8) | linux, mac, windows | -| pytorch-lightning (IPUs) | .azure-pipelines/ipu-tests.yml | Run only IPU-specific tests. | IPU | (3.8, 1.9) | linux | -| pytorch-lightning (HPUs) | .azure-pipelines/hpu-tests.yml | Run only HPU-specific tests. | HPU | (3.8, 1.10) | linux | -| pytorch-lightning (GPUs) | .azure-pipelines/gpu-tests.yml | Run all CPU and GPU-specific tests, standalone, and examples. Each standalone test needs to be run in separate processes to avoid unwanted interactions between test cases. | GPU | (3.9, 1.12) | linux | -| PyTorchLightning.Benchmark | .azure-pipelines/gpu-benchmark.yml | Run speed/memory benchmarks for parity with pure PyTorch. | GPU | (3.9, 1.12) | linux | -| test-on-tpus | .circleci/config.yml | Run only TPU-specific tests. 
| TPU | (3.7, 1.12) | linux | +| workflow name | workflow file | action | accelerator\* | (Python, PyTorch) | OS | +| -------------------------- | ------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | ------------------------------------------------- | ------------------- | +| Test PyTorch full | .github/workflows/ci-pytorch-test-full.yml | Run all tests except for accelerator-specific, standalone and slow tests. | CPU | (3.7, 1.9), (3.7, 1.12), (3.9, 1.9), (3.9, 1.12) | linux, mac, windows | +| Test PyTorch with Conda | .github/workflows/ci-pytorch-test-conda.yml | Same as ci-pytorch-test-full.yml but with dependencies installed with conda. | CPU | (3.8, 1.9), (3.8, 1.10), (3.8, 1.11), (3.9, 1.12) | linux | +| Test slow | .github/workflows/ci-pytorch-test-slow.yml | Run only slow tests. Slow tests usually need to spawn threads and cannot be speed up or simplified. | CPU | (3.7, 1.11) | linux, mac, windows | +| pytorch-lightning (IPUs) | .azure-pipelines/ipu-tests.yml | Run only IPU-specific tests. | IPU | (3.8, 1.9) | linux | +| pytorch-lightning (HPUs) | .azure-pipelines/hpu-tests.yml | Run only HPU-specific tests. | HPU | (3.8, 1.10) | linux | +| pytorch-lightning (GPUs) | .azure-pipelines/gpu-tests.yml | Run all CPU and GPU-specific tests, standalone, and examples. Each standalone test needs to be run in separate processes to avoid unwanted interactions between test cases. | GPU | (3.9, 1.12) | linux | +| PyTorchLightning.Benchmark | .azure-pipelines/gpu-benchmark.yml | Run speed/memory benchmarks for parity with pure PyTorch. | GPU | (3.9, 1.12) | linux | +| test-on-tpus | .circleci/config.yml | Run only TPU-specific tests. | TPU | (3.7, 1.12) | linux | - \*Accelerators used in CI - GPU: 2 x NVIDIA Tesla V100 @@ -33,15 +33,15 @@ | --------------------------------- | ----------------------------------------------------------------------------------------- | | .codecov.yml | Measure test coverage with [codecov.io](https://app.codecov.io/gh/Lightning-AI/lightning) | | .github/workflows/code-checks.yml | Check Python typing with [MyPy](https://mypy.readthedocs.io/en/stable/). | -| .github/workflows/ci_schema.yml | Validate the syntax of workflow files. | +| .github/workflows/ci-schema.yml | Validate the syntax of workflow files. | ## Others -| workflow file | action | -| -------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| .github/workflows/ci_dockers.yml | Build docker images used for testing in CI without pushing to the [Docker Hub](https://hub.docker.com/r/pytorchlightning/pytorch_lightning). Publishing these built images takes place in `.github/workflows/release-docker.yml` which only runs in master. | -| .github/workflows/ci_pkg-install.yml | Test if pytorch-lightning is successfully installed using pip. | -| .github/workflows/events-recurrent.yml | Terminate TPU jobs that live more than one hour to avoid possible resource exhaustion due to hangs. 
| +| workflow file | action | +| ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| .github/workflows/cicd-pytorch-dockers.yml | Build docker images used for testing in CI. If run on nightly schedule, push to the [Docker Hub](https://hub.docker.com/r/pytorchlightning/pytorch_lightning). | +| .github/workflows/ci-pkg-install.yml | Test if pytorch-lightning is successfully installed using pip. | +| .github/workflows/events-recurrent.yml | Terminate TPU jobs that live more than one hour to avoid possible resource exhaustion due to hangs. | ## Deployment @@ -60,4 +60,4 @@ | .github/stale.yml | Close inactive issues/PRs sometimes after adding the "won't fix" label to them. | | .github/workflows/probot-auto-cc.yml, .github/lightning-probot.yml | Notify maintainers of interest depending on labels added to an issue We utilize lightning-probot forked from PyTorch’s probot. | | .pre-commit-config.yaml | pre-commit.ci runs a set of linters and formatters, such as black, flake8 and isort. When formatting is applied, the bot pushes a commit with its change. This configuration is also used for running pre-commit locally. | -| .github/workflows/ci_pr-gatekeeper.yml | Prevent PRs from merging into master without any Grid.ai employees’ approval. | +| .github/workflows/ci-pr-gatekeeper.yml | Prevent PRs from merging into master without any Grid.ai employees’ approval. | diff --git a/.github/workflows/ci-app_cloud_e2e_test.yml b/.github/workflows/ci-app-cloud-e2e-test.yml similarity index 100% rename from .github/workflows/ci-app_cloud_e2e_test.yml rename to .github/workflows/ci-app-cloud-e2e-test.yml diff --git a/.github/workflows/ci-app_examples.yml b/.github/workflows/ci-app-examples.yml similarity index 100% rename from .github/workflows/ci-app_examples.yml rename to .github/workflows/ci-app-examples.yml diff --git a/.github/workflows/ci-app_tests.yml b/.github/workflows/ci-app-tests.yml similarity index 100% rename from .github/workflows/ci-app_tests.yml rename to .github/workflows/ci-app-tests.yml diff --git a/.github/workflows/ci_pkg-install.yml b/.github/workflows/ci-pkg-install.yml similarity index 100% rename from .github/workflows/ci_pkg-install.yml rename to .github/workflows/ci-pkg-install.yml diff --git a/.github/workflows/ci_pr-gatekeeper.yml b/.github/workflows/ci-pr-gatekeeper.yml similarity index 100% rename from .github/workflows/ci_pr-gatekeeper.yml rename to .github/workflows/ci-pr-gatekeeper.yml diff --git a/.github/workflows/ci-pytorch_test-conda.yml b/.github/workflows/ci-pytorch-test-conda.yml similarity index 100% rename from .github/workflows/ci-pytorch_test-conda.yml rename to .github/workflows/ci-pytorch-test-conda.yml diff --git a/.github/workflows/ci-pytorch_test-full.yml b/.github/workflows/ci-pytorch-test-full.yml similarity index 100% rename from .github/workflows/ci-pytorch_test-full.yml rename to .github/workflows/ci-pytorch-test-full.yml diff --git a/.github/workflows/ci-pytorch_test-slow.yml b/.github/workflows/ci-pytorch-test-slow.yml similarity index 100% rename from .github/workflows/ci-pytorch_test-slow.yml rename to .github/workflows/ci-pytorch-test-slow.yml diff --git a/.github/workflows/ci_schema.yml b/.github/workflows/ci-schema.yml similarity index 100% rename from .github/workflows/ci_schema.yml rename to .github/workflows/ci-schema.yml diff --git a/.github/workflows/cicd-pytorch_dockers.yml 
b/.github/workflows/cicd-pytorch-dockers.yml similarity index 100% rename from .github/workflows/cicd-pytorch_dockers.yml rename to .github/workflows/cicd-pytorch-dockers.yml diff --git a/README.md b/README.md index 2fef343425f17..9c03e3707ec24 100644 --- a/README.md +++ b/README.md @@ -80,21 +80,24 @@ ______________________________________________________________________ ## Continuous Integration -Lightning is rigorously tested across multiple GPUs, TPUs CPUs and against major Python and PyTorch versions. +Lightning is rigorously tested across multiple CPUs, GPUs, TPUs, IPUs, and HPUs and against major Python and PyTorch versions.
   <summary>Current build statuses</summary>
-| System / PyTorch ver. | 1.8 (LTS, min. req.) | 1.9 | 1.10 (latest) | -| :------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| Linux py3.7 \[GPUs\*\*\] | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=6&branchName=master) | - | - | -| Linux py3.7 \[TPUs\*\*\*\] | [![CircleCI](https://circleci.com/gh/Lightning-AI/lightning/tree/master.svg?style=svg)](https://circleci.com/gh/Lightning-AI/lightning/tree/master) | - | - | -| Linux py3.8 (with Conda | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | -| Linux py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) | -| OSX py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) | -| Windows py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) | +| System / PyTorch ver. 
| 1.9 | 1.10 | 1.12 (latest) | +| :------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Linux py3.7 \[GPUs\*\*\] | - | - | - | +| Linux py3.7 \[TPUs\*\*\*\] | [![CircleCI](https://circleci.com/gh/Lightning-AI/lightning/tree/master.svg?style=svg)](https://circleci.com/gh/Lightning-AI/lightning/tree/master) | - | - | +| Linux py3.8 \[IPUs\] | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=25&branchName=master) | - | - | +| Linux py3.8 \[HPUs\] | - | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=26&branchName=master) | - | +| Linux py3.8 (with Conda) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | - | +| Linux py3.9 (with Conda) | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | +| Linux py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | +| OSX py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | +| Windows py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | - _\*\* tests run on two NVIDIA P100_ - _\*\*\* tests run on Google GKE TPUv2/3. 
TPU py3.7 means we support Colab and Kaggle env._ @@ -136,8 +139,8 @@ conda install pytorch-lightning -c conda-forge The actual status of 1.7 \[stable\] is the following: -[![Test PyTorch full](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-full.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-full.yml?query=branch%3Arelease%2Fpytorch) -[![Test PyTorch with Conda](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-conda.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-conda.yml?query=branch%3Arelease%2Fpytorch) +[![Test PyTorch full](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml?query=branch%3Arelease%2Fpytorch) +[![Test PyTorch with Conda](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml?query=branch%3Arelease%2Fpytorch) [![TPU tests](https://dl.circleci.com/status-badge/img/gh/Lightning-AI/lightning/tree/release%2Fpytorch.svg?style=shield)](https://dl.circleci.com/status-badge/redirect/gh/Lightning-AI/lightning/tree/release%2Fpytorch) [![Check Docs](https://github.com/Lightning-AI/lightning/actions/workflows/docs-checks.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/docs-checks.yml?query=branch%3Arelease%2Fpytorch) diff --git a/src/pytorch_lightning/README.md b/src/pytorch_lightning/README.md index eb1a42730b5f0..b57aea6fae147 100644 --- a/src/pytorch_lightning/README.md +++ b/src/pytorch_lightning/README.md @@ -78,17 +78,17 @@ Lightning is rigorously tested across multiple CPUs, GPUs, TPUs, IPUs, and HPUs
-| System / PyTorch ver. | 1.9 | 1.10 | 1.12 (latest) | -| :------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | -| Linux py3.7 \[GPUs\*\*\] | - | - | - | -| Linux py3.7 \[TPUs\*\*\*\] | [![CircleCI](https://circleci.com/gh/Lightning-AI/lightning/tree/master.svg?style=svg)](https://circleci.com/gh/Lightning-AI/lightning/tree/master) | - | - | -| Linux py3.8 \[IPUs\] | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=6&branchName=master) | - | - | -| Linux py3.8 \[HPUs\] | - | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=6&branchName=master) | - | -| Linux py3.8 (with Conda) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | - | -| Linux py3.9 (with Conda) | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-conda.yml) | -| Linux py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) | -| OSX py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) | -| Windows py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci_test-full.yml) | +| System / PyTorch ver. 
| 1.9 | 1.10 | 1.12 (latest) | +| :------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| Linux py3.7 \[GPUs\*\*\] | - | - | - | +| Linux py3.7 \[TPUs\*\*\*\] | [![CircleCI](https://circleci.com/gh/Lightning-AI/lightning/tree/master.svg?style=svg)](https://circleci.com/gh/Lightning-AI/lightning/tree/master) | - | - | +| Linux py3.8 \[IPUs\] | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=25&branchName=master) | - | - | +| Linux py3.8 \[HPUs\] | - | [![Build Status]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=26&branchName=master) | - | +| Linux py3.8 (with Conda) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | - | +| Linux py3.9 (with Conda) | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) | +| Linux py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | +| OSX py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | +| Windows py3.{7,9} | - | - | [![Test](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=master&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) | - _\*\* tests run on two NVIDIA P100_ - _\*\*\* tests run on Google GKE TPUv2/3. 
TPU py3.7 means we support Colab and Kaggle env._ @@ -130,8 +130,8 @@ conda install pytorch-lightning -c conda-forge The actual status of stable is the following: -[![Test PyTorch full](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-full.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-full.yml) -[![Test PyTorch with Conda](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-conda.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch_test-conda.yml) +[![Test PyTorch full](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-full.yml) +[![Test PyTorch with Conda](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml/badge.svg?branch=release%2Fpytorch&event=push)](https://github.com/Lightning-AI/lightning/actions/workflows/ci-pytorch-test-conda.yml) [![GPU]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=24&branchName=release%2Fpytorch) [![TPU](https://dl.circleci.com/status-badge/img/gh/Lightning-AI/lightning/tree/release%2Fpytorch.svg?style=svg)](https://dl.circleci.com/status-badge/redirect/gh/Lightning-AI/lightning/tree/release%2Fpytorch) [![IPU]()](https://dev.azure.com/Lightning-AI/lightning/_build/latest?definitionId=25&branchName=release%2Fpytorch) From dfda3f384e020ab3955a2cf3fe29dcd831d8a969 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 10 Aug 2022 18:02:54 +0900 Subject: [PATCH 131/230] CI: Update Windows version from 2019 to 2022 (#14129) Update windows --- .github/workflows/ci-app-examples.yml | 2 +- .github/workflows/ci-app-tests.yml | 6 +++--- .github/workflows/ci-pkg-install.yml | 6 +++--- .github/workflows/ci-pytorch-test-full.yml | 2 +- .github/workflows/ci-pytorch-test-slow.yml | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci-app-examples.yml b/.github/workflows/ci-app-examples.yml index ec8becd5f70d1..01570f59c2c77 100644 --- a/.github/workflows/ci-app-examples.yml +++ b/.github/workflows/ci-app-examples.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-20.04, macOS-11, windows-2019] + os: [ubuntu-20.04, macOS-11, windows-2022] python-version: [3.8] requires: ["oldest", "latest"] diff --git a/.github/workflows/ci-app-tests.yml b/.github/workflows/ci-app-tests.yml index 1678dab257301..fe3cc36dc16d3 100644 --- a/.github/workflows/ci-app-tests.yml +++ b/.github/workflows/ci-app-tests.yml @@ -21,7 +21,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-20.04, macOS-11, windows-2019] + os: [ubuntu-20.04, macOS-11, windows-2022] python-version: [3.8] requires: ["oldest", "latest"] @@ -126,7 +126,7 @@ jobs: # - name: Clone Quick Start Example Repo # uses: actions/checkout@v3 # # TODO: this needs to be git submodule -# if: matrix.os == 'windows-2019' # because the install doesn't work on windows +# if: matrix.os == 'windows-2022' # because the install doesn't work on windows # with: # repository: Lightning-AI/lightning-quick-start # ref: 'main' @@ -134,6 +134,6 @@ jobs: # # - name: Lightning Install quick-start # shell: bash -# if: matrix.os != 'windows-2019' # because the install doesn't work on windows +# if: matrix.os != 'windows-2022' # because the install doesn't work on windows # 
run: | # python -m lightning install app lightning/quick-start -y diff --git a/.github/workflows/ci-pkg-install.yml b/.github/workflows/ci-pkg-install.yml index 342e027b07cfe..a9fdd36693a67 100644 --- a/.github/workflows/ci-pkg-install.yml +++ b/.github/workflows/ci-pkg-install.yml @@ -33,7 +33,7 @@ jobs: fail-fast: true max-parallel: 1 matrix: - os: [ubuntu-20.04, macOS-11, windows-2019] + os: [ubuntu-20.04, macOS-11, windows-2022] pkg: ["app", "pytorch"] python-version: [3.8] # , 3.9 @@ -67,7 +67,7 @@ jobs: fail-fast: false # max-parallel: 1 matrix: - os: [ubuntu-20.04, macOS-11, windows-2019] + os: [ubuntu-20.04, macOS-11, windows-2022] pkg: ["", "lightning"] python-version: [3.8] # , 3.9 @@ -100,7 +100,7 @@ jobs: fail-fast: false # max-parallel: 1 matrix: - os: [ubuntu-20.04, macOS-11, windows-2019] + os: [ubuntu-20.04, macOS-11, windows-2022] python-version: [3.8] # , 3.9 steps: diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 445707d340c4b..7409ce25a5128 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -20,7 +20,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-20.04, windows-2019, macOS-11] + os: [ubuntu-20.04, windows-2022, macOS-11] python-version: ["3.7", "3.10"] # minimum, maximum requires: ["oldest", "latest"] release: ["stable"] diff --git a/.github/workflows/ci-pytorch-test-slow.yml b/.github/workflows/ci-pytorch-test-slow.yml index b3756bbe8c2f7..36007d3311451 100644 --- a/.github/workflows/ci-pytorch-test-slow.yml +++ b/.github/workflows/ci-pytorch-test-slow.yml @@ -19,7 +19,7 @@ jobs: strategy: fail-fast: false matrix: - os: [ubuntu-20.04, windows-2019, macOS-11] + os: [ubuntu-20.04, windows-2022, macOS-11] # same config as '.azure-pipelines/gpu-tests.yml' python-version: ["3.7"] pytorch-version: ["1.11"] From dc8ff5ed2699b2ab9d21ee1ea6270191e290f620 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 10 Aug 2022 11:23:20 +0200 Subject: [PATCH 132/230] Fix device placement when `.cuda()` called without specifying index (#14128) --- src/pytorch_lightning/CHANGELOG.md | 3 +++ .../core/mixins/device_dtype_mixin.py | 10 ++++---- .../utilities/test_dtype_device_mixin.py | 24 ++++++++++++++++++- 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 8852367a116f6..b405665b9df88 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -82,6 +82,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed a bug that caused `ddp_find_unused_parameters` to be set `False`, whereas the intended default is `True` ([#14095](https://github.com/Lightning-AI/lightning/pull/14095)) +- Fixed the device placement when `LightningModule.cuda()` gets called without specifying a device index and the current cuda device was not 0 ([#14128](https://github.com/Lightning-AI/lightning/pull/14128)) + + ## [1.7.0] - 2022-08-02 ### Added diff --git a/src/pytorch_lightning/core/mixins/device_dtype_mixin.py b/src/pytorch_lightning/core/mixins/device_dtype_mixin.py index 62e81e4839da6..2916d8b07cb4e 100644 --- a/src/pytorch_lightning/core/mixins/device_dtype_mixin.py +++ b/src/pytorch_lightning/core/mixins/device_dtype_mixin.py @@ -116,14 +116,16 @@ def cuda(self, device: Optional[Union[torch.device, int]] = None) -> Self: # ty while being optimized. 
Arguments: - device: if specified, all parameters will be - copied to that device + device: If specified, all parameters will be copied to that device. If `None`, the current CUDA device + index will be used. Returns: Module: self """ - if device is None or isinstance(device, int): - device = torch.device("cuda", index=(device or 0)) + if device is None: + device = torch.device("cuda", torch.cuda.current_device()) + elif isinstance(device, int): + device = torch.device("cuda", index=device) self.__update_properties(device=device) return super().cuda(device=device) diff --git a/tests/tests_pytorch/utilities/test_dtype_device_mixin.py b/tests/tests_pytorch/utilities/test_dtype_device_mixin.py index 38f72b555d52d..7c17b3d9f7642 100644 --- a/tests/tests_pytorch/utilities/test_dtype_device_mixin.py +++ b/tests/tests_pytorch/utilities/test_dtype_device_mixin.py @@ -113,7 +113,7 @@ def test_submodules_multi_gpu_ddp_spawn(tmpdir): ], ) @RunIf(min_cuda_gpus=1) -def test_gpu_cuda_device(device): +def test_cuda_device(device): model = TopModule() model.cuda(device) @@ -122,3 +122,25 @@ def test_gpu_cuda_device(device): assert device.type == "cuda" assert device.index is not None assert device.index == torch.cuda.current_device() + + +@RunIf(min_cuda_gpus=2) +def test_cuda_current_device(): + """Test that calling .cuda() moves the model to the correct device and respects current cuda device setting.""" + + class CudaModule(DeviceDtypeModuleMixin): + def __init__(self): + super().__init__() + self.layer = nn.Linear(1, 1) + + model = CudaModule() + + torch.cuda.set_device(0) + model.cuda(1) + assert model.device == torch.device("cuda", 1) + assert model.layer.weight.device == torch.device("cuda", 1) + + torch.cuda.set_device(1) + model.cuda() # model is already on device 1, and calling .cuda() without device index should not move model + assert model.device == torch.device("cuda", 1) + assert model.layer.weight.device == torch.device("cuda", 1) From ddb476d334f501a655586ae3809587e09f71b9c8 Mon Sep 17 00:00:00 2001 From: Adam Bobowski <100693297+adam-lightning@users.noreply.github.com> Date: Wed, 10 Aug 2022 11:48:06 +0200 Subject: [PATCH 133/230] [App] Application logs in CLI (#13634) --- src/lightning_app/CHANGELOG.md | 2 + src/lightning_app/cli/lightning_cli.py | 89 +++++++++++++ src/lightning_app/testing/testing.py | 2 +- src/lightning_app/utilities/app_logs.py | 125 ++++++++++++++++++ .../utilities/logs_socket_api.py | 95 +++++++++++++ tests/tests_app/cli/test_cmd_show_logs.py | 61 +++++++++ tests/tests_app_examples/test_boring_app.py | 15 +++ .../test_collect_failures.py | 1 + tests/tests_app_examples/test_commands.py | 1 + .../test_custom_work_dependencies.py | 2 +- tests/tests_app_examples/test_drive.py | 1 + tests/tests_app_examples/test_idle_timeout.py | 1 + tests/tests_app_examples/test_payload.py | 2 +- tests/tests_app_examples/test_quick_start.py | 2 +- .../test_template_react_ui.py | 1 + .../test_template_streamlit_ui.py | 1 + tests/tests_app_examples/test_v0_app.py | 1 + 17 files changed, 398 insertions(+), 4 deletions(-) create mode 100644 src/lightning_app/utilities/app_logs.py create mode 100644 src/lightning_app/utilities/logs_socket_api.py create mode 100644 tests/tests_app/cli/test_cmd_show_logs.py diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 78a4e370e76ee..ba8cdd796c5bb 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -13,6 +13,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
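Returning to the `LightningModule.cuda()` device-placement fix above (#14128), a small sketch of the corrected behaviour. It assumes a machine with at least two CUDA GPUs and uses the demo `BoringModel` purely for illustration.

```python
import torch
from pytorch_lightning.demos.boring_classes import BoringModel

model = BoringModel()
torch.cuda.set_device(1)

# Previously, `.cuda()` without an index always targeted `cuda:0`; with this fix
# it falls back to `torch.cuda.current_device()` instead.
model.cuda()
assert model.device == torch.device("cuda", 1)
assert model.layer.weight.device == torch.device("cuda", 1)
```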
- Add support to run Lightning apps on Lightning AI BYOC clusters ([#13894](https://github.com/Lightning-AI/lightning/pull/13894)) - Add support for listing Lightning AI apps ([#13987](https://github.com/Lightning-AI/lightning/pull/13987)) - Adds `LightningTrainingComponent`. `LightningTrainingComponent` orchestrates multi-node training in the cloud ([#13830](https://github.com/Lightning-AI/lightning/pull/13830)) +- Add support for printing application logs using CLI `lightning show logs [components]` ([#13634](https://github.com/Lightning-AI/lightning/pull/13634)) + ### Changed diff --git a/src/lightning_app/cli/lightning_cli.py b/src/lightning_app/cli/lightning_cli.py index fb4c40330dfd9..45c80d4dcc357 100644 --- a/src/lightning_app/cli/lightning_cli.py +++ b/src/lightning_app/cli/lightning_cli.py @@ -8,7 +8,9 @@ import click import requests +import rich from requests.exceptions import ConnectionError +from rich.color import ANSI_COLOR_NAMES from lightning_app import __version__ as ver from lightning_app.cli import cmd_init, cmd_install, cmd_pl_init, cmd_react_ui_init @@ -18,12 +20,15 @@ from lightning_app.core.constants import get_lightning_cloud_url, LOCAL_LAUNCH_ADMIN_VIEW from lightning_app.runners.runtime import dispatch from lightning_app.runners.runtime_type import RuntimeType +from lightning_app.utilities.app_logs import _app_logs_reader from lightning_app.utilities.cli_helpers import ( _format_input_env_variables, _retrieve_application_url_and_available_commands, ) +from lightning_app.utilities.cloud import _get_project from lightning_app.utilities.install_components import register_all_external_components from lightning_app.utilities.login import Auth +from lightning_app.utilities.network import LightningClient from lightning_app.utilities.state import headers_for logger = logging.getLogger(__name__) @@ -50,9 +55,93 @@ def main(): @click.version_option(ver) def _main(): register_all_external_components() + + +@_main.group() +def show(): + """Show given resource.""" pass +@show.command() +@click.argument("app_name", required=False) +@click.argument("components", nargs=-1, required=False) +@click.option("-f", "--follow", required=False, is_flag=True, help="Wait for new logs, to exit use CTRL+C.") +def logs(app_name: str, components: List[str], follow: bool) -> None: + """Show cloud application logs. By default prints logs for all currently available components. + + Example uses: + + Print all application logs: + + $ lightning show logs my-application + + + Print logs only from the flow (no work): + + $ lightning show logs my-application flow + + + Print logs only from selected works: + + $ lightning show logs my-application root.work_a root.work_b + """ + + client = LightningClient() + project = _get_project(client) + + apps = { + app.name: app + for app in client.lightningapp_instance_service_list_lightningapp_instances(project.project_id).lightningapps + } + + if not apps: + raise click.ClickException( + "You don't have any application in the cloud. Please, run an application first with `--cloud`." + ) + + if not app_name: + raise click.ClickException( + f"You have not specified any Lightning App. Please select one of available: [{', '.join(apps.keys())}]" + ) + + if app_name not in apps: + raise click.ClickException( + f"The Lightning App '{app_name}' does not exist. 
Please select one of following: [{', '.join(apps.keys())}]" + ) + + # Fetch all lightning works from given application + # 'Flow' component is somewhat implicit, only one for whole app, + # and not listed in lightningwork API - so we add it directly to the list + works = client.lightningwork_service_list_lightningwork( + project_id=project.project_id, app_id=apps[app_name].id + ).lightningworks + app_component_names = ["flow"] + [f.name for f in apps[app_name].spec.flow_servers] + [w.name for w in works] + + if not components: + components = app_component_names + + for component in components: + if component not in app_component_names: + raise click.ClickException(f"Component '{component}' does not exist in app {app_name}.") + + log_reader = _app_logs_reader( + client=client, + project_id=project.project_id, + app_id=apps[app_name].id, + component_names=components, + follow=follow, + ) + + rich_colors = list(ANSI_COLOR_NAMES) + colors = {c: rich_colors[i + 1] for i, c in enumerate(components)} + + for component_name, log_event in log_reader: + date = log_event.timestamp.strftime("%m/%d/%Y %H:%M:%S") + color = colors[component_name] + rich.print(f"[{color}]{component_name}[/{color}] {date} {log_event.message}") + + @_main.command() def login(): """Log in to your Lightning.ai account.""" diff --git a/src/lightning_app/testing/testing.py b/src/lightning_app/testing/testing.py index e1cc2e180dab5..74d57db38c427 100644 --- a/src/lightning_app/testing/testing.py +++ b/src/lightning_app/testing/testing.py @@ -318,7 +318,7 @@ def fetch_logs() -> str: ) try: - yield admin_page, view_page, fetch_logs + yield admin_page, view_page, fetch_logs, name except KeyboardInterrupt: pass finally: diff --git a/src/lightning_app/utilities/app_logs.py b/src/lightning_app/utilities/app_logs.py new file mode 100644 index 0000000000000..4a7af9b5c5143 --- /dev/null +++ b/src/lightning_app/utilities/app_logs.py @@ -0,0 +1,125 @@ +import json +import queue +import sys +from dataclasses import dataclass +from datetime import datetime, timedelta +from json import JSONDecodeError +from threading import Thread +from typing import Iterator, List, Optional, Tuple + +import dateutil.parser +from websocket import WebSocketApp + +from lightning_app.utilities.logs_socket_api import _LightningLogsSocketAPI +from lightning_app.utilities.network import LightningClient + + +@dataclass +class _LogEventLabels: + app: str + container: str + filename: str + job: str + namespace: str + node_name: str + pod: str + stream: Optional[str] = None + + +@dataclass +class _LogEvent: + message: str + timestamp: datetime + labels: _LogEventLabels + + +def _push_logevents_to_read_queue_callback(component_name: str, read_queue: queue.PriorityQueue): + """Pushes _LogEvents from websocket to read_queue. + + Returns callback function used with `on_message_callback` of websocket.WebSocketApp. 
+ """ + + def callback(ws_app: WebSocketApp, msg: str): + # We strongly trust that the contract on API will hold atm :D + event_dict = json.loads(msg) + labels = _LogEventLabels(**event_dict["labels"]) + if "message" in event_dict: + event = _LogEvent( + message=event_dict["message"], + timestamp=dateutil.parser.isoparse(event_dict["timestamp"]), + labels=labels, + ) + read_queue.put((event.timestamp, component_name, event)) + + return callback + + +def _error_callback(ws_app: WebSocketApp, error: Exception): + errors = { + KeyError: "Malformed log message, missing key", + JSONDecodeError: "Malformed log message", + TypeError: "Malformed log format", + ValueError: "Malformed date format", + } + print(f"Error while reading logs ({errors.get(type(error), 'Unknown')})", file=sys.stderr) + ws_app.close() + + +def _app_logs_reader( + client: LightningClient, project_id: str, app_id: str, component_names: List[str], follow: bool +) -> Iterator[Tuple[str, _LogEvent]]: + + read_queue = queue.PriorityQueue() + logs_api_client = _LightningLogsSocketAPI(client.api_client) + + # We will use a socket per component + log_sockets = [ + logs_api_client.create_lightning_logs_socket( + project_id=project_id, + app_id=app_id, + component=component_name, + on_message_callback=_push_logevents_to_read_queue_callback(component_name, read_queue), + on_error_callback=_error_callback, + ) + for component_name in component_names + ] + + # And each socket on separate thread pushing log event to print queue + # run_forever() will run until we close() the connection from outside + log_threads = [Thread(target=work.run_forever) for work in log_sockets] + + # Establish connection and begin pushing logs to the print queue + for th in log_threads: + th.start() + + user_log_start = "<<< BEGIN USER_RUN_FLOW SECTION >>>" + start_timestamp = None + + # Print logs from queue when log event is available + try: + while True: + _, component_name, log_event = read_queue.get(timeout=None if follow else 1.0) + log_event: _LogEvent + + if user_log_start in log_event.message: + start_timestamp = log_event.timestamp + timedelta(seconds=0.5) + + if start_timestamp and log_event.timestamp > start_timestamp: + yield component_name, log_event + + except queue.Empty: + # Empty is raised by queue.get if timeout is reached. Follow = False case. + pass + + except KeyboardInterrupt: + # User pressed CTRL+C to exit, we sould respect that + pass + + finally: + # Close connections - it will cause run_forever() to finish -> thread as finishes aswell + for socket in log_sockets: + socket.close() + + # Because all socket were closed, we can just wait for threads to finish. 
+ for th in log_threads: + th.join() diff --git a/src/lightning_app/utilities/logs_socket_api.py b/src/lightning_app/utilities/logs_socket_api.py new file mode 100644 index 0000000000000..0ab9a5c24f3e5 --- /dev/null +++ b/src/lightning_app/utilities/logs_socket_api.py @@ -0,0 +1,95 @@ +from typing import Callable, Optional +from urllib.parse import urlparse + +from lightning_cloud.openapi import ApiClient, AuthServiceApi, V1LoginRequest +from websocket import WebSocketApp + +from lightning_app.utilities.login import Auth + + +class _LightningLogsSocketAPI: + def __init__(self, api_client: ApiClient): + self.api_client = api_client + self._auth = Auth() + self._auth.authenticate() + self._auth_service = AuthServiceApi(api_client) + + def _get_api_token(self) -> str: + token_resp = self._auth_service.auth_service_login( + body=V1LoginRequest( + username=self._auth.username, + api_key=self._auth.api_key, + ) + ) + return token_resp.token + + @staticmethod + def _socket_url(host: str, project_id: str, app_id: str, token: str, component: str) -> str: + return ( + f"wss://{host}/v1/projects/{project_id}/appinstances/{app_id}/logs?" + f"token={token}&component={component}&follow=true" + ) + + def create_lightning_logs_socket( + self, + project_id: str, + app_id: str, + component: str, + on_message_callback: Callable[[WebSocketApp, str], None], + on_error_callback: Optional[Callable[[Exception, str], None]] = None, + ) -> WebSocketApp: + """Creates and returns WebSocketApp to listen to lightning app logs. + + .. code-block:: python + # Synchronous reading, run_forever() is blocking + + + def print_log_msg(ws_app, msg): + print(msg) + + + flow_logs_socket = client.create_lightning_logs_socket("project_id", "app_id", "flow", print_log_msg) + flow_socket.run_forever() + + .. code-block:: python + # Asynchronous reading (with Threads) + + + def print_log_msg(ws_app, msg): + print(msg) + + + flow_logs_socket = client.create_lightning_logs_socket("project_id", "app_id", "flow", print_log_msg) + work_logs_socket = client.create_lightning_logs_socket("project_id", "app_id", "work_1", print_log_msg) + + flow_logs_thread = Thread(target=flow_logs_socket.run_forever) + work_logs_thread = Thread(target=work_logs_socket.run_forever) + + flow_logs_thread.start() + work_logs_thread.start() + # ....... + + flow_logs_socket.close() + work_logs_thread.close() + + Arguments: + project_id: Project ID. + app_id: Application ID. + component: Component name eg flow. + on_message_callback: Callback object which is called when received data. + on_error_callback: Callback object which is called when we get error. 
+
+        Returns:
+            The WebSocketApp for the requested logs socket.
+        """
+        _token = self._get_api_token()
+        clean_ws_host = urlparse(self.api_client.configuration.host).netloc
+        socket_url = self._socket_url(
+            host=clean_ws_host,
+            project_id=project_id,
+            app_id=app_id,
+            token=_token,
+            component=component,
+        )
+
+        return WebSocketApp(socket_url, on_message=on_message_callback, on_error=on_error_callback)
diff --git a/tests/tests_app/cli/test_cmd_show_logs.py b/tests/tests_app/cli/test_cmd_show_logs.py
new file mode 100644
index 0000000000000..0dc06025151fa
--- /dev/null
+++ b/tests/tests_app/cli/test_cmd_show_logs.py
@@ -0,0 +1,61 @@
+from unittest import mock
+
+from click.testing import CliRunner
+
+from lightning_app.cli.lightning_cli import logs
+
+
+@mock.patch("lightning_app.cli.lightning_cli.LightningClient")
+@mock.patch("lightning_app.cli.lightning_cli._get_project")
+def test_show_logs_errors(project, client):
+    """Test that the CLI prints the errors for the show logs command."""
+
+    runner = CliRunner()
+
+    # Response prep
+    app = mock.MagicMock()
+    app.name = "MyFakeApp"
+    work = mock.MagicMock()
+    work.name = "MyFakeWork"
+    flow = mock.MagicMock()
+    flow.name = "MyFakeFlow"
+
+    # No apps ever run
+    apps = {}
+    client.return_value.lightningapp_instance_service_list_lightningapp_instances.return_value.lightningapps = apps
+
+    result = runner.invoke(logs, ["NonExistentApp"])
+
+    assert result.exit_code == 1
+    assert "Error: You don't have any application in the cloud" in result.output
+
+    # App not specified
+    apps = {app}
+    client.return_value.lightningapp_instance_service_list_lightningapp_instances.return_value.lightningapps = apps
+
+    result = runner.invoke(logs)
+
+    assert result.exit_code == 1
+    assert "Please select one of available: [MyFakeApp]" in str(result.output)
+
+    # App does not exist
+    apps = {app}
+    client.return_value.lightningapp_instance_service_list_lightningapp_instances.return_value.lightningapps = apps
+
+    result = runner.invoke(logs, ["ThisAppDoesNotExist"])
+
+    assert result.exit_code == 1
+    assert "The Lightning App 'ThisAppDoesNotExist' does not exist." in str(result.output)
+
+    # Component does not exist
+    apps = {app}
+    works = {work}
+    flows = {flow}
+    client.return_value.lightningapp_instance_service_list_lightningapp_instances.return_value.lightningapps = apps
+    client.return_value.lightningwork_service_list_lightningwork.return_value.lightningworks = works
+    app.spec.flow_servers = flows
+
+    result = runner.invoke(logs, ["MyFakeApp", "NonExistentComponent"])
+
+    assert result.exit_code == 1
+    assert "Component 'NonExistentComponent' does not exist in app MyFakeApp."
in result.output diff --git a/tests/tests_app_examples/test_boring_app.py b/tests/tests_app_examples/test_boring_app.py index 1f681260de5c2..f8143b1db1a88 100644 --- a/tests/tests_app_examples/test_boring_app.py +++ b/tests/tests_app_examples/test_boring_app.py @@ -1,8 +1,10 @@ import os import pytest +from click.testing import CliRunner from tests_app import _PROJECT_ROOT +from lightning_app.cli.lightning_cli import logs from lightning_app.testing.testing import run_app_in_cloud, wait_for @@ -12,6 +14,7 @@ def test_boring_app_example_cloud() -> None: _, view_page, _, + name, ): def check_hello_there(*_, **__): @@ -21,3 +24,15 @@ def check_hello_there(*_, **__): return True wait_for(view_page, check_hello_there) + + runner = CliRunner() + result = runner.invoke(logs, [name]) + lines = result.output.splitlines() + + assert result.exit_code == 0 + assert result.exception is None + assert len(lines) > 1, result.output + # We know that at some point we need to intstall lightning, so we check for that + assert any( + "Successfully built lightning" in line for line in lines + ), f"Did not find logs with lightning installation: {result.output}" diff --git a/tests/tests_app_examples/test_collect_failures.py b/tests/tests_app_examples/test_collect_failures.py index f263ebb1a9f58..c149211e10774 100644 --- a/tests/tests_app_examples/test_collect_failures.py +++ b/tests/tests_app_examples/test_collect_failures.py @@ -26,6 +26,7 @@ def test_collect_failures_example_cloud() -> None: _, _, fetch_logs, + _, ): last_found_log_index = -1 while len(expected_logs) != 0: diff --git a/tests/tests_app_examples/test_commands.py b/tests/tests_app_examples/test_commands.py index 5116b1b9d54bb..266f0305c7604 100644 --- a/tests/tests_app_examples/test_commands.py +++ b/tests/tests_app_examples/test_commands.py @@ -16,6 +16,7 @@ def test_commands_example_cloud() -> None: admin_page, _, fetch_logs, + _, ): app_id = admin_page.url.split("/")[-1] cmd = f"lightning trigger_with_client_command --name=something --app_id {app_id}" diff --git a/tests/tests_app_examples/test_custom_work_dependencies.py b/tests/tests_app_examples/test_custom_work_dependencies.py index 8390233e2eee3..d7c9db5ef610a 100644 --- a/tests/tests_app_examples/test_custom_work_dependencies.py +++ b/tests/tests_app_examples/test_custom_work_dependencies.py @@ -13,7 +13,7 @@ def test_custom_work_dependencies_example_cloud() -> None: with run_app_in_cloud( os.path.join(_PROJECT_ROOT, "tests/tests_app_examples/custom_work_dependencies/"), app_name="app.py", - ) as (_, _, fetch_logs): + ) as (_, _, fetch_logs, _): has_logs = False while not has_logs: for log in fetch_logs(): diff --git a/tests/tests_app_examples/test_drive.py b/tests/tests_app_examples/test_drive.py index 9cebca9cf1072..14efc3458716e 100644 --- a/tests/tests_app_examples/test_drive.py +++ b/tests/tests_app_examples/test_drive.py @@ -13,6 +13,7 @@ def test_drive_example_cloud() -> None: _, view_page, fetch_logs, + _, ): has_logs = False diff --git a/tests/tests_app_examples/test_idle_timeout.py b/tests/tests_app_examples/test_idle_timeout.py index fb58a83aefc93..a39ae3f693f7a 100644 --- a/tests/tests_app_examples/test_idle_timeout.py +++ b/tests/tests_app_examples/test_idle_timeout.py @@ -13,6 +13,7 @@ def test_idle_timeout_example_cloud() -> None: _, _, fetch_logs, + _, ): has_logs = False while not has_logs: diff --git a/tests/tests_app_examples/test_payload.py b/tests/tests_app_examples/test_payload.py index 28d2391c18a2a..58fc28a4a8d3c 100644 --- a/tests/tests_app_examples/test_payload.py 
+++ b/tests/tests_app_examples/test_payload.py @@ -9,7 +9,7 @@ @pytest.mark.cloud def test_payload_example_cloud() -> None: - with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "examples/app_payload")) as (_, _, fetch_logs): + with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "examples/app_payload")) as (_, _, fetch_logs, _): has_logs = False while not has_logs: diff --git a/tests/tests_app_examples/test_quick_start.py b/tests/tests_app_examples/test_quick_start.py index 9db693a5dc3d6..454c1084ca1bb 100644 --- a/tests/tests_app_examples/test_quick_start.py +++ b/tests/tests_app_examples/test_quick_start.py @@ -51,7 +51,7 @@ def test_quick_start_example(caplog, monkeypatch): @pytest.mark.cloud def test_quick_start_example_cloud() -> None: - with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "lightning-quick-start/")) as (_, view_page, _): + with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "lightning-quick-start/")) as (_, view_page, _, _): def click_gradio_demo(*_, **__): button = view_page.locator('button:has-text("Interactive demo")') diff --git a/tests/tests_app_examples/test_template_react_ui.py b/tests/tests_app_examples/test_template_react_ui.py index 2e348035fe6e5..4b4588d2397e5 100644 --- a/tests/tests_app_examples/test_template_react_ui.py +++ b/tests/tests_app_examples/test_template_react_ui.py @@ -14,6 +14,7 @@ def test_template_react_ui_example_cloud() -> None: _, view_page, fetch_logs, + _, ): def click_button(*_, **__): diff --git a/tests/tests_app_examples/test_template_streamlit_ui.py b/tests/tests_app_examples/test_template_streamlit_ui.py index a8ba93794f2a0..e2c33305298f7 100644 --- a/tests/tests_app_examples/test_template_streamlit_ui.py +++ b/tests/tests_app_examples/test_template_streamlit_ui.py @@ -14,6 +14,7 @@ def test_template_streamlit_ui_example_cloud() -> None: _, view_page, fetch_logs, + _, ): def click_button(*_, **__): diff --git a/tests/tests_app_examples/test_v0_app.py b/tests/tests_app_examples/test_v0_app.py index d34a92d6102f8..acc9e285c4d79 100644 --- a/tests/tests_app_examples/test_v0_app.py +++ b/tests/tests_app_examples/test_v0_app.py @@ -74,5 +74,6 @@ def test_v0_app_example_cloud() -> None: _, view_page, fetch_logs, + _, ): run_v0_app(fetch_logs, view_page) From d5f35ece72fd253adeb8e9947fd9be4a5992f8f8 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Wed, 10 Aug 2022 19:37:50 +0900 Subject: [PATCH 134/230] CI/CD: Add CUDA version to docker image tags (#13831) * append cuda version to tags * revertme: push to hub * Update docker readme * Build base-conda-py3.9-torch1.12-cuda11.3.1 * Use new images in conda tests * revertme: push to hub * Revert "revertme: push to hub" This reverts commit 0f7d534b2ae41e4bd227961a929c333c88e35f59. * Revert "revertme: push to hub" This reverts commit 46a05fccbb9b596aa98d5d68424917b5811c5b4f. * Run conda if workflow edited * Run gpu testing if workflow edited * Use new tags in release/Dockerfile * Build base-cuda and PL release images with all combinations * Update release docker * Update conda from py3.9-torch1.12 to py3.10-torch.1.12 * Fix ubuntu version * Revert conda * revertme: push to hub * Don't build Python 3.10 for now... * Fix pl release builder * updating version contribute to the error? 
https://github.com/docker/buildx/issues/456 * Update actions' versions * Update slack user to notify * Don't use 11.6.0 to avoid bagua incompatibility * Don't use 11.1, and use 11.1.1 * Update .github/workflows/ci-pytorch_test-conda.yml Co-authored-by: Luca Medeiros <67411094+luca-medeiros@users.noreply.github.com> * Update trigger * Ignore artfacts from tutorials * Trim docker images to distribute * Add an image for tutorials * Update conda image 3.8x1.10 * Try different conda variants * No need to set cuda for conda jobs * Update who to notify ipu failure * Don't push * update filenaem Co-authored-by: Luca Medeiros <67411094+luca-medeiros@users.noreply.github.com> --- .azure/gpu-benchmark.yml | 2 +- .azure/gpu-tests.yml | 4 +- .github/workflows/ci-pytorch-test-conda.yml | 4 +- .github/workflows/cicd-pytorch-dockers.yml | 80 +++++++++++---------- .github/workflows/release-docker.yml | 31 +++++--- .gitignore | 6 ++ dockers/README.md | 45 +++--------- dockers/release/Dockerfile | 3 +- 8 files changed, 87 insertions(+), 88 deletions(-) diff --git a/.azure/gpu-benchmark.yml b/.azure/gpu-benchmark.yml index ac5ca6f60a6b4..0de590f2c54a6 100644 --- a/.azure/gpu-benchmark.yml +++ b/.azure/gpu-benchmark.yml @@ -28,7 +28,7 @@ jobs: cancelTimeoutInMinutes: "2" pool: azure-jirka-spot container: - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.3.1" options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g" workspace: clean: all diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index f37c17613affc..68ba6974a3527 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -26,7 +26,7 @@ jobs: strategy: matrix: 'PyTorch - stable': - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.3.1" # how long to run the job before automatically cancelling timeoutInMinutes: "80" # how much time to give 'run always even if cancelled tasks' before stopping them @@ -44,7 +44,7 @@ jobs: - bash: | CHANGED_FILES=$(git diff --name-status origin/master -- . 
| awk '{print $2}') - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.azure/*' + FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.azure/gpu-tests.yml' echo $CHANGED_FILES > changed_files.txt MATCHES=$(cat changed_files.txt | grep -E $FILTER) echo $MATCHES diff --git a/.github/workflows/ci-pytorch-test-conda.yml b/.github/workflows/ci-pytorch-test-conda.yml index 777ec2af759a0..2bbdb699c2c1e 100644 --- a/.github/workflows/ci-pytorch-test-conda.yml +++ b/.github/workflows/ci-pytorch-test-conda.yml @@ -22,13 +22,11 @@ jobs: strategy: fail-fast: false matrix: - # nightly: add when there's a release candidate include: - {python-version: "3.8", pytorch-version: "1.9"} - {python-version: "3.8", pytorch-version: "1.10"} - {python-version: "3.9", pytorch-version: "1.11"} - {python-version: "3.9", pytorch-version: "1.12"} - timeout-minutes: 30 steps: @@ -45,7 +43,7 @@ jobs: id: skip shell: bash -l {0} run: | - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' + FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.github/workflows/ci-pytorch-test-conda.yml' echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt MATCHES=$(cat changed_files.txt | grep -E $FILTER) echo $MATCHES diff --git a/.github/workflows/cicd-pytorch-dockers.yml b/.github/workflows/cicd-pytorch-dockers.yml index a6ba2ac4aa5f4..84051cafd82d8 100644 --- a/.github/workflows/cicd-pytorch-dockers.yml +++ b/.github/workflows/cicd-pytorch-dockers.yml @@ -29,17 +29,22 @@ jobs: strategy: fail-fast: false matrix: - # the config used in '.azure-pipelines/gpu-tests.yml' since the Dockerfile uses the cuda image - python_version: ["3.9"] - pytorch_version: ["1.12"] + include: + # We only release one docker image per PyTorch version. + # The matrix here is the same as the one in release-docker.yml. 
+ - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} + - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} + CUDA_VERSION=${{ matrix.cuda_version }} file: dockers/release/Dockerfile push: false # pushed in release-docker.yml only when PL is released timeout-minutes: 50 @@ -53,14 +58,14 @@ jobs: python_version: ["3.7"] xla_version: ["1.12"] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 - - uses: docker/login-action@v1 + - uses: docker/login-action@v2 if: env.PUSH_TO_HUB == 'true' with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} @@ -85,30 +90,31 @@ jobs: fail-fast: false matrix: include: - # the config used in '.azure-pipelines/gpu-tests.yml' - - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1", ubuntu_version: "20.04"} - # latest (used in Tutorials) - - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1", ubuntu_version: "20.04"} - - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.1.1", ubuntu_version: "20.04"} - - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1", ubuntu_version: "20.04"} + # These are the base images for PL release docker images, + # so include at least all of the combinations in release-dockers.yml. 
+ - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} + - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"} + # Used in Lightning-AI/tutorials + - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 - - uses: docker/login-action@v1 + - uses: docker/login-action@v2 if: env.PUSH_TO_HUB == 'true' with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} CUDA_VERSION=${{ matrix.cuda_version }} - UBUNTU_VERSION=${{ matrix.ubuntu_version }} file: dockers/base-cuda/Dockerfile push: ${{ env.PUSH_TO_HUB }} - tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} + tags: pytorchlightning/pytorch_lightning:base-cuda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }} timeout-minutes: 95 - uses: ravsamhq/notify-slack-action@v1 if: failure() && env.PUSH_TO_HUB == 'true' @@ -126,25 +132,23 @@ jobs: fail-fast: false matrix: include: - - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"} - - {python_version: "3.8", pytorch_version: "1.10", cuda_version: "11.1.1"} - - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} - # nightly: add when there's a release candidate - # - {python_version: "3.9", pytorch_version: "1.12"} + - {python_version: "3.8", pytorch_version: "1.9"} + - {python_version: "3.8", pytorch_version: "1.10"} + - {python_version: "3.9", pytorch_version: "1.11"} + - {python_version: "3.9", pytorch_version: "1.12"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 - - uses: docker/login-action@v1 + - uses: docker/login-action@v2 if: env.PUSH_TO_HUB == 'true' with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} - CUDA_VERSION=${{ matrix.cuda_version }} file: dockers/base-conda/Dockerfile push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} @@ -168,14 +172,14 @@ jobs: # the config used in 'dockers/ci-runner-ipu/Dockerfile' - {python_version: "3.9", pytorch_version: "1.9"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 - - uses: docker/login-action@v1 + - uses: docker/login-action@v2 if: env.PUSH_TO_HUB == 'true' with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} @@ -184,7 +188,7 @@ jobs: push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:base-ipu-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} timeout-minutes: 100 - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 
with: build-args: | PYTHON_VERSION=${{ matrix.python_version }} @@ -199,7 +203,7 @@ jobs: status: ${{ job.status }} token: ${{ secrets.GITHUB_TOKEN }} notification_title: ${{ format('IPU; {0} py{1} for *{2}*', runner.os, matrix.python_version, matrix.pytorch_version) }} - message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U01BULUS2BG>' # SeanNaren + message_format: '{emoji} *{workflow}* {status_message}, see <{run_url}|detail>, cc: <@U01GD29QCAV>' # kaushikb11 env: SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} @@ -212,14 +216,14 @@ jobs: # the config used in 'dockers/ci-runner-hpu/Dockerfile' - {gaudi_version: "1.5.0", pytorch_version: "1.11.0"} steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 - - uses: docker/login-action@v1 + - uses: docker/login-action@v2 if: env.PUSH_TO_HUB == 'true' with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - - uses: docker/build-push-action@v2 + - uses: docker/build-push-action@v3 with: build-args: | DIST=latest @@ -243,10 +247,10 @@ jobs: runs-on: ubuntu-20.04 steps: - name: Checkout - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Build Conda Docker # publish master/release - uses: docker/build-push-action@v2 + uses: docker/build-push-action@v3 with: file: dockers/nvidia/Dockerfile push: false diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml index 9d87f1a582fb1..6901a24204683 100644 --- a/.github/workflows/release-docker.yml +++ b/.github/workflows/release-docker.yml @@ -1,6 +1,5 @@ name: Docker -# https://www.docker.com/blog/first-docker-github-action-is-here -# https://github.com/docker/build-push-action + on: push: branches: [master, "release/*"] @@ -15,8 +14,12 @@ jobs: strategy: fail-fast: false matrix: - python_version: ["3.7", "3.8", "3.9"] - pytorch_version: ["1.9", "1.10"] + include: + # We only release one docker image per PyTorch version. 
+ - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} + - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"} steps: - name: Checkout uses: actions/checkout@v2 @@ -32,19 +35,29 @@ jobs: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} dockerfile: dockers/release/Dockerfile - build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }} - tags: "${{ steps.get_version.outputs.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }},latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}" + build_args: | + PYTHON_VERSION=${{ matrix.python_version }} + PYTORCH_VERSION=${{ matrix.pytorch_version }} + CUDA_VERSION=${{ matrix.cuda_version }} + LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }} + tags: | + ${{ steps.get_version.outputs.RELEASE_VERSION }}-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }} + latest-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }}-cuda${{ matrix.cuda_version }} timeout-minutes: 55 - name: Publish Latest to Docker uses: docker/build-push-action@v1.1.0 - # only on releases and latest Python and PyTorch - if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.10' + # Only latest Python and PyTorch + if: matrix.python_version == '3.9' && matrix.pytorch_version == '1.12' with: repository: pytorchlightning/pytorch_lightning username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} dockerfile: dockers/release/Dockerfile - build_args: PYTHON_VERSION=${{ matrix.python_version }},PYTORCH_VERSION=${{ matrix.pytorch_version }},LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }} + build_args: | + PYTHON_VERSION=${{ matrix.python_version }} + PYTORCH_VERSION=${{ matrix.pytorch_version }} + CUDA_VERSION=${{ matrix.cuda_version }} + LIGHTNING_VERSION=${{ steps.get_version.outputs.RELEASE_VERSION }} tags: "latest" timeout-minutes: 55 diff --git a/.gitignore b/.gitignore index 719f291a492ca..259d9f271189c 100644 --- a/.gitignore +++ b/.gitignore @@ -165,3 +165,9 @@ hars* artifacts/* *docs/examples* *docs/source-app/api* + +# tutorials +our_model.tar +test.png +saved_models +data/ diff --git a/dockers/README.md b/dockers/README.md index 533c85739f528..b1ff9826b6c1f 100644 --- a/dockers/README.md +++ b/dockers/README.md @@ -1,36 +1,17 @@ # Docker images -## Builds images form attached Dockerfiles +## Build images from Dockerfiles You can build it on your own, note it takes lots of time, be prepared. ```bash -git clone -docker image build -t pytorch-lightning:latest -f dockers/conda/Dockerfile . -``` - -or with specific arguments - -```bash -git clone -docker image build \ - -t pytorch-lightning:base-cuda-py3.9-pt1.10 \ - -f dockers/base-cuda/Dockerfile \ - --build-arg PYTHON_VERSION=3.9 \ - --build-arg PYTORCH_VERSION=1.10 \ - . -``` +git clone https://github.com/Lightning-AI/lightning.git -or nightly version from Conda +# build with the default arguments +docker image build -t pytorch-lightning:latest -f dockers/base-cuda/Dockerfile . 
-```bash -git clone -docker image build \ - -t pytorch-lightning:base-conda-py3.9-pt1.11 \ - -f dockers/base-conda/Dockerfile \ - --build-arg PYTHON_VERSION=3.9 \ - --build-arg PYTORCH_VERSION=1.11 \ - . +# build with specific arguments +docker image build -t pytorch-lightning:base-cuda-py3.9-torch1.11-cuda11.3.1 -f dockers/base-cuda/Dockerfile --build-arg PYTHON_VERSION=3.9 --build-arg PYTORCH_VERSION=1.11 --build-arg CUDA_VERSION=11.3.1 . ``` To run your docker use @@ -49,7 +30,7 @@ docker image rm pytorch-lightning:latest ## Run docker image with GPUs -To run docker image with access to you GPUs you need to install +To run docker image with access to your GPUs, you need to install ```bash # Add the package repositories @@ -61,10 +42,10 @@ sudo apt-get update && sudo apt-get install -y nvidia-container-toolkit sudo systemctl restart docker ``` -and later run the docker image with `--gpus all` so for example +and later run the docker image with `--gpus all`. For example, ``` -docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.10 +docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11-cuda11.3.1 ``` ## Run Jupyter server @@ -73,15 +54,11 @@ Inspiration comes from https://u.group/thinking/how-to-put-jupyter-notebooks-in- 1. Build the docker image: ```bash - docker image build \ - -t pytorch-lightning:v1.3.1 \ - -f dockers/nvidia/Dockerfile \ - --build-arg LIGHTNING_VERSION=1.3.1 \ - . + docker image build -t pytorch-lightning:v1.6.5 -f dockers/nvidia/Dockerfile --build-arg LIGHTNING_VERSION=1.6.5 . ``` 1. start the server and map ports: ```bash - docker run --rm -it --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all -p 8888:8888 pytorch-lightning:v1.3.1 + docker run --rm -it --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all -p 8888:8888 pytorch-lightning:v1.6.5 ``` 1. Connect in local browser: - copy the generated path e.g. 
`http://hostname:8888/?token=0719fa7e1729778b0cec363541a608d5003e26d4910983c6` diff --git a/dockers/release/Dockerfile b/dockers/release/Dockerfile index cb393c91dfbe0..c39e66509188c 100644 --- a/dockers/release/Dockerfile +++ b/dockers/release/Dockerfile @@ -14,8 +14,9 @@ ARG PYTHON_VERSION=3.9 ARG PYTORCH_VERSION=1.11 +ARG CUDA_VERSION=11.3.1 -FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION} +FROM pytorchlightning/pytorch_lightning:base-cuda-py${PYTHON_VERSION}-torch${PYTORCH_VERSION}-cuda${CUDA_VERSION} LABEL maintainer="Lightning-AI " From 2f7daac4b80bc13135f7e14dffcdd0bd3d50a654 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Wed, 10 Aug 2022 13:17:29 +0200 Subject: [PATCH 135/230] Use websockets in e2es (#14138) --- src/lightning_app/cli/lightning_cli.py | 6 +- src/lightning_app/testing/testing.py | 72 ++++++++++++------- src/lightning_app/utilities/app_logs.py | 41 +++++++---- tests/tests_app/utilities/test_app_logs.py | 11 +++ tests/tests_app_examples/test_commands.py | 2 +- .../test_custom_work_dependencies.py | 2 +- tests/tests_app_examples/test_drive.py | 4 +- tests/tests_app_examples/test_idle_timeout.py | 2 +- tests/tests_app_examples/test_payload.py | 2 +- tests/tests_app_examples/test_v0_app.py | 2 +- 10 files changed, 97 insertions(+), 47 deletions(-) create mode 100644 tests/tests_app/utilities/test_app_logs.py diff --git a/src/lightning_app/cli/lightning_cli.py b/src/lightning_app/cli/lightning_cli.py index 45c80d4dcc357..babe0aa2b2abc 100644 --- a/src/lightning_app/cli/lightning_cli.py +++ b/src/lightning_app/cli/lightning_cli.py @@ -136,10 +136,10 @@ def logs(app_name: str, components: List[str], follow: bool) -> None: rich_colors = list(ANSI_COLOR_NAMES) colors = {c: rich_colors[i + 1] for i, c in enumerate(components)} - for component_name, log_event in log_reader: + for log_event in log_reader: date = log_event.timestamp.strftime("%m/%d/%Y %H:%M:%S") - color = colors[component_name] - rich.print(f"[{color}]{component_name}[/{color}] {date} {log_event.message}") + color = colors[log_event.component_name] + rich.print(f"[{color}]{log_event.component_name}[/{color}] {date} {log_event.message}") @_main.command() diff --git a/src/lightning_app/testing/testing.py b/src/lightning_app/testing/testing.py index 74d57db38c427..884c02a0521c1 100644 --- a/src/lightning_app/testing/testing.py +++ b/src/lightning_app/testing/testing.py @@ -1,26 +1,30 @@ import asyncio import json +import logging import os import shutil import subprocess import sys import tempfile import time +import traceback from contextlib import contextmanager from subprocess import Popen from time import sleep -from typing import Any, Callable, Dict, Generator, List, Type +from typing import Any, Callable, Dict, Generator, List, Optional, Type import requests from lightning_cloud.openapi.rest import ApiException from requests import Session from rich import print +from rich.color import ANSI_COLOR_NAMES from lightning_app import LightningApp, LightningFlow from lightning_app.cli.lightning_cli import run_app from lightning_app.core.constants import LIGHTNING_CLOUD_PROJECT_ID from lightning_app.runners.multiprocess import MultiProcessRuntime from lightning_app.testing.config import Config +from lightning_app.utilities.app_logs import _app_logs_reader from lightning_app.utilities.cloud import _get_project from lightning_app.utilities.enum import CacheCallsKeys from lightning_app.utilities.imports import _is_playwright_available, requires @@ -32,6 +36,9 @@ from 
playwright.sync_api import HttpCredentials, sync_playwright +_logger = logging.getLogger(__name__) + + class LightningTestApp(LightningApp): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -282,20 +289,6 @@ def run_app_in_cloud(app_folder: str, app_name: str = "app.py", extra_args: [str var scrollingElement = (document.scrollingElement || document.body); scrollingElement.scrollTop = scrollingElement.scrollHeight; }, 200); - - if (!window._logs) { - window._logs = []; - } - - if (window.logTerminals) { - Object.entries(window.logTerminals).forEach( - ([key, value]) => { - window.logTerminals[key]._onLightningWritelnHandler = function (data) { - window._logs = window._logs.concat([data]); - } - } - ); - } """ ) @@ -309,8 +302,46 @@ def run_app_in_cloud(app_folder: str, app_name: str = "app.py", extra_args: [str except (playwright._impl._api_types.Error, playwright._impl._api_types.TimeoutError): pass - def fetch_logs() -> str: - return admin_page.evaluate("window._logs;") + client = LightningClient() + project = _get_project(client) + identifiers = [] + rich_colors = list(ANSI_COLOR_NAMES) + + def fetch_logs(component_names: Optional[List[str]] = None) -> Generator: + """This methods creates websockets connection in threads and returns the logs to the main thread.""" + app_id = admin_page.url.split("/")[-1] + + if not component_names: + works = client.lightningwork_service_list_lightningwork( + project_id=project.project_id, + app_id=app_id, + ).lightningworks + component_names = ["flow"] + [w.name for w in works] + + def on_error_callback(ws_app, *_): + print(traceback.print_exc()) + ws_app.close() + + colors = {c: rich_colors[i + 1] for i, c in enumerate(component_names)} + gen = _app_logs_reader( + client=client, + project_id=project.project_id, + app_id=app_id, + component_names=component_names, + follow=False, + on_error_callback=on_error_callback, + ) + max_length = max(len(c.replace("root.", "")) for c in component_names) + for log_event in gen: + message = log_event.message + identifier = f"{log_event.timestamp}{log_event.message}" + if identifier not in identifiers: + date = log_event.timestamp.strftime("%m/%d/%Y %H:%M:%S") + identifiers.append(identifier) + color = colors[log_event.component_name] + padding = (max_length - len(log_event.component_name)) * " " + print(f"[{color}]{log_event.component_name}{padding}[/{color}] {date} {message}") + yield message # 5. 
Print your application ID print( @@ -323,11 +354,6 @@ def fetch_logs() -> str: pass finally: print("##################################################") - printed_logs = [] - for log in fetch_logs(): - if log not in printed_logs: - printed_logs.append(log) - print(log.split("[0m")[-1]) button = admin_page.locator('[data-cy="stop"]') try: button.wait_for(timeout=3 * 1000) @@ -337,8 +363,6 @@ def fetch_logs() -> str: context.close() browser.close() - client = LightningClient() - project = _get_project(client) list_lightningapps = client.lightningapp_instance_service_list_lightningapp_instances(project.project_id) for lightningapp in list_lightningapps.lightningapps: diff --git a/src/lightning_app/utilities/app_logs.py b/src/lightning_app/utilities/app_logs.py index 4a7af9b5c5143..536fbaae05093 100644 --- a/src/lightning_app/utilities/app_logs.py +++ b/src/lightning_app/utilities/app_logs.py @@ -5,7 +5,7 @@ from datetime import datetime, timedelta from json import JSONDecodeError from threading import Thread -from typing import Iterator, List, Optional, Tuple +from typing import Callable, Iterator, List, Optional import dateutil.parser from websocket import WebSocketApp @@ -30,10 +30,17 @@ class _LogEventLabels: class _LogEvent: message: str timestamp: datetime + component_name: str labels: _LogEventLabels + def __ge__(self, other: "_LogEvent") -> bool: + return self.timestamp >= other.timestamp -def _push_logevents_to_read_queue_callback(component_name: str, read_queue: queue.PriorityQueue): + def __gt__(self, other: "_LogEvent") -> bool: + return self.timestamp > other.timestamp + + +def _push_log_events_to_read_queue_callback(component_name: str, read_queue: queue.PriorityQueue): """Pushes _LogEvents from websocket to read_queue. Returns callback function used with `on_message_callback` of websocket.WebSocketApp. 
@@ -43,13 +50,17 @@ def callback(ws_app: WebSocketApp, msg: str): # We strongly trust that the contract on API will hold atm :D event_dict = json.loads(msg) labels = _LogEventLabels(**event_dict["labels"]) + if "message" in event_dict: + message = event_dict["message"] + timestamp = dateutil.parser.isoparse(event_dict["timestamp"]) event = _LogEvent( - message=event_dict["message"], - timestamp=dateutil.parser.isoparse(event_dict["timestamp"]), + message=message, + timestamp=timestamp, + component_name=component_name, labels=labels, ) - read_queue.put((event.timestamp, component_name, event)) + read_queue.put(event) return callback @@ -66,8 +77,13 @@ def _error_callback(ws_app: WebSocketApp, error: Exception): def _app_logs_reader( - client: LightningClient, project_id: str, app_id: str, component_names: List[str], follow: bool -) -> Iterator[Tuple[str, _LogEvent]]: + client: LightningClient, + project_id: str, + app_id: str, + component_names: List[str], + follow: bool, + on_error_callback: Optional[Callable] = None, +) -> Iterator[_LogEvent]: read_queue = queue.PriorityQueue() logs_api_client = _LightningLogsSocketAPI(client.api_client) @@ -78,8 +94,8 @@ def _app_logs_reader( project_id=project_id, app_id=app_id, component=component_name, - on_message_callback=_push_logevents_to_read_queue_callback(component_name, read_queue), - on_error_callback=_error_callback, + on_message_callback=_push_log_events_to_read_queue_callback(component_name, read_queue), + on_error_callback=on_error_callback or _error_callback, ) for component_name in component_names ] @@ -92,20 +108,19 @@ def _app_logs_reader( for th in log_threads: th.start() + # Print logs from queue when log event is available user_log_start = "<<< BEGIN USER_RUN_FLOW SECTION >>>" start_timestamp = None # Print logs from queue when log event is available try: while True: - _, component_name, log_event = read_queue.get(timeout=None if follow else 1.0) - log_event: _LogEvent - + log_event = read_queue.get(timeout=None if follow else 1.0) if user_log_start in log_event.message: start_timestamp = log_event.timestamp + timedelta(seconds=0.5) if start_timestamp and log_event.timestamp > start_timestamp: - yield component_name, log_event + yield log_event except queue.Empty: # Empty is raised by queue.get if timeout is reached. Follow = False case. 
diff --git a/tests/tests_app/utilities/test_app_logs.py b/tests/tests_app/utilities/test_app_logs.py new file mode 100644 index 0000000000000..e7384dd72d6e2 --- /dev/null +++ b/tests/tests_app/utilities/test_app_logs.py @@ -0,0 +1,11 @@ +from datetime import datetime +from unittest.mock import MagicMock + +from lightning_app.utilities.app_logs import _LogEvent + + +def test_log_event(): + event_1 = _LogEvent("", datetime.now(), MagicMock(), MagicMock()) + event_2 = _LogEvent("", datetime.now(), MagicMock(), MagicMock()) + assert event_1 < event_2 + assert event_1 <= event_2 diff --git a/tests/tests_app_examples/test_commands.py b/tests/tests_app_examples/test_commands.py index 266f0305c7604..236e587e23101 100644 --- a/tests/tests_app_examples/test_commands.py +++ b/tests/tests_app_examples/test_commands.py @@ -26,7 +26,7 @@ def test_commands_example_cloud() -> None: has_logs = False while not has_logs: - for log in fetch_logs(): + for log in fetch_logs(["flow"]): if "['something', 'else']" in log: has_logs = True sleep(1) diff --git a/tests/tests_app_examples/test_custom_work_dependencies.py b/tests/tests_app_examples/test_custom_work_dependencies.py index d7c9db5ef610a..b8971e0ef2148 100644 --- a/tests/tests_app_examples/test_custom_work_dependencies.py +++ b/tests/tests_app_examples/test_custom_work_dependencies.py @@ -16,7 +16,7 @@ def test_custom_work_dependencies_example_cloud() -> None: ) as (_, _, fetch_logs, _): has_logs = False while not has_logs: - for log in fetch_logs(): + for log in fetch_logs(["flow"]): if "Custom Work Dependency checker End" in log: has_logs = True sleep(1) diff --git a/tests/tests_app_examples/test_drive.py b/tests/tests_app_examples/test_drive.py index 14efc3458716e..630e76b550e9e 100644 --- a/tests/tests_app_examples/test_drive.py +++ b/tests/tests_app_examples/test_drive.py @@ -11,14 +11,14 @@ def test_drive_example_cloud() -> None: with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "examples/app_drive")) as ( _, - view_page, + _, fetch_logs, _, ): has_logs = False while not has_logs: - for log in fetch_logs(): + for log in fetch_logs(["flow"]): if "Application End!" in log: has_logs = True sleep(1) diff --git a/tests/tests_app_examples/test_idle_timeout.py b/tests/tests_app_examples/test_idle_timeout.py index a39ae3f693f7a..f06181ce86ed3 100644 --- a/tests/tests_app_examples/test_idle_timeout.py +++ b/tests/tests_app_examples/test_idle_timeout.py @@ -17,7 +17,7 @@ def test_idle_timeout_example_cloud() -> None: ): has_logs = False while not has_logs: - for log in fetch_logs(): + for log in fetch_logs(["flow"]): if "Application End" in log: has_logs = True sleep(1) diff --git a/tests/tests_app_examples/test_payload.py b/tests/tests_app_examples/test_payload.py index 58fc28a4a8d3c..b40b8ca52defd 100644 --- a/tests/tests_app_examples/test_payload.py +++ b/tests/tests_app_examples/test_payload.py @@ -13,7 +13,7 @@ def test_payload_example_cloud() -> None: has_logs = False while not has_logs: - for log in fetch_logs(): + for log in fetch_logs(["flow"]): if "Application End!" 
in log: has_logs = True sleep(1) diff --git a/tests/tests_app_examples/test_v0_app.py b/tests/tests_app_examples/test_v0_app.py index acc9e285c4d79..026c45a4e1ba1 100644 --- a/tests/tests_app_examples/test_v0_app.py +++ b/tests/tests_app_examples/test_v0_app.py @@ -45,7 +45,7 @@ def check_content(button_name, text_content): wait_for(view_page, check_content, "TAB_2", "Hello from component B") has_logs = False while not has_logs: - for log in fetch_logs(): + for log in fetch_logs(["flow"]): if "'a': 'a', 'b': 'b'" in log: has_logs = True sleep(1) From b8b8f033fd55db6c03e28ced1ddc2b49f6c8b770 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Wed, 10 Aug 2022 14:56:41 +0200 Subject: [PATCH 136/230] (app) Run the flow only if the state has updated 1/2 (#14076) --- src/lightning_app/CHANGELOG.md | 2 + src/lightning_app/core/app.py | 31 +++++++++++----- src/lightning_app/utilities/app_helpers.py | 7 ++-- src/lightning_app/utilities/commands/base.py | 1 + src/lightning_app/utilities/scheduler.py | 2 +- tests/tests_app/core/test_lightning_app.py | 39 +++++++++++++++++++- tests/tests_app/core/test_lightning_flow.py | 21 +++++------ tests/tests_app/utilities/test_commands.py | 4 +- 8 files changed, 78 insertions(+), 29 deletions(-) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index ba8cdd796c5bb..f32d07697f376 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -24,6 +24,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added `LIGHTNING_` prefix to Platform AWS credentials ([#13703](https://github.com/Lightning-AI/lightning/pull/13703)) +- Run the flow only if the state has changed from the previous execution ([#14076](https://github.com/Lightning-AI/lightning/pull/14076)) + ### Deprecated ### Fixed diff --git a/src/lightning_app/core/app.py b/src/lightning_app/core/app.py index 584f94285c219..3f9e2521eb21d 100644 --- a/src/lightning_app/core/app.py +++ b/src/lightning_app/core/app.py @@ -15,7 +15,7 @@ from lightning_app.core.queues import BaseQueue, SingleProcessQueue from lightning_app.frontend import Frontend from lightning_app.storage.path import storage_root_dir -from lightning_app.utilities.app_helpers import _delta_to_appstate_delta, _LightningAppRef +from lightning_app.utilities.app_helpers import _delta_to_app_state_delta, _LightningAppRef from lightning_app.utilities.commands.base import _populate_commands_endpoint, _process_command_requests from lightning_app.utilities.component import _convert_paths_after_init from lightning_app.utilities.enum import AppStage, CacheCallsKeys @@ -94,7 +94,7 @@ def __init__( self.processes: t.Dict[str, WorkManager] = {} self.frontends: t.Dict[str, Frontend] = {} self.stage = AppStage.RUNNING - self._has_updated: bool = False + self._has_updated: bool = True self._schedules: t.Dict[str, t.Dict] = {} self.threads: t.List[threading.Thread] = [] @@ -278,7 +278,7 @@ def _collect_deltas_from_ui_and_work_queues(self) -> t.List[Delta]: if component_output: logger.debug(f"Received from {component_output.id} : {component_output.delta.to_dict()}") work = self.get_component_by_name(component_output.id) - new_work_delta = _delta_to_appstate_delta(self.root, work, deepcopy(component_output.delta)) + new_work_delta = _delta_to_app_state_delta(self.root, work, deepcopy(component_output.delta)) deltas.append(new_work_delta) else: should_get_component_output = False @@ -307,9 +307,11 @@ def maybe_apply_changes(self) -> bool: if not deltas: # When no deltas are received from 
the Rest API or work queues, # we need to check if the flow modified the state and populate changes. - if Delta(DeepDiff(self.last_state, self.state, verbose_level=2)).to_dict(): + deep_diff = DeepDiff(self.last_state, self.state, verbose_level=2) + if deep_diff: + # TODO: Resolve changes with ``CacheMissException``. # new_state = self.populate_changes(self.last_state, self.state) - self.set_state(self.state) + self.set_last_state(self.state) self._has_updated = True return False @@ -329,7 +331,6 @@ def maybe_apply_changes(self) -> bool: def run_once(self): """Method used to collect changes and run the root Flow once.""" done = False - self._has_updated = False self._last_run_time = 0.0 if self.backend is not None: @@ -352,17 +353,23 @@ def run_once(self): _process_command_requests(self) + t0 = time() + try: self.check_error_queue() - t0 = time() - self.root.run() - self._last_run_time = time() - t0 + # Execute the flow only if: + # - There are state changes + # - It is the first execution of the flow + if self._has_updated: + self.root.run() except CacheMissException: self._on_cache_miss_exception() except (ExitAppException, KeyboardInterrupt): done = True self.stage = AppStage.STOPPING + self._last_run_time = time() - t0 + self.on_run_once_end() return done @@ -414,6 +421,8 @@ def _run(self) -> bool: if self._has_updated and self.should_publish_changes_to_api and self.api_publish_state_queue: self.api_publish_state_queue.put(self.state_vars) + self._has_updated = False + return True def _update_layout(self) -> None: @@ -430,8 +439,10 @@ def _apply_restarting(self) -> bool: self.stage = AppStage.BLOCKING return False - def _has_work_finished(self, work): + def _has_work_finished(self, work) -> bool: latest_call_hash = work._calls[CacheCallsKeys.LATEST_CALL_HASH] + if latest_call_hash is None: + return False return "ret" in work._calls[latest_call_hash] def _collect_work_finish_status(self) -> dict: diff --git a/src/lightning_app/utilities/app_helpers.py b/src/lightning_app/utilities/app_helpers.py index 4144c6de3ba12..faa612bba1998 100644 --- a/src/lightning_app/utilities/app_helpers.py +++ b/src/lightning_app/utilities/app_helpers.py @@ -299,7 +299,7 @@ def _set_child_name(component: "Component", child: "Component", new_name: str) - return child_name -def _delta_to_appstate_delta(root: "LightningFlow", component: "Component", delta: Delta) -> Delta: +def _delta_to_app_state_delta(root: "LightningFlow", component: "Component", delta: Delta) -> Delta: delta_dict = delta.to_dict() for changed in delta_dict.values(): for delta_key in changed.copy().keys(): @@ -322,8 +322,9 @@ def _delta_to_appstate_delta(root: "LightningFlow", component: "Component", delt delta_key_without_root = delta_key[4:] # the first 4 chars are the word 'root', strip it new_key = new_prefix + delta_key_without_root - changed[new_key] = val - del changed[delta_key] + if new_key != delta_key: + changed[new_key] = val + del changed[delta_key] return Delta(delta_dict) diff --git a/src/lightning_app/utilities/commands/base.py b/src/lightning_app/utilities/commands/base.py index 11661e51ca26a..b87b41b05df42 100644 --- a/src/lightning_app/utilities/commands/base.py +++ b/src/lightning_app/utilities/commands/base.py @@ -243,3 +243,4 @@ def _process_command_requests(app): # Validation is done on the CLI side. 
response = method(**command_query["command_arguments"]) app.commands_responses_queue.put({"response": response, "id": command_query["id"]}) + app._has_updated = True diff --git a/src/lightning_app/utilities/scheduler.py b/src/lightning_app/utilities/scheduler.py index 012930f017f20..e45b0879246b9 100644 --- a/src/lightning_app/utilities/scheduler.py +++ b/src/lightning_app/utilities/scheduler.py @@ -15,7 +15,7 @@ class SchedulerThread(threading.Thread): def __init__(self, app) -> None: super().__init__(daemon=True) self._exit_event = threading.Event() - self._sleep_time = 0.5 + self._sleep_time = 1.0 self._app = app def run(self) -> None: diff --git a/tests/tests_app/core/test_lightning_app.py b/tests/tests_app/core/test_lightning_app.py index e6c715f87ef03..3776481965be3 100644 --- a/tests/tests_app/core/test_lightning_app.py +++ b/tests/tests_app/core/test_lightning_app.py @@ -1,3 +1,4 @@ +import logging import os import pickle from time import sleep @@ -27,6 +28,8 @@ from lightning_app.utilities.redis import check_if_redis_running from lightning_app.utilities.warnings import LightningFlowWarning +logger = logging.getLogger() + class B1(LightningFlow): def __init__(self): @@ -439,19 +442,25 @@ def __init__(self): self.counter = 0 def run(self): - self.counter = 1 + if self.counter < 2: + self.counter += 1 def test_maybe_apply_changes_from_flow(): """This test validates the app `_updated` is set to True only if the state was changed in the flow.""" app = LightningApp(SimpleFlow()) - assert not app._has_updated + assert app._has_updated app.maybe_apply_changes() app.root.run() app.maybe_apply_changes() assert app._has_updated app._has_updated = False + app.root.run() + app.maybe_apply_changes() + assert app._has_updated + app._has_updated = False + app.root.run() app.maybe_apply_changes() assert not app._has_updated @@ -920,3 +929,29 @@ def test_state_size_constant_growth(): MultiProcessRuntime(app, start_server=False).dispatch() assert app.root._state_sizes[0] <= 5904 assert app.root._state_sizes[20] <= 23736 + + +class FlowUpdated(LightningFlow): + def run(self): + logger.info("Hello World") + + +class NonUpdatedLightningTestApp(LightningTestApp): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.counter = 0 + + def on_after_run_once(self): + self.counter += 1 + if not self._has_updated and self.counter > 2: + return True + return super().on_after_run_once() + + +def test_non_updated_flow(caplog): + """This tests validate the app can run 3 times and call the flow only once.""" + with caplog.at_level(logging.INFO): + app = NonUpdatedLightningTestApp(FlowUpdated()) + MultiProcessRuntime(app, start_server=False).dispatch() + assert caplog.messages == ["Hello World"] + assert app.counter == 3 diff --git a/tests/tests_app/core/test_lightning_flow.py b/tests/tests_app/core/test_lightning_flow.py index e8ce1222a3186..4c0eb23ea014c 100644 --- a/tests/tests_app/core/test_lightning_flow.py +++ b/tests/tests_app/core/test_lightning_flow.py @@ -16,7 +16,7 @@ from lightning_app.storage import Path from lightning_app.storage.path import storage_root_dir from lightning_app.testing.helpers import EmptyFlow, EmptyWork -from lightning_app.utilities.app_helpers import _delta_to_appstate_delta, _LightningAppRef +from lightning_app.utilities.app_helpers import _delta_to_app_state_delta, _LightningAppRef from lightning_app.utilities.enum import CacheCallsKeys from lightning_app.utilities.exceptions import ExitAppException @@ -416,7 +416,7 @@ def run(self): flow_a.work.counter = 
1 work_state_2 = flow_a.work.state delta = Delta(DeepDiff(work_state, work_state_2, verbose_level=2)) - delta = _delta_to_appstate_delta(flow_a, flow_a.work, delta) + delta = _delta_to_app_state_delta(flow_a, flow_a.work, delta) new_flow_state = LightningApp.populate_changes(flow_state, flow_state + delta) flow_a.set_state(new_flow_state) assert flow_a.work.counter == 1 @@ -592,24 +592,23 @@ def run(self): class FlowSchedule(LightningFlow): def __init__(self): super().__init__() - self._last_time = None + self._last_times = [] + self.target = 3 + self.seconds = ",".join([str(v) for v in range(0, 60, self.target)]) def run(self): - if self.schedule("* * * * * 0,5,10,15,20,25,30,35,40,45,50,55"): - if self._last_time is None: - self._last_time = False - elif not self._last_time: - self._last_time = time() + if self.schedule(f"* * * * * {self.seconds}"): + if len(self._last_times) < 3: + self._last_times.append(time()) else: - # TODO (tchaton) Optimize flow execution. - assert 4.0 < abs(time() - self._last_time) < 6.0 + assert abs((time() - self._last_times[-1]) - self.target) < 3 self._exit() def test_scheduling_api(): app = LightningApp(FlowSchedule()) - MultiProcessRuntime(app).dispatch() + MultiProcessRuntime(app, start_server=True).dispatch() def test_lightning_flow(): diff --git a/tests/tests_app/utilities/test_commands.py b/tests/tests_app/utilities/test_commands.py index 1e8e36ed09545..ed7f386395282 100644 --- a/tests/tests_app/utilities/test_commands.py +++ b/tests/tests_app/utilities/test_commands.py @@ -44,7 +44,7 @@ def __init__(self): def run(self): if self.has_sweep and len(self.names) == 1: - sleep(2) + sleep(1) self._exit() def trigger_method(self, name: str): @@ -156,7 +156,7 @@ def test_configure_commands(monkeypatch): monkeypatch.setattr(sys, "argv", ["lightning", "sweep", "--sweep_name", "my_name", "--num_trials", "1"]) app_command() time_left = 15 - while time_left > 0 or process.exitcode is None: + while time_left > 0 and process.exitcode != 0: sleep(0.1) time_left -= 0.1 assert process.exitcode == 0 From cda381a626719d965d85f9034993cae1f4227f29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 10 Aug 2022 15:03:53 +0200 Subject: [PATCH 137/230] Update changelog after 1.7.1 release (#14127) --- src/pytorch_lightning/CHANGELOG.md | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index b405665b9df88..baf98d81a7733 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -52,39 +52,29 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Fixed -- Casted only floating point tensors to fp16 with IPUs ([#13983](https://github.com/Lightning-AI/lightning/pull/13983)) +- Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061)) -- Casted tensors to fp16 before moving them to device with `DeepSpeedStrategy` ([#14000](https://github.com/Lightning-AI/lightning/pull/14000)) +- Fixed resuming from a checkpoint when using Stochastic Weight Averaging (SWA) ([#9938](https://github.com/Lightning-AI/lightning/pull/9938)) -- Fixed the `NeptuneLogger` dependency being unrecognized ([#13988](https://github.com/Lightning-AI/lightning/pull/13988)) +- Fixed the device placement when `LightningModule.cuda()` gets called without specifying a device index and the current cuda device was not 0 ([#14128](https://github.com/Lightning-AI/lightning/pull/14128)) -- Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061)) +## [1.7.1] - 2022-08-09 +### Fixed +- Casted only floating point tensors to fp16 with IPUs ([#13983](https://github.com/Lightning-AI/lightning/pull/13983)) +- Casted tensors to fp16 before moving them to device with `DeepSpeedStrategy` ([#14000](https://github.com/Lightning-AI/lightning/pull/14000)) +- Fixed the `NeptuneLogger` dependency being unrecognized ([#13988](https://github.com/Lightning-AI/lightning/pull/13988)) - Fixed an issue where users would be warned about unset `max_epochs` even when `fast_dev_run` was set ([#13262](https://github.com/Lightning-AI/lightning/pull/13262)) - - - Fixed MPS device being unrecognized ([#13992](https://github.com/Lightning-AI/lightning/pull/13992)) - - - Fixed incorrect `precision="mixed"` being used with `DeepSpeedStrategy` and `IPUStrategy` ([#14041](https://github.com/Lightning-AI/lightning/pull/14041)) - - -- Fixed resuming from a checkpoint when using Stochastic Weight Averaging (SWA) ([#9938](https://github.com/Lightning-AI/lightning/pull/9938)) - - - Fixed dtype inference during gradient norm computation ([#14051](https://github.com/Lightning-AI/lightning/pull/14051)) - - - Fixed a bug that caused `ddp_find_unused_parameters` to be set `False`, whereas the intended default is `True` ([#14095](https://github.com/Lightning-AI/lightning/pull/14095)) -- Fixed the device placement when `LightningModule.cuda()` gets called without specifying a device index and the current cuda device was not 0 ([#14128](https://github.com/Lightning-AI/lightning/pull/14128)) - - ## [1.7.0] - 2022-08-02 ### Added From 58014846ee0fb54b92e4bfb4c0965b72bc0a9641 Mon Sep 17 00:00:00 2001 From: Krishna Kalyan Date: Wed, 10 Aug 2022 14:32:12 +0100 Subject: [PATCH 138/230] Update Grid links to Lightning AI (#14081) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * initial changes for lightning * Update .github/BECOMING_A_CORE_CONTRIBUTOR.md Co-authored-by: Adrian Wälchli Co-authored-by: Adrian Wälchli --- .github/BECOMING_A_CORE_CONTRIBUTOR.md | 2 +- SECURITY.md | 2 +- src/pytorch_lightning/README.md | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/BECOMING_A_CORE_CONTRIBUTOR.md b/.github/BECOMING_A_CORE_CONTRIBUTOR.md index a179161f687a1..fd40e29e1ebf1 100644 --- a/.github/BECOMING_A_CORE_CONTRIBUTOR.md +++ b/.github/BECOMING_A_CORE_CONTRIBUTOR.md @@ -62,4 +62,4 @@ We are on the lookout for new people to join, however, if you feel like you meet ## Employment -You can also become a 
[Grid.ai](https://www.grid.ai) employee or intern and work on Lightning. To get started, you can email `careers@grid.ai` with your resume or check out our [open job postings](https://boards.greenhouse.io/gridai). +You can also become a [Lightning AI](https://lightning.ai/) employee or intern and work on Lightning. To get started, you can email `careers@lightning.ai` with your resume or check out our [open job postings](https://boards.greenhouse.io/lightningai). diff --git a/SECURITY.md b/SECURITY.md index 8f265f26be452..862563f84e2fe 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,2 +1,2 @@ -developer@grid.ai +developer@lightning.ai developer@pytorchlightning.ai diff --git a/src/pytorch_lightning/README.md b/src/pytorch_lightning/README.md index b57aea6fae147..914596c0a9d2f 100644 --- a/src/pytorch_lightning/README.md +++ b/src/pytorch_lightning/README.md @@ -14,8 +14,8 @@ ______________________________________________________________________ DocsExamplesCommunity • - Grid AI • - License + Lightning AI • + License

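Note on the patch that follows (#14052): `importlib.metadata.entry_points` changed its query API across Python versions, and the version guards in that patch exist to avoid the 3.10 deprecation warning for dict-style access. Below is a minimal, self-contained sketch of the three code paths being selected between; the group name is the real Lightning entry-point group used in the patch, but the snippet itself is illustrative and not part of any patch in this series.

import sys

GROUP = "pytorch_lightning.callbacks_factory"

if sys.version_info >= (3, 10):
    # Python 3.10+ accepts keyword selection; dict-style access is deprecated.
    from importlib.metadata import entry_points

    factories = entry_points(group=GROUP)
elif sys.version_info >= (3, 8):
    # Python 3.8/3.9: entry_points() returns a mapping of group name to entry points.
    from importlib.metadata import entry_points

    factories = entry_points().get(GROUP, ())
else:
    # Python 3.7 and older: fall back to pkg_resources.
    from pkg_resources import iter_entry_points

    factories = iter_entry_points(GROUP)

for factory in factories:
    loaded = factory.load()  # each entry point resolves to a callback factory callable
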
From 4e87a44002a91c869f43c0929d29fa8600f14f15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 10 Aug 2022 17:15:35 +0200 Subject: [PATCH 139/230] Avoid entry_points deprecation warning (#14052) Co-authored-by: Adam J. Stewart Co-authored-by: Akihiro Nitta --- src/pytorch_lightning/CHANGELOG.md | 6 ++++++ .../trainer/connectors/callback_connector.py | 11 ++++++++--- src/pytorch_lightning/utilities/imports.py | 1 + 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index baf98d81a7733..90285b55c8037 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -61,6 +61,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed the device placement when `LightningModule.cuda()` gets called without specifying a device index and the current cuda device was not 0 ([#14128](https://github.com/Lightning-AI/lightning/pull/14128)) +- Avoid `metadata.entry_points` deprecation warning on Python 3.10 ([#14052](https://github.com/Lightning-AI/lightning/pull/14052)) + + +- Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061)) + + ## [1.7.1] - 2022-08-09 ### Fixed diff --git a/src/pytorch_lightning/trainer/connectors/callback_connector.py b/src/pytorch_lightning/trainer/connectors/callback_connector.py index bb7f912420256..32d67d44ad44c 100644 --- a/src/pytorch_lightning/trainer/connectors/callback_connector.py +++ b/src/pytorch_lightning/trainer/connectors/callback_connector.py @@ -31,7 +31,7 @@ from pytorch_lightning.callbacks.rich_model_summary import RichModelSummary from pytorch_lightning.callbacks.timer import Timer from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _PYTHON_GREATER_EQUAL_3_8_0 +from pytorch_lightning.utilities.imports import _PYTHON_GREATER_EQUAL_3_8_0, _PYTHON_GREATER_EQUAL_3_10_0 from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation, rank_zero_info _log = logging.getLogger(__name__) @@ -260,14 +260,19 @@ def _configure_external_callbacks() -> List[Callback]: Return: A list of all callbacks collected from external factories. 
""" + group = "pytorch_lightning.callbacks_factory" + if _PYTHON_GREATER_EQUAL_3_8_0: from importlib.metadata import entry_points - factories = entry_points().get("pytorch_lightning.callbacks_factory", ()) + if _PYTHON_GREATER_EQUAL_3_10_0: + factories = entry_points(group=group) # type: ignore[call-arg] + else: + factories = entry_points().get(group, {}) # type: ignore[assignment] else: from pkg_resources import iter_entry_points - factories = iter_entry_points("pytorch_lightning.callbacks_factory") # type: ignore[assignment] + factories = iter_entry_points(group) # type: ignore[assignment] external_callbacks: List[Callback] = [] for factory in factories: diff --git a/src/pytorch_lightning/utilities/imports.py b/src/pytorch_lightning/utilities/imports.py index 67bf75be3c4d3..ba437ad332dfa 100644 --- a/src/pytorch_lightning/utilities/imports.py +++ b/src/pytorch_lightning/utilities/imports.py @@ -124,6 +124,7 @@ def __repr__(self) -> str: _IS_WINDOWS = platform.system() == "Windows" _IS_INTERACTIVE = hasattr(sys, "ps1") # https://stackoverflow.com/a/64523765 _PYTHON_GREATER_EQUAL_3_8_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 8) +_PYTHON_GREATER_EQUAL_3_10_0 = (sys.version_info.major, sys.version_info.minor) >= (3, 10) _TORCH_GREATER_EQUAL_1_9_1 = _compare_version("torch", operator.ge, "1.9.1") _TORCH_GREATER_EQUAL_1_10 = _compare_version("torch", operator.ge, "1.10.0") _TORCH_LESSER_EQUAL_1_10_2 = _compare_version("torch", operator.le, "1.10.2") From 9b61b1c482cb8be569e664647a577730e55680c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 10 Aug 2022 17:21:05 +0200 Subject: [PATCH 140/230] Remove duplicated test classes (#14122) Remove duplicated classes --- .../progress/test_rich_progress_bar.py | 3 +- .../callbacks/test_stochastic_weight_avg.py | 3 +- tests/tests_pytorch/helpers/datasets.py | 39 +------------------ .../strategies/test_deepspeed_strategy.py | 3 +- .../trainer/flags/test_val_check_interval.py | 3 +- .../logging_/test_train_loop_logging.py | 3 +- .../test_estimated_stepping_batches.py | 3 +- .../tests_pytorch/trainer/test_dataloaders.py | 8 +++- tests/tests_pytorch/trainer/test_trainer.py | 8 +++- tests/tests_pytorch/utilities/test_data.py | 3 +- 10 files changed, 20 insertions(+), 56 deletions(-) diff --git a/tests/tests_pytorch/callbacks/progress/test_rich_progress_bar.py b/tests/tests_pytorch/callbacks/progress/test_rich_progress_bar.py index e9374f8ea4be1..f1ccf2a2726a2 100644 --- a/tests/tests_pytorch/callbacks/progress/test_rich_progress_bar.py +++ b/tests/tests_pytorch/callbacks/progress/test_rich_progress_bar.py @@ -21,8 +21,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ProgressBarBase, RichProgressBar from pytorch_lightning.callbacks.progress.rich_progress import RichProgressBarTheme -from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset -from tests_pytorch.helpers.datasets import RandomIterableDataset +from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset, RandomIterableDataset from tests_pytorch.helpers.runif import RunIf diff --git a/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py b/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py index 65a0fea2fb4a5..7f1692e30a3f2 100644 --- a/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py +++ b/tests/tests_pytorch/callbacks/test_stochastic_weight_avg.py @@ -26,10 +26,9 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import 
StochasticWeightAveraging -from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset +from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset, RandomIterableDataset from pytorch_lightning.strategies import DDPSpawnStrategy, Strategy from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests_pytorch.helpers.datasets import RandomIterableDataset from tests_pytorch.helpers.runif import RunIf diff --git a/tests/tests_pytorch/helpers/datasets.py b/tests/tests_pytorch/helpers/datasets.py index 3443020d4528f..c9d185313e85e 100644 --- a/tests/tests_pytorch/helpers/datasets.py +++ b/tests/tests_pytorch/helpers/datasets.py @@ -19,7 +19,7 @@ from typing import Optional, Sequence, Tuple import torch -from torch.utils.data import Dataset, IterableDataset +from torch.utils.data import Dataset class MNIST(Dataset): @@ -212,40 +212,3 @@ def __getitem__(self, idx): def __len__(self): return len(self.y) - - -class RandomDictDataset(Dataset): - def __init__(self, size: int, length: int): - self.len = length - self.data = torch.randn(length, size) - - def __getitem__(self, index): - a = self.data[index] - b = a + 2 - return {"a": a, "b": b} - - def __len__(self): - return self.len - - -class RandomIterableDataset(IterableDataset): - def __init__(self, size: int, count: int): - self.count = count - self.size = size - - def __iter__(self): - for _ in range(self.count): - yield torch.randn(self.size) - - -class RandomIterableDatasetWithLen(IterableDataset): - def __init__(self, size: int, count: int): - self.count = count - self.size = size - - def __iter__(self): - for _ in range(len(self)): - yield torch.randn(self.size) - - def __len__(self): - return self.count diff --git a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py index 272b03a846688..e3c6f95f3ff47 100644 --- a/tests/tests_pytorch/strategies/test_deepspeed_strategy.py +++ b/tests/tests_pytorch/strategies/test_deepspeed_strategy.py @@ -28,13 +28,12 @@ from pytorch_lightning import LightningDataModule, LightningModule, Trainer from pytorch_lightning.callbacks import Callback, LearningRateMonitor, ModelCheckpoint -from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset +from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset, RandomIterableDataset from pytorch_lightning.plugins import DeepSpeedPrecisionPlugin from pytorch_lightning.strategies import DeepSpeedStrategy from pytorch_lightning.strategies.deepspeed import _DEEPSPEED_AVAILABLE, LightningDeepSpeedModule from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests_pytorch.helpers.datamodules import ClassifDataModule -from tests_pytorch.helpers.datasets import RandomIterableDataset from tests_pytorch.helpers.runif import RunIf if _DEEPSPEED_AVAILABLE: diff --git a/tests/tests_pytorch/trainer/flags/test_val_check_interval.py b/tests/tests_pytorch/trainer/flags/test_val_check_interval.py index 9414fd1c5096f..e5fd9b5dd2706 100644 --- a/tests/tests_pytorch/trainer/flags/test_val_check_interval.py +++ b/tests/tests_pytorch/trainer/flags/test_val_check_interval.py @@ -16,10 +16,9 @@ import pytest from torch.utils.data import DataLoader -from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset +from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset, RandomIterableDataset from pytorch_lightning.trainer.trainer import Trainer from 
pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests_pytorch.helpers.datasets import RandomIterableDataset @pytest.mark.parametrize("max_epochs", [1, 2, 3]) diff --git a/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py b/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py index d16be306b9365..85ed3d8e3471d 100644 --- a/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py +++ b/tests/tests_pytorch/trainer/logging_/test_train_loop_logging.py @@ -28,9 +28,8 @@ from pytorch_lightning import callbacks, Trainer from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, TQDMProgressBar from pytorch_lightning.core.module import LightningModule -from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset +from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset, RandomDictDataset from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests_pytorch.helpers.datasets import RandomDictDataset from tests_pytorch.helpers.runif import RunIf diff --git a/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py b/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py index 92a1126294dfc..846a39a748a60 100644 --- a/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py +++ b/tests/tests_pytorch/trainer/properties/test_estimated_stepping_batches.py @@ -22,11 +22,10 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks.gradient_accumulation_scheduler import GradientAccumulationScheduler -from pytorch_lightning.demos.boring_classes import BoringModel +from pytorch_lightning.demos.boring_classes import BoringModel, RandomIterableDataset from pytorch_lightning.strategies.ipu import IPUStrategy from pytorch_lightning.utilities import device_parser from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests_pytorch.helpers.datasets import RandomIterableDataset from tests_pytorch.helpers.runif import RunIf diff --git a/tests/tests_pytorch/trainer/test_dataloaders.py b/tests/tests_pytorch/trainer/test_dataloaders.py index 5bea5a4cbbe1c..34504392dc0c1 100644 --- a/tests/tests_pytorch/trainer/test_dataloaders.py +++ b/tests/tests_pytorch/trainer/test_dataloaders.py @@ -25,12 +25,16 @@ from pytorch_lightning import Callback, seed_everything, Trainer from pytorch_lightning.callbacks import ModelCheckpoint -from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset +from pytorch_lightning.demos.boring_classes import ( + BoringModel, + RandomDataset, + RandomIterableDataset, + RandomIterableDatasetWithLen, +) from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities.data import _auto_add_worker_init_fn, has_iterable_dataset, has_len_all_ranks from pytorch_lightning.utilities.exceptions import MisconfigurationException from tests_pytorch.helpers.dataloaders import CustomInfDataloader, CustomNotImplementedErrorDataloader -from tests_pytorch.helpers.datasets import RandomIterableDataset, RandomIterableDatasetWithLen from tests_pytorch.helpers.runif import RunIf diff --git a/tests/tests_pytorch/trainer/test_trainer.py b/tests/tests_pytorch/trainer/test_trainer.py index e4be8929f9c7e..9506acee425d0 100644 --- a/tests/tests_pytorch/trainer/test_trainer.py +++ b/tests/tests_pytorch/trainer/test_trainer.py @@ -41,7 +41,12 @@ from pytorch_lightning.callbacks.fault_tolerance import _FaultToleranceCheckpoint from 
pytorch_lightning.callbacks.prediction_writer import BasePredictionWriter from pytorch_lightning.core.saving import load_hparams_from_tags_csv, load_hparams_from_yaml, save_hparams_to_tags_csv -from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset +from pytorch_lightning.demos.boring_classes import ( + BoringModel, + RandomDataset, + RandomIterableDataset, + RandomIterableDatasetWithLen, +) from pytorch_lightning.loggers import TensorBoardLogger from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper, UnrepeatedDistributedSampler from pytorch_lightning.strategies import ( @@ -60,7 +65,6 @@ from pytorch_lightning.utilities.imports import _OMEGACONF_AVAILABLE, _TORCH_GREATER_EQUAL_1_12 from pytorch_lightning.utilities.seed import seed_everything from tests_pytorch.helpers.datamodules import ClassifDataModule -from tests_pytorch.helpers.datasets import RandomIterableDataset, RandomIterableDatasetWithLen from tests_pytorch.helpers.runif import RunIf from tests_pytorch.helpers.simple_models import ClassificationModel diff --git a/tests/tests_pytorch/utilities/test_data.py b/tests/tests_pytorch/utilities/test_data.py index ffb898efaa815..3700feaba9992 100644 --- a/tests/tests_pytorch/utilities/test_data.py +++ b/tests/tests_pytorch/utilities/test_data.py @@ -6,7 +6,7 @@ from torch.utils.data import BatchSampler, DataLoader, RandomSampler, SequentialSampler from pytorch_lightning import Trainer -from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset +from pytorch_lightning.demos.boring_classes import BoringModel, RandomDataset, RandomIterableDataset from pytorch_lightning.overrides.distributed import IndexBatchSamplerWrapper from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities.data import ( @@ -23,7 +23,6 @@ warning_cache, ) from pytorch_lightning.utilities.exceptions import MisconfigurationException -from tests_pytorch.helpers.datasets import RandomIterableDataset from tests_pytorch.helpers.utils import no_warning_call From 2abed91c5386ee9434b4e45e859e91d06bef3080 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 10 Aug 2022 17:25:44 +0200 Subject: [PATCH 141/230] Update CODEOWNERS (#14119) * Update CODEOWNERS * Cleanup and remove old sections * pl focus Co-authored-by: Jirka Borovec --- .github/CODEOWNERS | 40 ++++++++++++++---------------- src/pytorch_lightning/__about__.py | 1 - 2 files changed, 18 insertions(+), 23 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index f83924b9566ce..0b4692731bff9 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -5,7 +5,7 @@ # the repo. Unless a later match takes precedence, # @global-owner1 and @global-owner2 will be requested for # review when someone opens a pull request. 
-* @williamfalcon @borda @tchaton @carmocca @awaelchli @justusschock @kaushikb11 @rohitgr7 +* @williamfalcon @borda @tchaton @awaelchli @kaushikb11 @rohitgr7 # CI/CD and configs /.github/ @borda @carmocca @akihironitta @tchaton @@ -26,13 +26,14 @@ /docs/source-app/expertise_levels @williamfalcon @Felonious-Spellfire @RobertLaurella # Packages +/src/pytorch_lightning @carmocca @justusschock /src/pytorch_lightning/accelerators @williamfalcon @tchaton @SeanNaren @awaelchli @justusschock @kaushikb11 /src/pytorch_lightning/callbacks @williamfalcon @tchaton @carmocca @borda @kaushikb11 /src/pytorch_lightning/core @tchaton @borda @carmocca @justusschock @kaushikb11 /src/pytorch_lightning/distributed @williamfalcon @tchaton @awaelchli @kaushikb11 /src/pytorch_lightning/lite @tchaton @awaelchli @carmocca /src/pytorch_lightning/loggers @tchaton @awaelchli @borda -/src/pytorch_lightning/loggers/wandb.py @borisdayma +/src/pytorch_lightning/loggers/wandb.py @borisdayma @borda /src/pytorch_lightning/loggers/neptune.py @shnela @HubertJaworski @pkasprzyk @pitercl @Raalsky @aniezurawski @kamil-kaczmarek /src/pytorch_lightning/loops @tchaton @awaelchli @justusschock @carmocca /src/pytorch_lightning/overrides @tchaton @borda @@ -46,7 +47,7 @@ /src/pytorch_lightning/utilities @borda @tchaton @carmocca /src/lightning_app @tchaton @manskx -/src/lightning_app/cli/pl-app-template @awaelchli @tchaton @Borda +/src/lightning_app/cli/pl-app-template @tchaton @awaelchli @Borda /src/lightning_app/core @tchaton @awaelchli @manskx /src/lightning_app/core/queues.py @tchaton @hhsecond @manskx /src/lightning_app/runners/cloud.py @tchaton @hhsecond @@ -54,28 +55,23 @@ /src/lightning_app/__about__.py @nohalon @edenlightning @lantiga # Examples -/examples/app_* @tchaton @awaelchli @manskx @hhsecond +/examples/app_* @tchaton @awaelchli @manskx @hhsecond # App tests -/tests/tests_app @tchaton @awaelchli @manskx @hhsecond -/tests/tests_app_examples @tchaton @awaelchli @manskx @hhsecond +/tests/tests_app @tchaton @awaelchli @manskx @hhsecond +/tests/tests_app_examples @tchaton @awaelchli @manskx @hhsecond # Specifics -/src/pytorch_lightning/trainer/connectors/logger_connector @tchaton @carmocca -/src/pytorch_lightning/trainer/progress.py @tchaton @awaelchli @carmocca - +/src/pytorch_lightning/trainer/connectors/logger_connector @tchaton @carmocca +/src/pytorch_lightning/trainer/progress.py @tchaton @awaelchli @carmocca # API -/src/pytorch_lightning/callbacks/base.py @williamfalcon @awaelchli @ananthsub @carmocca -/src/pytorch_lightning/core/datamodule.py @williamFalcon @awaelchli @ananthsub @carmocca -/src/pytorch_lightning/trainer/trainer.py @williamfalcon @tchaton @awaelchli -/src/pytorch_lightning/core/hooks.py @williamfalcon @tchaton @awaelchli @ananthsub @carmocca -/src/pytorch_lightning/core/lightning.py @williamfalcon @tchaton @awaelchli - -# Testing -/tests/helpers/boring_model.py @williamfalcon @tchaton @borda +/src/pytorch_lightning/callbacks/callback.py @williamfalcon @awaelchli @ananthsub @carmocca +/src/pytorch_lightning/core/datamodule.py @williamFalcon @awaelchli @ananthsub @carmocca +/src/pytorch_lightning/trainer/trainer.py @williamfalcon @tchaton @awaelchli +/src/pytorch_lightning/core/hooks.py @williamfalcon @tchaton @awaelchli @ananthsub @carmocca +/src/pytorch_lightning/core/module.py @williamfalcon @tchaton @awaelchli -/.github/CODEOWNERS @williamfalcon -/.github/approve_config.yml @williamfalcon -/SECURITY.md @williamfalcon -/README.md @williamfalcon @edenlightning @borda -/setup.py @williamfalcon @borda 
@carmocca +/.github/CODEOWNERS @williamfalcon +/SECURITY.md @williamfalcon +/README.md @williamfalcon @edenlightning @borda +/setup.py @williamfalcon @borda @carmocca /src/pytorch_lightning/__about__.py @williamfalcon @borda @carmocca diff --git a/src/pytorch_lightning/__about__.py b/src/pytorch_lightning/__about__.py index 6d09c5264e1ab..e2fdbd9ee3016 100644 --- a/src/pytorch_lightning/__about__.py +++ b/src/pytorch_lightning/__about__.py @@ -13,7 +13,6 @@ # limitations under the License. import time -# __version__ = "1.7.0" __author__ = "Lightning AI et al." __author_email__ = "pytorch@lightning.ai" __license__ = "Apache-2.0" From 527b28ed974c326f9e86c334b0c5bd477b635f89 Mon Sep 17 00:00:00 2001 From: Krishna Kalyan Date: Wed, 10 Aug 2022 16:26:44 +0100 Subject: [PATCH 142/230] Fix mypy errors attributed to `pytorch_lightning.profilers.simple` (#14103) --- pyproject.toml | 1 - src/pytorch_lightning/profilers/simple.py | 19 +++++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8db782df357d8..b5e806bc69900 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,7 +57,6 @@ module = [ "pytorch_lightning.demos.mnist_datamodule", "pytorch_lightning.profilers.base", "pytorch_lightning.profilers.pytorch", - "pytorch_lightning.profilers.simple", "pytorch_lightning.strategies.sharded", "pytorch_lightning.strategies.sharded_spawn", "pytorch_lightning.trainer.callback_hook", diff --git a/src/pytorch_lightning/profilers/simple.py b/src/pytorch_lightning/profilers/simple.py index 20d76f9b2d378..0fb9497ff17fb 100644 --- a/src/pytorch_lightning/profilers/simple.py +++ b/src/pytorch_lightning/profilers/simple.py @@ -60,7 +60,7 @@ def __init__( """ super().__init__(dirpath=dirpath, filename=filename) self.current_actions: Dict[str, float] = {} - self.recorded_durations = defaultdict(list) + self.recorded_durations: Dict = defaultdict(list) self.extended = extended self.start_time = time.monotonic() @@ -104,20 +104,23 @@ def summary(self) -> str: if len(self.recorded_durations) > 0: max_key = max(len(k) for k in self.recorded_durations.keys()) - def log_row(action, mean, num_calls, total, per): + def log_row_extended(action: str, mean: str, num_calls: str, total: str, per: str) -> str: row = f"{sep}| {action:<{max_key}s}\t| {mean:<15}\t|" row += f" {num_calls:<15}\t| {total:<15}\t| {per:<15}\t|" return row - header_string = log_row("Action", "Mean duration (s)", "Num calls", "Total time (s)", "Percentage %") + header_string = log_row_extended( + "Action", "Mean duration (s)", "Num calls", "Total time (s)", "Percentage %" + ) output_string_len = len(header_string.expandtabs()) sep_lines = f"{sep}{'-' * output_string_len}" output_string += sep_lines + header_string + sep_lines - report, total_calls, total_duration = self._make_report_extended() - output_string += log_row("Total", "-", f"{total_calls:}", f"{total_duration:.5}", "100 %") + report_extended: _TABLE_DATA_EXTENDED + report_extended, total_calls, total_duration = self._make_report_extended() + output_string += log_row_extended("Total", "-", f"{total_calls:}", f"{total_duration:.5}", "100 %") output_string += sep_lines - for action, mean_duration, num_calls, total_duration, duration_per in report: - output_string += log_row( + for action, mean_duration, num_calls, total_duration, duration_per in report_extended: + output_string += log_row_extended( action, f"{mean_duration:.5}", f"{num_calls}", @@ -128,7 +131,7 @@ def log_row(action, mean, num_calls, total, per): else: max_key = 
max(len(k) for k in self.recorded_durations) - def log_row(action, mean, total): + def log_row(action: str, mean: str, total: str) -> str: return f"{sep}| {action:<{max_key}s}\t| {mean:<15}\t| {total:<15}\t|" header_string = log_row("Action", "Mean duration (s)", "Total time (s)") From 6f4edd721f9852d8f4afaa49edd1f80c5fc6dc72 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 10 Aug 2022 09:03:51 -0700 Subject: [PATCH 143/230] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 9c03e3707ec24..2d32094f6595f 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ +### ** NEWS: PyTorch Lightning has been renamed Lightning! In addition to building models, you can now build research workflows and production pipelines** +
From f132d44821f9fe7ad83d74edbb13dc6ee7769a3d Mon Sep 17 00:00:00 2001 From: otaj <6065855+otaj@users.noreply.github.com> Date: Wed, 10 Aug 2022 18:09:50 +0200 Subject: [PATCH 144/230] Fix a bug that caused spurious `AttributeError` when multiple `DataLoader` classes are imported (#14117) --- src/pytorch_lightning/CHANGELOG.md | 3 +++ src/pytorch_lightning/utilities/data.py | 10 +++++---- tests/tests_pytorch/utilities/test_data.py | 25 ++++++++++++++++++++++ 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 90285b55c8037..97bb317b02a14 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -52,6 +52,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed +- Fixed a bug that caused spurious `AttributeError` when multiple `DataLoader` classes are imported ([#14117](https://github.com/Lightning-AI/lightning/pull/14117)) + + - Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061)) diff --git a/src/pytorch_lightning/utilities/data.py b/src/pytorch_lightning/utilities/data.py index 00a7cb8486709..b625a046f6122 100644 --- a/src/pytorch_lightning/utilities/data.py +++ b/src/pytorch_lightning/utilities/data.py @@ -501,15 +501,17 @@ def _replace_init_method(base_cls: Type, store_explicit_arg: Optional[str] = Non It patches the ``__init__`` method. """ classes = _get_all_subclasses(base_cls) | {base_cls} - wrapped = set() for cls in classes: - if cls.__init__ not in wrapped: + # Check that __init__ belongs to the class + # https://stackoverflow.com/a/5253424 + if "__init__" in cls.__dict__: cls._old_init = cls.__init__ cls.__init__ = _wrap_init_method(cls.__init__, store_explicit_arg) - wrapped.add(cls.__init__) yield for cls in classes: - if hasattr(cls, "_old_init"): + # Check that _old_init belongs to the class + # https://stackoverflow.com/a/5253424 + if "_old_init" in cls.__dict__: cls.__init__ = cls._old_init del cls._old_init diff --git a/tests/tests_pytorch/utilities/test_data.py b/tests/tests_pytorch/utilities/test_data.py index 3700feaba9992..cc70417988616 100644 --- a/tests/tests_pytorch/utilities/test_data.py +++ b/tests/tests_pytorch/utilities/test_data.py @@ -1,3 +1,4 @@ +import random from dataclasses import dataclass import pytest @@ -172,6 +173,30 @@ def __init__(self, randomize, *args, **kwargs): assert isinstance(new_dataloader, GoodImpl) +def test_replace_init_method_multiple_loaders_without_init(): + """In case of a class, that inherits from a class that we are patching, but doesn't define its own `__init__` + method (the one we are wrapping), it can happen, that `hasattr(cls, "_old_init")` is True because of parent + class, but it is impossible to delete, because that method is owned by parent class. Furthermore, the error + occured only sometimes because it depends on the order in which we are iterating over a set of classes we are + patching. + + This test simulates the behavior by generating sufficient number of dummy classes, which do not define `__init__` + and are children of `DataLoader`. We are testing that a) context manager `_replace_init_method` exits cleanly, and + b) the mechanism checking for presence of `_old_init` works as expected. 
+ """ + classes = [DataLoader] + for i in range(100): + classes.append(type(f"DataLoader_{i}", (random.choice(classes),), {})) + + with _replace_init_method(DataLoader, "dataset"): + for cls in classes[1:]: # First one is `DataLoader` + assert "_old_init" not in cls.__dict__ + assert hasattr(cls, "_old_init") + + assert "_old_init" in DataLoader.__dict__ + assert hasattr(DataLoader, "_old_init") + + class DataLoaderSubclass1(DataLoader): def __init__(self, attribute1, *args, **kwargs): self.at1 = attribute1 From 45a10a137cbbc7bd07bf3bf4b7c4b8b8a9439516 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 10 Aug 2022 18:22:44 +0200 Subject: [PATCH 145/230] update chlog after 0.5.5 (#14133) --- src/lightning_app/CHANGELOG.md | 65 ++++++++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 3 deletions(-) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index f32d07697f376..ea28c57611311 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -9,27 +9,86 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added - Add support for `Lightning App Commands` through the `configure_commands` hook on the Lightning Flow and the `ClientCommand` ([#13602](https://github.com/Lightning-AI/lightning/pull/13602)) + + - Add support for Lightning AI BYOC cluster management ([#13835](https://github.com/Lightning-AI/lightning/pull/13835)) + + - Add support to run Lightning apps on Lightning AI BYOC clusters ([#13894](https://github.com/Lightning-AI/lightning/pull/13894)) + + - Add support for listing Lightning AI apps ([#13987](https://github.com/Lightning-AI/lightning/pull/13987)) + + - Adds `LightningTrainingComponent`. `LightningTrainingComponent` orchestrates multi-node training in the cloud ([#13830](https://github.com/Lightning-AI/lightning/pull/13830)) - Add support for printing application logs using CLI `lightning show logs [components]` ([#13634](https://github.com/Lightning-AI/lightning/pull/13634)) + ### Changed -- Update the Lightning App docs ([#13537](https://github.com/Lightning-AI/lightning/pull/13537)) +- + ### Changed -- Added `LIGHTNING_` prefix to Platform AWS credentials ([#13703](https://github.com/Lightning-AI/lightning/pull/13703)) +- + - Run the flow only if the state has changed from the previous execution ([#14076](https://github.com/Lightning-AI/lightning/pull/14076)) ### Deprecated +- + + ### Fixed -- Resolved a bug where the work statuses will grow quickly and be duplicated ([#13970](https://github.com/Lightning-AI/lightning/pull/13970)) +- + + +## [0.5.5] - 2022-08-9 +### Deprecated + +- Deprecate sheety API ([#14004](https://github.com/Lightning-AI/lightning/pull/14004)) + +### Fixed + +- Resolved a bug where the work statuses will grow quickly and be duplicated ([#13970](https://github.com/Lightning-AI/lightning/pull/13970)) - Resolved a bug about a race condition when sending the work state through the caller_queue ([#14074](https://github.com/Lightning-AI/lightning/pull/14074)) +- Fixed Start Lightning App on Cloud if Repo Begins With Name "Lightning" ([#14025](https://github.com/Lightning-AI/lightning/pull/14025)) + + +## [0.5.4] - 2022-08-01 + +### Changed + +- Wrapped imports for traceability ([#13924](https://github.com/Lightning-AI/lightning/pull/13924)) +- Set version as today ([#13906](https://github.com/Lightning-AI/lightning/pull/13906)) + +### Fixed + +- Included app templates to the lightning and app packages 
([#13731](https://github.com/Lightning-AI/lightning/pull/13731)) +- Added UI for install all ([#13732](https://github.com/Lightning-AI/lightning/pull/13732)) +- Fixed build meta pkg flow ([#13926](https://github.com/Lightning-AI/lightning/pull/13926)) + +## [0.5.3] - 2022-07-25 + +### Changed + +- Pruned requirements duplicity ([#13739](https://github.com/Lightning-AI/lightning/pull/13739)) + +### Fixed + +- Use correct python version in lightning component template ([#13790](https://github.com/Lightning-AI/lightning/pull/13790)) + +## [0.5.2] - 2022-07-18 + +### Added + +- Update the Lightning App docs ([#13537](https://github.com/Lightning-AI/lightning/pull/13537)) + +### Changed + +- Added `LIGHTNING_` prefix to Platform AWS credentials ([#13703](https://github.com/Lightning-AI/lightning/pull/13703)) From e226180527b065813bb1ba5e83f4990c3b81d444 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Wed, 10 Aug 2022 19:26:01 +0200 Subject: [PATCH 146/230] (app) Remove ClickRunner (#14147) --- README.md | 2 +- tests/tests_app_examples/test_boring_app.py | 7 +------ 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 2d32094f6595f..f9d5a9a57f5e2 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -### ** NEWS: PyTorch Lightning has been renamed Lightning! In addition to building models, you can now build research workflows and production pipelines** +### \*\* NEWS: PyTorch Lightning has been renamed Lightning! In addition to building models, you can now build research workflows and production pipelines\*\*
diff --git a/tests/tests_app_examples/test_boring_app.py b/tests/tests_app_examples/test_boring_app.py index f8143b1db1a88..0ca1b823b4706 100644 --- a/tests/tests_app_examples/test_boring_app.py +++ b/tests/tests_app_examples/test_boring_app.py @@ -13,7 +13,6 @@ def test_boring_app_example_cloud() -> None: with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "examples/app_boring/"), app_name="app_dynamic.py") as ( _, view_page, - _, name, ): @@ -31,8 +30,4 @@ def check_hello_there(*_, **__): assert result.exit_code == 0 assert result.exception is None - assert len(lines) > 1, result.output - # We know that at some point we need to intstall lightning, so we check for that - assert any( - "Successfully built lightning" in line for line in lines - ), f"Did not find logs with lightning installation: {result.output}" + assert any("http://0.0.0.0:8080" in line for line in lines) From 3966f959aab2682df26f9712c37e468704304792 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 10 Aug 2022 19:38:39 +0200 Subject: [PATCH 147/230] relax `docker` requirement (#14009) --- requirements/app/cloud.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/app/cloud.txt b/requirements/app/cloud.txt index ff18d47b44565..6644a56a2894b 100644 --- a/requirements/app/cloud.txt +++ b/requirements/app/cloud.txt @@ -1,4 +1,4 @@ starsessions redis>=4.0.0, <=4.2.4 -docker==5.0.3 +docker>=5.0.0, <=5.0.3 # setuptools==59.5.0 From f11f1e2bb470a57f4043a41b1cdf194071c4be1e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 10 Aug 2022 19:40:34 +0200 Subject: [PATCH 148/230] Update gcsfs requirement from <2022.6.0,>=2021.5.0 to >=2021.5.0,<2022.8.0 in /requirements (#14079) Update gcsfs requirement in /requirements Updates the requirements on [gcsfs](https://github.com/fsspec/gcsfs) to permit the latest version. - [Release notes](https://github.com/fsspec/gcsfs/releases) - [Commits](https://github.com/fsspec/gcsfs/compare/2021.05.0...2022.7.1) --- updated-dependencies: - dependency-name: gcsfs dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements/pytorch/extra.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/pytorch/extra.txt b/requirements/pytorch/extra.txt index c386c5581cc42..20b6c1b8dbc12 100644 --- a/requirements/pytorch/extra.txt +++ b/requirements/pytorch/extra.txt @@ -7,5 +7,5 @@ torchtext>=0.10.*, <0.14.0 omegaconf>=2.0.5, <2.3.0 hydra-core>=1.0.5, <1.3.0 jsonargparse[signatures]>=4.12.0, <=4.12.0 -gcsfs>=2021.5.0, <2022.6.0 +gcsfs>=2021.5.0, <2022.8.0 rich>=10.14.0, !=10.15.0.a, <13.0.0 From 7e7736778bfc1f3864d878458b9de87de7ded52c Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 10 Aug 2022 14:27:35 -0400 Subject: [PATCH 149/230] Update onnxruntime requirement from <=1.12.0 to <1.13.0 in /requirements (#14083) Updates the requirements on [onnxruntime](https://github.com/microsoft/onnxruntime) to permit the latest version. - [Release notes](https://github.com/microsoft/onnxruntime/releases) - [Changelog](https://github.com/microsoft/onnxruntime/blob/master/docs/ReleaseManagement.md) - [Commits](https://github.com/microsoft/onnxruntime/compare/v0.1.4...v1.12.1) --- updated-dependencies: - dependency-name: onnxruntime dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements/pytorch/test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/pytorch/test.txt b/requirements/pytorch/test.txt index c155400a3d35f..f8bd5793a0af6 100644 --- a/requirements/pytorch/test.txt +++ b/requirements/pytorch/test.txt @@ -10,7 +10,7 @@ mypy==0.971 # needed in tests cloudpickle>=1.3, <=2.1.0 scikit-learn>0.22.1, <=1.1.1 -onnxruntime<=1.12.0 +onnxruntime<1.13.0 psutil<=5.9.1 # for `DeviceStatsMonitor` pandas>1.0, <=1.4.3 # needed in benchmarks fastapi<=0.79.0 From 784b60412c1dec73c5f7c90ced343d2bbd394c25 Mon Sep 17 00:00:00 2001 From: panos-is <102533125+panos-is@users.noreply.github.com> Date: Wed, 10 Aug 2022 23:07:23 +0300 Subject: [PATCH 150/230] (app) Add s3 drive type (1/2) (#14002) * Add S3 protocol and optimization field to the drive object * Add a list of drives to the work specification * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add only protocol for s3 drives, no optimization arguments, and add tests * added trailing slash criteria * allow slash in s3 drives * fix * fixed test issues Co-authored-by: Panos Lantavos-Stratigakis Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Rick Izzo Co-authored-by: Jirka Borovec Co-authored-by: Rick Izzo --- src/lightning_app/storage/drive.py | 33 ++++++++++- tests/tests_app/storage/test_drive.py | 81 +++++++++++++++++++-------- 2 files changed, 90 insertions(+), 24 deletions(-) diff --git a/src/lightning_app/storage/drive.py b/src/lightning_app/storage/drive.py index 3bcdf72780653..b69d2581851b8 100644 --- a/src/lightning_app/storage/drive.py +++ b/src/lightning_app/storage/drive.py @@ -13,7 +13,7 @@ class Drive: __IDENTIFIER__ = "__drive__" - __PROTOCOLS__ = ["lit://"] + __PROTOCOLS__ = ["lit://", "s3://"] def __init__( self, @@ -35,15 +35,28 @@ def __init__( root_folder: This is the folder from where the Drive perceives the data (e.g this acts as a mount dir). """ self.id = None + self.protocol = None for protocol in self.__PROTOCOLS__: if id.startswith(protocol): self.protocol = protocol self.id = id.replace(protocol, "") + break + else: # N.B. for-else loop + raise ValueError( + f"Unknown protocol for the drive 'id' argument '{id}`. The 'id' string " + f"must start with one of the following prefixes {self.__PROTOCOLS__}" + ) + + if self.protocol == "s3://" and not self.id.endswith("/"): + raise ValueError( + "S3 drives must end in a trailing slash (`/`) to indicate a folder is being mounted. " + f"Recieved: '{id}'. Mounting a single file is not currently supported." + ) if not self.id: raise Exception(f"The Drive id needs to start with one of the following protocols: {self.__PROTOCOLS__}") - if "/" in self.id: + if self.protocol != "s3://" and "/" in self.id: raise Exception(f"The id should be unique to identify your drive. Found `{self.id}`.") self.root_folder = pathlib.Path(root_folder).resolve() if root_folder else os.getcwd() @@ -75,6 +88,10 @@ def put(self, path: str) -> None: raise Exception("The component name needs to be known to put a path to the Drive.") if _is_flow_context(): raise Exception("The flow isn't allowed to put files into a Drive.") + if self.protocol == "s3://": + raise PermissionError( + "S3 based drives cannot currently add files via this API. Did you mean to use `lit://` drives?" 
+ ) self._validate_path(path) @@ -98,6 +115,10 @@ def list(self, path: Optional[str] = ".", component_name: Optional[str] = None) """ if _is_flow_context(): raise Exception("The flow isn't allowed to list files from a Drive.") + if self.protocol == "s3://": + raise PermissionError( + "S3 based drives cannot currently list files via this API. Did you mean to use `lit://` drives?" + ) if component_name: paths = [ @@ -142,6 +163,10 @@ def get( """ if _is_flow_context(): raise Exception("The flow isn't allowed to get files from a Drive.") + if self.protocol == "s3://": + raise PermissionError( + "S3 based drives cannot currently get files via this API. Did you mean to use `lit://` drives?" + ) if component_name: shared_path = self._to_shared_path( @@ -189,6 +214,10 @@ def delete(self, path: str) -> None: """ if not self.component_name: raise Exception("The component name needs to be known to delete a path to the Drive.") + if self.protocol == "s3://": + raise PermissionError( + "S3 based drives cannot currently delete files via this API. Did you mean to use `lit://` drives?" + ) shared_path = self._to_shared_path( path, diff --git a/tests/tests_app/storage/test_drive.py b/tests/tests_app/storage/test_drive.py index 3d9db44c10e13..0d452571d9f43 100644 --- a/tests/tests_app/storage/test_drive.py +++ b/tests/tests_app/storage/test_drive.py @@ -11,7 +11,7 @@ from lightning_app.utilities.component import _set_flow_context -class SyncWorkA(LightningWork): +class SyncWorkLITDriveA(LightningWork): def __init__(self, tmpdir): super().__init__() self.tmpdir = tmpdir @@ -25,19 +25,19 @@ def run(self, drive: Drive): os.remove(f"{self.tmpdir}/a.txt") -class SyncWorkB(LightningWork): +class SyncWorkLITDriveB(LightningWork): def run(self, drive: Drive): assert not os.path.exists("a.txt") drive.get("a.txt") assert os.path.exists("a.txt") -class SyncFlow(LightningFlow): +class SyncFlowLITDrives(LightningFlow): def __init__(self, tmpdir): super().__init__() self.log_dir = Drive("lit://log_dir") - self.work_a = SyncWorkA(str(tmpdir)) - self.work_b = SyncWorkB() + self.work_a = SyncWorkLITDriveA(str(tmpdir)) + self.work_b = SyncWorkLITDriveB() def run(self): self.work_a.run(self.log_dir) @@ -45,15 +45,15 @@ def run(self): self._exit() -def test_synchronization_drive(tmpdir): +def test_synchronization_lit_drive(tmpdir): if os.path.exists("a.txt"): os.remove("a.txt") - app = LightningApp(SyncFlow(tmpdir)) + app = LightningApp(SyncFlowLITDrives(tmpdir)) MultiProcessRuntime(app, start_server=False).dispatch() os.remove("a.txt") -class Work(LightningWork): +class LITDriveWork(LightningWork): def __init__(self): super().__init__(parallel=True) self.drive = None @@ -75,7 +75,7 @@ def run(self, *args, **kwargs): self.counter += 1 -class Work2(LightningWork): +class LITDriveWork2(LightningWork): def __init__(self): super().__init__(parallel=True) @@ -86,11 +86,11 @@ def run(self, drive: Drive, **kwargs): assert drive.list(".", component_name=self.name) == [] -class Flow(LightningFlow): +class LITDriveFlow(LightningFlow): def __init__(self): super().__init__() - self.work = Work() - self.work2 = Work2() + self.work = LITDriveWork() + self.work2 = LITDriveWork2() def run(self): self.work.run("0") @@ -102,15 +102,15 @@ def run(self): self._exit() -def test_drive_transferring_files(): - app = LightningApp(Flow()) +def test_lit_drive_transferring_files(): + app = LightningApp(LITDriveFlow()) MultiProcessRuntime(app, start_server=False).dispatch() os.remove("a.txt") -def test_drive(): - with pytest.raises(Exception, match="The 
Drive id needs to start with one of the following protocols"): - Drive("this_drive_id") +def test_lit_drive(): + with pytest.raises(Exception, match="Unknown protocol for the drive 'id' argument"): + Drive("invalid_drive_id") with pytest.raises( Exception, match="The id should be unique to identify your drive. Found `this_drive_id/something_else`." @@ -213,9 +213,46 @@ def test_drive(): os.remove("a.txt") -def test_maybe_create_drive(): +def test_s3_drives(): + drive = Drive("s3://foo/", allow_duplicates=True) + drive.component_name = "root.work" - drive = Drive("lit://drive_3", allow_duplicates=False) + with pytest.raises( + Exception, match="S3 based drives cannot currently add files via this API. Did you mean to use `lit://` drives?" + ): + drive.put("a.txt") + with pytest.raises( + Exception, + match="S3 based drives cannot currently list files via this API. Did you mean to use `lit://` drives?", + ): + drive.list("a.txt") + with pytest.raises( + Exception, match="S3 based drives cannot currently get files via this API. Did you mean to use `lit://` drives?" + ): + drive.get("a.txt") + with pytest.raises( + Exception, + match="S3 based drives cannot currently delete files via this API. Did you mean to use `lit://` drives?", + ): + drive.delete("a.txt") + + _set_flow_context() + with pytest.raises(Exception, match="The flow isn't allowed to put files into a Drive."): + drive.put("a.txt") + with pytest.raises(Exception, match="The flow isn't allowed to list files from a Drive."): + drive.list("a.txt") + with pytest.raises(Exception, match="The flow isn't allowed to get files from a Drive."): + drive.get("a.txt") + + +def test_create_s3_drive_without_trailing_slash_fails(): + with pytest.raises(ValueError, match="S3 drives must end in a trailing slash"): + Drive("s3://foo") + + +@pytest.mark.parametrize("drive_id", ["lit://drive", "s3://drive/"]) +def test_maybe_create_drive(drive_id): + drive = Drive(drive_id, allow_duplicates=False) drive.component_name = "root.work1" new_drive = _maybe_create_drive(drive.component_name, drive.to_dict()) assert new_drive.protocol == drive.protocol @@ -223,9 +260,9 @@ def test_maybe_create_drive(): assert new_drive.component_name == drive.component_name -def test_drive_deepcopy(): - - drive = Drive("lit://drive", allow_duplicates=True) +@pytest.mark.parametrize("drive_id", ["lit://drive", "s3://drive/"]) +def test_drive_deepcopy(drive_id): + drive = Drive(drive_id, allow_duplicates=True) drive.component_name = "root.work1" new_drive = deepcopy(drive) assert new_drive.id == drive.id From 5396b1899fa2ed3de1a369a1551fa155a80c4321 Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Wed, 10 Aug 2022 22:34:23 +0200 Subject: [PATCH 151/230] Resolve e2es V3 (#14153) update --- tests/tests_app_examples/test_boring_app.py | 4 ++++ tests/tests_app_examples/test_drive.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/tests_app_examples/test_boring_app.py b/tests/tests_app_examples/test_boring_app.py index 0ca1b823b4706..afb958571d16b 100644 --- a/tests/tests_app_examples/test_boring_app.py +++ b/tests/tests_app_examples/test_boring_app.py @@ -13,6 +13,7 @@ def test_boring_app_example_cloud() -> None: with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "examples/app_boring/"), app_name="app_dynamic.py") as ( _, view_page, + fetch_logs, name, ): @@ -24,6 +25,9 @@ def check_hello_there(*_, **__): wait_for(view_page, check_hello_there) + for _ in fetch_logs(): + pass + runner = CliRunner() result = runner.invoke(logs, [name]) lines = 
result.output.splitlines() diff --git a/tests/tests_app_examples/test_drive.py b/tests/tests_app_examples/test_drive.py index 630e76b550e9e..dde68d1a85113 100644 --- a/tests/tests_app_examples/test_drive.py +++ b/tests/tests_app_examples/test_drive.py @@ -18,7 +18,7 @@ def test_drive_example_cloud() -> None: has_logs = False while not has_logs: - for log in fetch_logs(["flow"]): + for log in fetch_logs(): if "Application End!" in log: has_logs = True sleep(1) From 4008f9cd414db2b0319b62ab4cb5d2193c6e97ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 10 Aug 2022 23:15:12 +0200 Subject: [PATCH 152/230] Convert subprocess test to standalone test (#14101) --- tests/tests_pytorch/run_standalone_tasks.sh | 10 ++- tests/tests_pytorch/serve/__init__.py | 0 tests/tests_pytorch/strategies/ddp_model.py | 58 ---------------- .../strategies/scripts/__init__.py | 0 .../strategies/scripts/cli_script.py | 24 +++++++ tests/tests_pytorch/strategies/test_ddp.py | 67 +++++++------------ tests/tests_pytorch/utilities/distributed.py | 45 ------------- 7 files changed, 55 insertions(+), 149 deletions(-) create mode 100644 tests/tests_pytorch/serve/__init__.py delete mode 100644 tests/tests_pytorch/strategies/ddp_model.py create mode 100644 tests/tests_pytorch/strategies/scripts/__init__.py create mode 100644 tests/tests_pytorch/strategies/scripts/cli_script.py delete mode 100644 tests/tests_pytorch/utilities/distributed.py diff --git a/tests/tests_pytorch/run_standalone_tasks.sh b/tests/tests_pytorch/run_standalone_tasks.sh index 960bd867ceaa4..698ed7863ab96 100644 --- a/tests/tests_pytorch/run_standalone_tasks.sh +++ b/tests/tests_pytorch/run_standalone_tasks.sh @@ -34,6 +34,10 @@ fi # test that a user can manually launch individual processes echo "Running manual ddp launch test" export PYTHONPATH="${PYTHONPATH}:$(pwd)" -args="--trainer.accelerator gpu --trainer.devices 2 --trainer.strategy ddp --trainer.max_epochs=1 --trainer.limit_train_batches=1 --trainer.limit_val_batches=1 --trainer.limit_test_batches=1" -MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=1 python ../../examples/convert_from_pt_to_pl/image_classifier_5_lightning_datamodule.py ${args} & -MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=0 python ../../examples/convert_from_pt_to_pl/image_classifier_5_lightning_datamodule.py ${args} +args="fit --trainer.accelerator gpu --trainer.devices 2 --trainer.strategy ddp --trainer.max_epochs=1 --trainer.limit_train_batches=1 --trainer.limit_val_batches=1 --trainer.limit_test_batches=1" +MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=1 python strategies/scripts/cli_script.py ${args} & +MASTER_ADDR="localhost" MASTER_PORT=1234 LOCAL_RANK=0 python strategies/scripts/cli_script.py ${args} + +# test that ddp can launched as a module (-m option) +echo "Running ddp example as module" +python -m strategies.scripts.cli_script ${args} diff --git a/tests/tests_pytorch/serve/__init__.py b/tests/tests_pytorch/serve/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/tests_pytorch/strategies/ddp_model.py b/tests/tests_pytorch/strategies/ddp_model.py deleted file mode 100644 index 76d1f3f2f6866..0000000000000 --- a/tests/tests_pytorch/strategies/ddp_model.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Runs either `.fit()` or `.test()` on a single node across multiple gpus.""" -import os -from argparse import ArgumentParser - -import torch - -from pytorch_lightning import seed_everything, Trainer -from tests_pytorch.helpers.datamodules import ClassifDataModule -from tests_pytorch.helpers.simple_models import ClassificationModel - - -def main(): - seed_everything(4321) - - parser = ArgumentParser(add_help=False) - parser = Trainer.add_argparse_args(parser) - parser.add_argument("--trainer_method", default="fit") - parser.add_argument("--tmpdir") - parser.add_argument("--workdir") - parser.set_defaults(accelerator="gpu", devices=2) - parser.set_defaults(strategy="ddp") - args = parser.parse_args() - - dm = ClassifDataModule() - model = ClassificationModel() - trainer = Trainer.from_argparse_args(args) - - if args.trainer_method == "fit": - trainer.fit(model, datamodule=dm) - result = None - elif args.trainer_method == "test": - result = trainer.test(model, datamodule=dm) - elif args.trainer_method == "fit_test": - trainer.fit(model, datamodule=dm) - result = trainer.test(model, datamodule=dm) - else: - raise ValueError(f"Unsupported: {args.trainer_method}") - - result_ext = {"status": "complete", "method": args.trainer_method, "result": result} - file_path = os.path.join(args.tmpdir, "ddp.result") - torch.save(result_ext, file_path) - - -if __name__ == "__main__": - main() diff --git a/tests/tests_pytorch/strategies/scripts/__init__.py b/tests/tests_pytorch/strategies/scripts/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/tests_pytorch/strategies/scripts/cli_script.py b/tests/tests_pytorch/strategies/scripts/cli_script.py new file mode 100644 index 0000000000000..17f0d29392eb9 --- /dev/null +++ b/tests/tests_pytorch/strategies/scripts/cli_script.py @@ -0,0 +1,24 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""A trivial script that wraps a LightningCLI around the BoringModel and BoringDataModule.""" +from pytorch_lightning.cli import LightningCLI +from pytorch_lightning.demos.boring_classes import BoringDataModule, BoringModel + +if __name__ == "__main__": + LightningCLI( + BoringModel, + BoringDataModule, + seed_everything_default=42, + save_config_overwrite=True, + ) diff --git a/tests/tests_pytorch/strategies/test_ddp.py b/tests/tests_pytorch/strategies/test_ddp.py index 1a2a0475e7ed6..9b196f3e2a97f 100644 --- a/tests/tests_pytorch/strategies/test_ddp.py +++ b/tests/tests_pytorch/strategies/test_ddp.py @@ -21,60 +21,41 @@ from torch.nn.parallel.distributed import DistributedDataParallel import pytorch_lightning as pl -from pytorch_lightning import Trainer +from pytorch_lightning import seed_everything, Trainer from pytorch_lightning.callbacks import Callback from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.strategies import DDPStrategy +from tests_pytorch.helpers.datamodules import ClassifDataModule from tests_pytorch.helpers.runif import RunIf -from tests_pytorch.strategies import ddp_model -from tests_pytorch.utilities.distributed import call_training_script +from tests_pytorch.helpers.simple_models import ClassificationModel -CLI_ARGS = "--max_epochs 1 --accelerator gpu --devices 2 --strategy ddp" +@RunIf(min_cuda_gpus=2, standalone=True) +def test_multi_gpu_model_ddp_fit_only(tmpdir): + dm = ClassifDataModule() + model = ClassificationModel() + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, accelerator="gpu", devices=2, strategy="ddp") + trainer.fit(model, datamodule=dm) -@RunIf(min_cuda_gpus=2) -@pytest.mark.parametrize("as_module", [True, False]) -def test_multi_gpu_model_ddp_fit_only(tmpdir, as_module): - # call the script - call_training_script(ddp_model, CLI_ARGS, "fit", tmpdir, timeout=120, as_module=as_module) - # load the results of the script - result_path = os.path.join(tmpdir, "ddp.result") - result = torch.load(result_path) +@RunIf(min_cuda_gpus=2, standalone=True) +def test_multi_gpu_model_ddp_test_only(tmpdir): + dm = ClassifDataModule() + model = ClassificationModel() + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, accelerator="gpu", devices=2, strategy="ddp") + trainer.test(model, datamodule=dm) - # verify the file wrote the expected outputs - assert result["status"] == "complete" +@RunIf(min_cuda_gpus=2, standalone=True) +def test_multi_gpu_model_ddp_fit_test(tmpdir): + seed_everything(4321) + dm = ClassifDataModule() + model = ClassificationModel() + trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, accelerator="gpu", devices=2, strategy="ddp") + trainer.fit(model, datamodule=dm) + result = trainer.test(model, datamodule=dm) -@RunIf(min_cuda_gpus=2) -@pytest.mark.parametrize("as_module", [True, False]) -def test_multi_gpu_model_ddp_test_only(tmpdir, as_module): - # call the script - call_training_script(ddp_model, CLI_ARGS, "test", tmpdir, as_module=as_module) - - # load the results of the script - result_path = os.path.join(tmpdir, "ddp.result") - result = torch.load(result_path) - - # verify the file wrote the expected outputs - assert result["status"] == "complete" - - -@RunIf(min_cuda_gpus=2) -@pytest.mark.parametrize("as_module", [True, False]) -def test_multi_gpu_model_ddp_fit_test(tmpdir, as_module): - # call the script - call_training_script(ddp_model, CLI_ARGS, "fit_test", tmpdir, timeout=20, as_module=as_module) - - # load the results of the script - result_path = os.path.join(tmpdir, 
"ddp.result") - result = torch.load(result_path) - - # verify the file wrote the expected outputs - assert result["status"] == "complete" - - model_outs = result["result"] - for out in model_outs: + for out in result: assert out["test_acc"] > 0.7 diff --git a/tests/tests_pytorch/utilities/distributed.py b/tests/tests_pytorch/utilities/distributed.py deleted file mode 100644 index 38a50edcc7177..0000000000000 --- a/tests/tests_pytorch/utilities/distributed.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright The PyTorch Lightning team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import os -import subprocess -import sys -from pathlib import Path -from subprocess import TimeoutExpired - -import pytorch_lightning - - -def call_training_script(module_file, cli_args, method, tmpdir, timeout=60, as_module=False): - file = Path(module_file.__file__).absolute() - cli_args = cli_args.split(" ") if cli_args else [] - cli_args += ["--tmpdir", str(tmpdir)] - cli_args += ["--trainer_method", method] - file_args = ["-m", module_file.__spec__.name] if as_module else [str(file)] - command = [sys.executable] + file_args + cli_args - - # need to set the PYTHONPATH in case pytorch_lightning was not installed into the environment - env = os.environ.copy() - env["PYTHONPATH"] = env.get("PYTHONPATH", "") + f"{pytorch_lightning.__file__}:" - - # for running in ddp mode, we need to launch it's own process or pytest will get stuck - p = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) - try: - std, err = p.communicate(timeout=timeout) - err = str(err.decode("utf-8")) - if "Exception" in err: - raise Exception(err) - except TimeoutExpired: - p.kill() - std, err = p.communicate() - return std, err From 2a10a36b9211fbecdfc79dc0bdae9b972ec8f91d Mon Sep 17 00:00:00 2001 From: William Falcon Date: Wed, 10 Aug 2022 18:30:01 -0400 Subject: [PATCH 153/230] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f9d5a9a57f5e2..6f075f5fd42b6 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@
-**Build high-performance PyTorch models and deploy them with Lightning Apps (scalable end-to-end ML systems).** +**Build high-performance (PyTorch) models, research workflows, ML production pipelines.** ______________________________________________________________________ From a7cebf24169dbe80c5e718946cb5de931082f814 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 11 Aug 2022 01:32:32 +0200 Subject: [PATCH 154/230] Fix entry point test for Python 3.10 (#14154) --- .../trainer/connectors/test_callback_connector.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/tests_pytorch/trainer/connectors/test_callback_connector.py b/tests/tests_pytorch/trainer/connectors/test_callback_connector.py index d6d5018aa1dd0..02e846425a2a0 100644 --- a/tests/tests_pytorch/trainer/connectors/test_callback_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_callback_connector.py @@ -30,7 +30,7 @@ ) from pytorch_lightning.demos.boring_classes import BoringModel from pytorch_lightning.trainer.connectors.callback_connector import CallbackConnector -from pytorch_lightning.utilities.imports import _PYTHON_GREATER_EQUAL_3_8_0 +from pytorch_lightning.utilities.imports import _PYTHON_GREATER_EQUAL_3_8_0, _PYTHON_GREATER_EQUAL_3_10_0 def test_checkpoint_callbacks_are_last(tmpdir): @@ -265,7 +265,10 @@ def _make_entry_point_query_mock(callback_factory): entry_point = Mock() entry_point.name = "mocked" entry_point.load.return_value = callback_factory - if _PYTHON_GREATER_EQUAL_3_8_0: + if _PYTHON_GREATER_EQUAL_3_10_0: + query_mock.return_value = [entry_point] + import_path = "importlib.metadata.entry_points" + elif _PYTHON_GREATER_EQUAL_3_8_0: query_mock().get.return_value = [entry_point] import_path = "importlib.metadata.entry_points" else: From 3dc08b1ef565774853467a7e56842becfa381dd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 11 Aug 2022 09:33:19 +0200 Subject: [PATCH 155/230] Fix flaky test caused by weak reference (#14157) --- tests/tests_pytorch/trainer/connectors/test_data_connector.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/tests_pytorch/trainer/connectors/test_data_connector.py b/tests/tests_pytorch/trainer/connectors/test_data_connector.py index 52ef4c4db6d8d..2650e46b7fa60 100644 --- a/tests/tests_pytorch/trainer/connectors/test_data_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_data_connector.py @@ -445,7 +445,8 @@ def test_dataloader_source_direct_access(): def test_dataloader_source_request_from_module(): """Test requesting a dataloader from a module works.""" module = BoringModel() - module.trainer = Trainer() + trainer = Trainer() + module.trainer = trainer module.foo = Mock(return_value=module.train_dataloader()) source = _DataLoaderSource(module, "foo") From 6eed72b621921856a846e39e4dd6bc9fd764348b Mon Sep 17 00:00:00 2001 From: thomas chaton Date: Thu, 11 Aug 2022 12:35:00 +0200 Subject: [PATCH 156/230] (app) Introduce configure_api and Post, Get, Delete, Put HttpMethods (#13945) --- .github/workflows/ci-app-cloud-e2e-test.yml | 2 +- .../.lightning | 0 .../app.py | 18 +- .../command.py | 0 src/lightning_app/CHANGELOG.md | 1 + src/lightning_app/api/__init__.py | 3 + src/lightning_app/api/http_methods.py | 107 +++++++++++ src/lightning_app/api/request_types.py | 36 ++++ src/lightning_app/cli/lightning_cli.py | 50 ++--- src/lightning_app/core/api.py | 126 +++++-------- src/lightning_app/core/app.py | 32 ++-- src/lightning_app/core/flow.py | 33 ++++ 
src/lightning_app/core/queues.py | 22 +-- src/lightning_app/runners/backends/backend.py | 5 +- src/lightning_app/runners/multiprocess.py | 19 +- src/lightning_app/utilities/cli_helpers.py | 38 +++- src/lightning_app/utilities/commands/base.py | 175 ++++++++---------- src/lightning_app/utilities/enum.py | 6 + src/lightning_app/utilities/network.py | 3 +- tests/tests_app/core/test_lightning_api.py | 107 ++++++++--- tests/tests_app/utilities/test_app_logs.py | 2 + tests/tests_app/utilities/test_commands.py | 33 ++-- tests/tests_app_examples/test_commands.py | 32 ---- .../test_commands_and_api.py | 42 +++++ 24 files changed, 568 insertions(+), 324 deletions(-) rename examples/{app_commands => app_commands_and_api}/.lightning (100%) rename examples/{app_commands => app_commands_and_api}/app.py (56%) rename examples/{app_commands => app_commands_and_api}/command.py (100%) create mode 100644 src/lightning_app/api/__init__.py create mode 100644 src/lightning_app/api/http_methods.py create mode 100644 src/lightning_app/api/request_types.py delete mode 100644 tests/tests_app_examples/test_commands.py create mode 100644 tests/tests_app_examples/test_commands_and_api.py diff --git a/.github/workflows/ci-app-cloud-e2e-test.yml b/.github/workflows/ci-app-cloud-e2e-test.yml index 3ad455650a117..9a5a10a95cd33 100644 --- a/.github/workflows/ci-app-cloud-e2e-test.yml +++ b/.github/workflows/ci-app-cloud-e2e-test.yml @@ -54,7 +54,7 @@ jobs: - custom_work_dependencies - drive - payload - - commands + - commands_and_api timeout-minutes: 35 steps: - uses: actions/checkout@v2 diff --git a/examples/app_commands/.lightning b/examples/app_commands_and_api/.lightning similarity index 100% rename from examples/app_commands/.lightning rename to examples/app_commands_and_api/.lightning diff --git a/examples/app_commands/app.py b/examples/app_commands_and_api/app.py similarity index 56% rename from examples/app_commands/app.py rename to examples/app_commands_and_api/app.py index 99eb15c75c709..0d15bc531bb38 100644 --- a/examples/app_commands/app.py +++ b/examples/app_commands_and_api/app.py @@ -1,15 +1,16 @@ from command import CustomCommand, CustomConfig from lightning import LightningFlow +from lightning_app.api import Post from lightning_app.core.app import LightningApp class ChildFlow(LightningFlow): - def trigger_method(self, name: str): + def nested_command(self, name: str): print(f"Hello {name}") def configure_commands(self): - return [{"nested_trigger_command": self.trigger_method}] + return [{"nested_command": self.nested_command}] class FlowCommands(LightningFlow): @@ -19,21 +20,24 @@ def __init__(self): self.child_flow = ChildFlow() def run(self): - if len(self.names): + if self.names: print(self.names) - def trigger_without_client_command(self, name: str): + def command_without_client(self, name: str): self.names.append(name) - def trigger_with_client_command(self, config: CustomConfig): + def command_with_client(self, config: CustomConfig): self.names.append(config.name) def configure_commands(self): commands = [ - {"trigger_without_client_command": self.trigger_without_client_command}, - {"trigger_with_client_command": CustomCommand(self.trigger_with_client_command)}, + {"command_without_client": self.command_without_client}, + {"command_with_client": CustomCommand(self.command_with_client)}, ] return commands + self.child_flow.configure_commands() + def configure_api(self): + return [Post("/user/command_without_client", self.command_without_client)] + app = LightningApp(FlowCommands()) diff --git 
a/examples/app_commands/command.py b/examples/app_commands_and_api/command.py similarity index 100% rename from examples/app_commands/command.py rename to examples/app_commands_and_api/command.py diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index ea28c57611311..7158d1ff7a2da 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -25,6 +25,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). +- Add support for `Lightning API` through the `configure_api` hook on the Lightning Flow and the `Post`, `Get`, `Delete`, `Put` HttpMethods ([#13945](https://github.com/Lightning-AI/lightning/pull/13945)) ### Changed - diff --git a/src/lightning_app/api/__init__.py b/src/lightning_app/api/__init__.py new file mode 100644 index 0000000000000..25ec5c4708761 --- /dev/null +++ b/src/lightning_app/api/__init__.py @@ -0,0 +1,3 @@ +from lightning_app.api.http_methods import Delete, Get, Post, Put + +__all__ = ["Delete", "Get", "Post", "Put"] diff --git a/src/lightning_app/api/http_methods.py b/src/lightning_app/api/http_methods.py new file mode 100644 index 0000000000000..02b6ec87f17d2 --- /dev/null +++ b/src/lightning_app/api/http_methods.py @@ -0,0 +1,107 @@ +import asyncio +import inspect +import time +from copy import deepcopy +from functools import wraps +from multiprocessing import Queue +from typing import Any, Callable, Dict, List, Optional +from uuid import uuid4 + +from fastapi import FastAPI + +from lightning_app.api.request_types import APIRequest, CommandRequest + + +def _signature_proxy_function(): + pass + + +class HttpMethod: + def __init__(self, route: str, method: Callable, method_name: Optional[str] = None, timeout: int = 30, **kwargs): + """This class is used to inject user defined methods within the App Rest API. + + Arguments: + route: The path used to route the requests + method: The associated flow method + timeout: The time in seconds taken before raising a timeout exception. + """ + self.route = route + self.component_name = method.__self__.name + self.method_name = method_name or method.__name__ + self.method_annotations = method.__annotations__ + # TODO: Validate the signature contains only pydantic models. + self.method_signature = inspect.signature(method) + self.timeout = timeout + self.kwargs = kwargs + + def add_route(self, app: FastAPI, request_queue: Queue, responses_store: Dict[str, Any]) -> None: + # 1: Create a proxy function with the signature of the wrapped method. + fn = deepcopy(_signature_proxy_function) + fn.__annotations__ = self.method_annotations + fn.__name__ = self.method_name + setattr(fn, "__signature__", self.method_signature) + + # 2: Get the route associated with the http method. + route = getattr(app, self.__class__.__name__.lower()) + + request_cls = CommandRequest if self.route.startswith("/command/") else APIRequest + + # 3: Define the request handler. 
+ @wraps(_signature_proxy_function) + async def _handle_request(*args, **kwargs): + async def fn(*args, **kwargs): + request_id = str(uuid4()).split("-")[0] + request_queue.put( + request_cls( + name=self.component_name, + method_name=self.method_name, + args=args, + kwargs=kwargs, + id=request_id, + ) + ) + + t0 = time.time() + while request_id not in responses_store: + await asyncio.sleep(0.1) + if (time.time() - t0) > self.timeout: + raise Exception("The response was never received.") + + return responses_store.pop(request_id) + + return await asyncio.create_task(fn(*args, **kwargs)) + + # 4: Register the user provided route to the Rest API. + route(self.route, **self.kwargs)(_handle_request) + + +class Post(HttpMethod): + pass + + +class Get(HttpMethod): + + pass + + +class Put(HttpMethod): + + pass + + +class Delete(HttpMethod): + pass + + +def _add_tags_to_api(apis: List[HttpMethod], tags: List[str]) -> None: + for api in apis: + if not api.kwargs.get("tag"): + api.kwargs["tags"] = tags + + +def _validate_api(apis: List[HttpMethod]) -> None: + for api in apis: + if not isinstance(api, HttpMethod): + raise Exception(f"The provided api should be either [{Delete}, {Get}, {Post}, {Put}]") + if api.route.startswith("/command"): + raise Exception("The route `/command` is reserved for commands. Please, use something else.") diff --git a/src/lightning_app/api/request_types.py b/src/lightning_app/api/request_types.py new file mode 100644 index 0000000000000..53a6df25820a3 --- /dev/null +++ b/src/lightning_app/api/request_types.py @@ -0,0 +1,36 @@ +from dataclasses import asdict, dataclass +from typing import Any + +from deepdiff import Delta + + +@dataclass +class BaseRequest: + def to_dict(self): + return asdict(self) + + +@dataclass +class DeltaRequest(BaseRequest): + delta: Delta + + def to_dict(self): + return self.delta.to_dict() + + +@dataclass +class CommandRequest(BaseRequest): + id: str + name: str + method_name: str + args: Any + kwargs: Any + + +@dataclass +class APIRequest(BaseRequest): + id: str + name: str + method_name: str + args: Any + kwargs: Any diff --git a/src/lightning_app/cli/lightning_cli.py b/src/lightning_app/cli/lightning_cli.py index babe0aa2b2abc..6a6e41df57026 100644 --- a/src/lightning_app/cli/lightning_cli.py +++ b/src/lightning_app/cli/lightning_cli.py @@ -4,7 +4,6 @@ from argparse import ArgumentParser from pathlib import Path from typing import List, Tuple, Union -from uuid import uuid4 import click import requests @@ -26,10 +25,10 @@ _retrieve_application_url_and_available_commands, ) from lightning_app.utilities.cloud import _get_project +from lightning_app.utilities.enum import OpenAPITags from lightning_app.utilities.install_components import register_all_external_components from lightning_app.utilities.login import Auth from lightning_app.utilities.network import LightningClient -from lightning_app.utilities.state import headers_for logger = logging.getLogger(__name__) @@ -263,41 +262,42 @@ def app_command(): hparams, argv = parser.parse_known_args() # 1: Collect the url and comments from the running application - url, commands = _retrieve_application_url_and_available_commands(hparams.app_id) - if url is None or commands is None: + url, api_commands = _retrieve_application_url_and_available_commands(hparams.app_id) + if url is None or api_commands is None: raise Exception("We couldn't find any matching running app.") - if not commands: + if not api_commands: raise Exception("This application doesn't expose any commands yet.") command = argv[0] - 
command_names = [c["command"] for c in commands] - if command not in command_names: - raise Exception(f"The provided command {command} isn't available in {command_names}") + if command not in api_commands: + raise Exception(f"The provided command {command} isn't available in {list(api_commands)}") # 2: Send the command from the user - command_metadata = [c for c in commands if c["command"] == command][0] - params = command_metadata["params"] + metadata = api_commands[command] # 3: Execute the command - if not command_metadata["is_client_command"]: - # TODO: Improve what is supported there. - kwargs = {k.split("=")[0].replace("--", ""): k.split("=")[1] for k in argv[1:]} - for param in params: - if param not in kwargs: - raise Exception(f"The argument --{param}=X hasn't been provided.") - json = { - "command_name": command, - "command_arguments": kwargs, - "affiliation": command_metadata["affiliation"], - "id": str(uuid4()), - } - resp = requests.post(url + "/api/v1/commands", json=json, headers=headers_for({})) + if metadata["tag"] == OpenAPITags.APP_COMMAND: + # TODO: Improve what is current supported + kwargs = [v.replace("--", "") for v in argv[1:]] + + for p in kwargs: + if p.split("=")[0] not in metadata["parameters"]: + raise Exception(f"Some arguments need to be provided. The keys are {list(metadata['parameters'])}.") + # TODO: Encode the parameters and validate their type. + query_parameters = "&".join(kwargs) + resp = requests.post(url + f"/command/{command}?{query_parameters}") assert resp.status_code == 200, resp.json() else: - client_command, models = _download_command(command_metadata, hparams.app_id, debug_mode=debug_mode) - client_command._setup(metadata=command_metadata, models=models, app_url=url) + client_command = _download_command( + command, + metadata["cls_path"], + metadata["cls_name"], + hparams.app_id, + debug_mode=debug_mode, + ) + client_command._setup(command_name=command, app_url=url) sys.argv = argv client_command.run() diff --git a/src/lightning_app/core/api.py b/src/lightning_app/core/api.py index f19ada5340d57..8b625713e0c2c 100644 --- a/src/lightning_app/core/api.py +++ b/src/lightning_app/core/api.py @@ -3,7 +3,6 @@ import os import queue import sys -import time import traceback from copy import deepcopy from multiprocessing import Queue @@ -21,9 +20,12 @@ from pydantic import BaseModel from websockets.exceptions import ConnectionClosed +from lightning_app.api.http_methods import HttpMethod +from lightning_app.api.request_types import DeltaRequest from lightning_app.core.constants import FRONTEND_DIR from lightning_app.core.queues import RedisQueue from lightning_app.utilities.app_helpers import InMemoryStateStore, StateStore +from lightning_app.utilities.enum import OpenAPITags from lightning_app.utilities.imports import _is_redis_available, _is_starsessions_available if _is_starsessions_available(): @@ -42,9 +44,6 @@ class SessionMiddleware: frontend_static_dir = os.path.join(FRONTEND_DIR, "static") api_app_delta_queue: Queue = None -api_commands_requests_queue: Queue = None -api_commands_metadata_queue: Queue = None -api_commands_responses_queue: Queue = None template = {"ui": {}, "app": {}} templates = Jinja2Templates(directory=FRONTEND_DIR) @@ -56,8 +55,8 @@ class SessionMiddleware: lock = Lock() app_spec: Optional[List] = None -app_commands_metadata: Optional[Dict] = None -commands_response_store = {} +# In the future, this would be abstracted to support horizontal scaling. 
+responses_store = {} logger = logging.getLogger(__name__) @@ -67,11 +66,10 @@ class SessionMiddleware: class UIRefresher(Thread): - def __init__(self, api_publish_state_queue, api_commands_metadata_queue, api_commands_responses_queue) -> None: + def __init__(self, api_publish_state_queue, api_response_queue) -> None: super().__init__(daemon=True) self.api_publish_state_queue = api_publish_state_queue - self.api_commands_metadata_queue = api_commands_metadata_queue - self.api_commands_responses_queue = api_commands_responses_queue + self.api_response_queue = api_response_queue self._exit_event = Event() def run(self): @@ -93,18 +91,11 @@ def run_once(self): pass try: - metadata = self.api_commands_metadata_queue.get(timeout=0) + response = self.api_response_queue.get(timeout=0) with lock: - global app_commands_metadata - app_commands_metadata = metadata - except queue.Empty: - pass - - try: - response = self.api_commands_responses_queue.get(timeout=0) - with lock: - global commands_response_store - commands_response_store[response["id"]] = response["response"] + # TODO: Abstract the responses store to support horizontal scaling. + global responses_store + responses_store[response["id"]] = response["response"] except queue.Empty: pass @@ -117,6 +108,23 @@ class StateUpdate(BaseModel): state: dict = {} +openapi_tags = [ + { + "name": OpenAPITags.APP_CLIENT_COMMAND, + "description": "The App Endpoints to be triggered exclusively from the CLI", + }, + { + "name": OpenAPITags.APP_COMMAND, + "description": "The App Endpoints that can be triggered equally from the CLI or from a Http Request", + }, + { + "name": OpenAPITags.APP_API, + "description": "The App Endpoints that can be triggered exclusively from a Http Request", + }, +] + +app = FastAPI(openapi_tags=openapi_tags) + fastapi_service = FastAPI() fastapi_service.add_middleware( @@ -176,50 +184,13 @@ async def get_spec( return app_spec or [] -@fastapi_service.post("/api/v1/commands", response_class=JSONResponse) -async def run_remote_command( - request: Request, -) -> None: - data = await request.json() - command_name = data.get("command_name", None) - if not command_name: - raise Exception("The provided command name is empty.") - command_arguments = data.get("command_arguments", None) - if not command_arguments: - raise Exception("The provided command metadata is empty.") - affiliation = data.get("affiliation", None) - if not affiliation: - raise Exception("The provided affiliation is empty.") - - async def fn(data): - request_id = data["id"] - api_commands_requests_queue.put(data) - - t0 = time.time() - while request_id not in commands_response_store: - await asyncio.sleep(0.1) - if (time.time() - t0) > 15: - raise Exception("The response was never received.") - - return commands_response_store[request_id] - - return await asyncio.create_task(fn(data)) - - -@fastapi_service.get("/api/v1/commands", response_class=JSONResponse) -async def get_commands() -> Optional[Dict]: - global app_commands_metadata - with lock: - return app_commands_metadata - - @fastapi_service.post("/api/v1/delta") async def post_delta( request: Request, x_lightning_type: Optional[str] = Header(None), x_lightning_session_uuid: Optional[str] = Header(None), x_lightning_session_id: Optional[str] = Header(None), -) -> Mapping: +) -> None: """This endpoint is used to make an update to the app state using delta diff, mainly used by streamlit to update the state.""" @@ -229,9 +200,7 @@ async def post_delta( raise Exception("Missing X-Lightning-Session-ID header") body: Dict 
= await request.json() - delta = body["delta"] - update_delta = Delta(delta) - api_app_delta_queue.put(update_delta) + api_app_delta_queue.put(DeltaRequest(delta=Delta(body["delta"]))) @fastapi_service.post("/api/v1/state") @@ -240,7 +209,7 @@ async def post_state( x_lightning_type: Optional[str] = Header(None), x_lightning_session_uuid: Optional[str] = Header(None), x_lightning_session_id: Optional[str] = Header(None), -) -> Mapping: +) -> None: if x_lightning_session_uuid is None: raise Exception("Missing X-Lightning-Session-UUID header") if x_lightning_session_id is None: @@ -263,8 +232,7 @@ async def post_state( state = body["state"] last_state = global_app_state_store.get_served_state(x_lightning_session_uuid) deep_diff = DeepDiff(last_state, state, verbose_level=2) - update_delta = Delta(deep_diff) - api_app_delta_queue.put(update_delta) + api_app_delta_queue.put(DeltaRequest(delta=Delta(deep_diff))) @fastapi_service.get("/healthz", status_code=200) @@ -307,8 +275,6 @@ async def websocket_endpoint(websocket: WebSocket): await websocket.close() -# Catch-all for nonexistent API routes (since we define a catch-all for client-side routing) -@fastapi_service.get("/api{full_path:path}", response_class=JSONResponse) async def api_catch_all(request: Request, full_path: str): raise HTTPException(status_code=404, detail="Not found") @@ -317,14 +283,18 @@ async def api_catch_all(request: Request, full_path: str): fastapi_service.mount("/static", StaticFiles(directory=frontend_static_dir, check_dir=False), name="static") -# Catch-all for frontend routes, must be defined after all other routes -@fastapi_service.get("/{full_path:path}", response_class=HTMLResponse) async def frontend_route(request: Request, full_path: str): if "pytest" in sys.modules: return "" return templates.TemplateResponse("index.html", {"request": request}) +def register_global_routes(): + # Catch-all for nonexistent API routes (since we define a catch-all for client-side routing) + fastapi_service.get("/api{full_path:path}", response_class=JSONResponse)(api_catch_all) + fastapi_service.get("/{full_path:path}", response_class=HTMLResponse)(frontend_route) + + class LightningUvicornServer(uvicorn.Server): has_started_queue = None @@ -346,34 +316,28 @@ async def check_is_started(self, queue): def start_server( api_publish_state_queue, api_delta_queue, - commands_requests_queue, - commands_responses_queue, - commands_metadata_queue, + api_response_queue, has_started_queue: Optional[Queue] = None, host="127.0.0.1", port=8000, uvicorn_run: bool = True, spec: Optional[List] = None, + apis: Optional[List[HttpMethod]] = None, app_state_store: Optional[StateStore] = None, ): global api_app_delta_queue global global_app_state_store - global api_commands_requests_queue - global api_commands_responses_queue global app_spec app_spec = spec api_app_delta_queue = api_delta_queue - api_commands_requests_queue = commands_requests_queue - api_commands_responses_queue = commands_responses_queue - api_commands_metadata_queue = commands_metadata_queue if app_state_store is not None: global_app_state_store = app_state_store global_app_state_store.add(TEST_SESSION_UUID) - refresher = UIRefresher(api_publish_state_queue, api_commands_metadata_queue, commands_responses_queue) + refresher = UIRefresher(api_publish_state_queue, api_response_queue) refresher.setDaemon(True) refresher.start() @@ -384,6 +348,14 @@ def start_server( LightningUvicornServer.has_started_queue = has_started_queue # uvicorn is doing some uglyness by replacing uvicorn.main by 
click command. sys.modules["uvicorn.main"].Server = LightningUvicornServer + + # Register the user API. + if apis: + for api in apis: + api.add_route(fastapi_service, api_app_delta_queue, responses_store) + + register_global_routes() + uvicorn.run(app=fastapi_service, host=host, port=port, log_level="error") return refresher diff --git a/src/lightning_app/core/app.py b/src/lightning_app/core/app.py index 3f9e2521eb21d..65242a1ae0a2a 100644 --- a/src/lightning_app/core/app.py +++ b/src/lightning_app/core/app.py @@ -11,12 +11,13 @@ from deepdiff import DeepDiff, Delta import lightning_app +from lightning_app.api.request_types import APIRequest, CommandRequest, DeltaRequest from lightning_app.core.constants import FLOW_DURATION_SAMPLES, FLOW_DURATION_THRESHOLD, STATE_ACCUMULATE_WAIT from lightning_app.core.queues import BaseQueue, SingleProcessQueue from lightning_app.frontend import Frontend from lightning_app.storage.path import storage_root_dir from lightning_app.utilities.app_helpers import _delta_to_app_state_delta, _LightningAppRef -from lightning_app.utilities.commands.base import _populate_commands_endpoint, _process_command_requests +from lightning_app.utilities.commands.base import _process_requests from lightning_app.utilities.component import _convert_paths_after_init from lightning_app.utilities.enum import AppStage, CacheCallsKeys from lightning_app.utilities.exceptions import CacheMissException, ExitAppException @@ -73,9 +74,7 @@ def __init__( # queues definition. self.delta_queue: t.Optional[BaseQueue] = None self.readiness_queue: t.Optional[BaseQueue] = None - self.commands_requests_queue: t.Optional[BaseQueue] = None - self.commands_responses_queue: t.Optional[BaseQueue] = None - self.commands_metadata_queue: t.Optional[BaseQueue] = None + self.api_response_queue: t.Optional[BaseQueue] = None self.api_publish_state_queue: t.Optional[BaseQueue] = None self.api_delta_queue: t.Optional[BaseQueue] = None self.error_queue: t.Optional[BaseQueue] = None @@ -253,7 +252,7 @@ def named_works(self) -> t.List[t.Tuple[str, "lightning_app.LightningWork"]]: """Returns all the works defined within this application with their names.""" return self.root.named_works(recurse=True) - def _collect_deltas_from_ui_and_work_queues(self) -> t.List[Delta]: + def _collect_deltas_from_ui_and_work_queues(self) -> t.List[t.Union[Delta, APIRequest, CommandRequest]]: # The aggregation would try to get as many deltas as possible # from both the `api_delta_queue` and `delta_queue` # during the `state_accumulate_wait` time. @@ -267,8 +266,12 @@ def _collect_deltas_from_ui_and_work_queues(self) -> t.List[Delta]: while (time() - t0) < self.state_accumulate_wait: if self.api_delta_queue and should_get_delta_from_api: - delta_from_api: Delta = self.get_state_changed_from_queue(self.api_delta_queue) # TODO: rename + delta_from_api: t.Union[DeltaRequest, APIRequest, CommandRequest] = self.get_state_changed_from_queue( + self.api_delta_queue + ) # TODO: rename if delta_from_api: + if isinstance(delta_from_api, DeltaRequest): + delta_from_api = delta_from_api.delta deltas.append(delta_from_api) else: should_get_delta_from_api = False @@ -317,8 +320,19 @@ def maybe_apply_changes(self) -> bool: logger.debug(f"Received {[d.to_dict() for d in deltas]}") - state = self.state + # 1: Process the API / Command Requests first as they might affect the state. 
+ state_deltas = [] for delta in deltas: + if isinstance(delta, (APIRequest, CommandRequest)): + _process_requests(self, delta) + else: + state_deltas.append(delta) + + # 2: Collect the state + state = self.state + + # 3: Apply the state delta + for delta in state_deltas: try: state += delta except Exception as e: @@ -351,8 +365,6 @@ def run_once(self): elif self.stage == AppStage.RESTARTING: return self._apply_restarting() - _process_command_requests(self) - t0 = time() try: @@ -411,8 +423,6 @@ def _run(self) -> bool: self._reset_run_time_monitor() - _populate_commands_endpoint(self) - while not done: done = self.run_once() diff --git a/src/lightning_app/core/flow.py b/src/lightning_app/core/flow.py index f6b6e34e81538..41c46cd868307 100644 --- a/src/lightning_app/core/flow.py +++ b/src/lightning_app/core/flow.py @@ -634,3 +634,36 @@ def my_remote_method(self, name): lightning my_command_name --args name=my_own_name """ raise NotImplementedError + + def configure_api(self): + """Configure the API routes of the LightningFlow. + + Returns a list of HttpMethod such as Post or Get. + + .. code-block:: python + + from lightning_app import LightningFlow + from lightning_app.api import Post + + from pydantic import BaseModel + + + class HandlerModel(BaseModel): + name: str + + + class Flow(L.LightningFlow): + def __init__(self): + super().__init__() + self.names = [] + + def handler(self, config: HandlerModel) -> None: + self.names.append(config.name) + + def configure_api(self): + return [Post("/v1/api/request", self.handler)] + + Once the app is running, you can access the Swagger UI of the app + under the ``/docs`` route. + """ + raise NotImplementedError diff --git a/src/lightning_app/core/queues.py b/src/lightning_app/core/queues.py index efac8230047e0..2b7295d7f327f 100644 --- a/src/lightning_app/core/queues.py +++ b/src/lightning_app/core/queues.py @@ -36,9 +36,7 @@ ORCHESTRATOR_COPY_REQUEST_CONSTANT = "ORCHESTRATOR_COPY_REQUEST" ORCHESTRATOR_COPY_RESPONSE_CONSTANT = "ORCHESTRATOR_COPY_RESPONSE" WORK_QUEUE_CONSTANT = "WORK_QUEUE" -COMMANDS_REQUESTS_QUEUE_CONSTANT = "COMMANDS_REQUESTS_QUEUE" -COMMANDS_RESPONSES_QUEUE_CONSTANT = "COMMANDS_RESPONSES_QUEUE" -COMMANDS_METADATA_QUEUE_CONSTANT = "COMMANDS_METADATA_QUEUE" +API_RESPONSE_QUEUE_CONSTANT = "API_RESPONSE_QUEUE" class QueuingSystem(Enum): @@ -54,18 +52,8 @@ def _get_queue(self, queue_name: str) -> "BaseQueue": else: return SingleProcessQueue(queue_name, default_timeout=STATE_UPDATE_TIMEOUT) - def get_commands_requests_queue(self, queue_id: Optional[str] = None) -> "BaseQueue": - queue_name = f"{queue_id}_{COMMANDS_REQUESTS_QUEUE_CONSTANT}" if queue_id else COMMANDS_REQUESTS_QUEUE_CONSTANT - return self._get_queue(queue_name) - - def get_commands_responses_queue(self, queue_id: Optional[str] = None) -> "BaseQueue": - queue_name = ( - f"{queue_id}_{COMMANDS_RESPONSES_QUEUE_CONSTANT}" if queue_id else COMMANDS_RESPONSES_QUEUE_CONSTANT - ) - return self._get_queue(queue_name) - - def get_commands_metadata_queue(self, queue_id: Optional[str] = None) -> "BaseQueue": - queue_name = f"{queue_id}_{COMMANDS_METADATA_QUEUE_CONSTANT}" if queue_id else COMMANDS_METADATA_QUEUE_CONSTANT + def get_api_response_queue(self, queue_id: Optional[str] = None) -> "BaseQueue": + queue_name = f"{queue_id}_{API_RESPONSE_QUEUE_CONSTANT}" if queue_id else API_RESPONSE_QUEUE_CONSTANT return self._get_queue(queue_name) def get_readiness_queue(self, queue_id: Optional[str] = None) -> "BaseQueue": @@ -98,10 +86,6 @@ def get_api_delta_queue(self, queue_id: 
Optional[str] = None) -> "BaseQueue": queue_name = f"{queue_id}_{API_DELTA_QUEUE_CONSTANT}" if queue_id else API_DELTA_QUEUE_CONSTANT return self._get_queue(queue_name) - def get_api_refresh_queue(self, queue_id: Optional[str] = None) -> "BaseQueue": - queue_name = f"{queue_id}_{API_REFRESH_QUEUE_CONSTANT}" if queue_id else API_REFRESH_QUEUE_CONSTANT - return self._get_queue(queue_name) - def get_orchestrator_request_queue(self, work_name: str, queue_id: Optional[str] = None) -> "BaseQueue": queue_name = ( f"{queue_id}_{ORCHESTRATOR_REQUEST_CONSTANT}_{work_name}" diff --git a/src/lightning_app/runners/backends/backend.py b/src/lightning_app/runners/backends/backend.py index 87bb103823fd2..a944cd4aa9093 100644 --- a/src/lightning_app/runners/backends/backend.py +++ b/src/lightning_app/runners/backends/backend.py @@ -82,11 +82,8 @@ def _prepare_queues(self, app): kw = dict(queue_id=self.queue_id) app.delta_queue = self.queues.get_delta_queue(**kw) app.readiness_queue = self.queues.get_readiness_queue(**kw) - app.commands_requests_queue = self.queues.get_commands_requests_queue(**kw) - app.commands_responses_queue = self.queues.get_commands_responses_queue(**kw) - app.commands_metadata_queue = self.queues.get_commands_metadata_queue(**kw) + app.api_response_queue = self.queues.get_api_response_queue(**kw) app.error_queue = self.queues.get_error_queue(**kw) - app.delta_queue = self.queues.get_delta_queue(**kw) app.api_publish_state_queue = self.queues.get_api_state_publish_queue(**kw) app.api_delta_queue = self.queues.get_api_delta_queue(**kw) app.request_queues = {} diff --git a/src/lightning_app/runners/multiprocess.py b/src/lightning_app/runners/multiprocess.py index 92ec900d89c65..16e373b0a37a2 100644 --- a/src/lightning_app/runners/multiprocess.py +++ b/src/lightning_app/runners/multiprocess.py @@ -3,10 +3,13 @@ from dataclasses import dataclass from typing import Any, Callable, Optional, Union +from lightning_app.api.http_methods import _add_tags_to_api, _validate_api from lightning_app.core.api import start_server from lightning_app.runners.backends import Backend from lightning_app.runners.runtime import Runtime from lightning_app.storage.orchestrator import StorageOrchestrator +from lightning_app.utilities.app_helpers import is_overridden +from lightning_app.utilities.commands.base import _commands_to_api, _prepare_commands from lightning_app.utilities.component import _set_flow_context, _set_frontend_context from lightning_app.utilities.load_app import extract_metadata_from_app from lightning_app.utilities.network import find_free_network_port @@ -60,15 +63,25 @@ def dispatch(self, *args: Any, on_before_run: Optional[Callable] = None, **kwarg if self.start_server: self.app.should_publish_changes_to_api = True has_started_queue = self.backend.queues.get_has_server_started_queue() + + apis = [] + if is_overridden("configure_api", self.app.root): + apis = self.app.root.configure_api() + _validate_api(apis) + _add_tags_to_api(apis, ["app_api"]) + + if is_overridden("configure_commands", self.app.root): + commands = _prepare_commands(self.app) + apis += _commands_to_api(commands) + kwargs = dict( + apis=apis, host=self.host, port=self.port, + api_response_queue=self.app.api_response_queue, api_publish_state_queue=self.app.api_publish_state_queue, api_delta_queue=self.app.api_delta_queue, has_started_queue=has_started_queue, - commands_requests_queue=self.app.commands_requests_queue, - commands_responses_queue=self.app.commands_responses_queue, - 
commands_metadata_queue=self.app.commands_metadata_queue, spec=extract_metadata_from_app(self.app), ) server_proc = multiprocessing.Process(target=start_server, kwargs=kwargs) diff --git a/src/lightning_app/utilities/cli_helpers.py b/src/lightning_app/utilities/cli_helpers.py index fcce96ec64407..6000114c3d4d6 100644 --- a/src/lightning_app/utilities/cli_helpers.py +++ b/src/lightning_app/utilities/cli_helpers.py @@ -49,16 +49,42 @@ def _is_url(id: Optional[str]) -> bool: return False +def _get_metadata_from_openapi(paths: Dict, path: str): + parameters = paths[path]["post"].get("parameters", {}) + tag = paths[path]["post"].get("tags", [None])[0] + cls_path = paths[path]["post"].get("cls_path", None) + cls_name = paths[path]["post"].get("cls_name", None) + + metadata = {"tag": tag, "parameters": {}} + + if cls_path: + metadata["cls_path"] = cls_path + + if cls_name: + metadata["cls_name"] = cls_name + + if not parameters: + return metadata + + metadata["parameters"].update({d["name"]: d["schema"]["type"] for d in parameters}) + return metadata + + +def _extract_command_from_openapi(openapi_resp: Dict) -> Dict[str, Dict[str, str]]: + command_paths = [p for p in openapi_resp["paths"] if p.startswith("/command/")] + return {p.replace("/command/", ""): _get_metadata_from_openapi(openapi_resp["paths"], p) for p in command_paths} + + def _retrieve_application_url_and_available_commands(app_id_or_name_or_url: Optional[str]): """This function is used to retrieve the current url associated with an id.""" if _is_url(app_id_or_name_or_url): url = app_id_or_name_or_url assert url - resp = requests.get(url + "/api/v1/commands") + resp = requests.get(url + "/openapi.json") if resp.status_code != 200: raise Exception(f"The server didn't process the request properly. Found {resp.json()}") - return url, resp.json() + return url, _extract_command_from_openapi(resp.json()) # 2: If no identifier has been provided, evaluate the local application failed_locally = False @@ -66,10 +92,10 @@ def _retrieve_application_url_and_available_commands(app_id_or_name_or_url: Opti if app_id_or_name_or_url is None: try: url = f"http://localhost:{APP_SERVER_PORT}" - resp = requests.get(f"{url}/api/v1/commands") + resp = requests.get(f"{url}/openapi.json") if resp.status_code != 200: raise Exception(f"The server didn't process the request properly. Found {resp.json()}") - return url, resp.json() + return url, _extract_command_from_openapi(resp.json()) except requests.exceptions.ConnectionError: failed_locally = True @@ -88,8 +114,8 @@ def _retrieve_application_url_and_available_commands(app_id_or_name_or_url: Opti if lightningapp.id == app_id_or_name_or_url or lightningapp.name == app_id_or_name_or_url: if lightningapp.status.url == "": raise Exception("The application is starting. Try in a few moments.") - resp = requests.get(lightningapp.status.url + "/api/v1/commands") + resp = requests.get(lightningapp.status.url + "/openapi.json") if resp.status_code != 200: raise Exception(f"The server didn't process the request properly. 
Found {resp.json()}") - return lightningapp.status.url, resp.json() + return lightningapp.status.url, _extract_command_from_openapi(resp.json()) return None, None diff --git a/src/lightning_app/utilities/commands/base.py b/src/lightning_app/utilities/commands/base.py index b87b41b05df42..c74926f542744 100644 --- a/src/lightning_app/utilities/commands/base.py +++ b/src/lightning_app/utilities/commands/base.py @@ -1,6 +1,5 @@ import errno import inspect -import logging import os import os.path as osp import shutil @@ -8,19 +7,18 @@ from getpass import getuser from importlib.util import module_from_spec, spec_from_file_location from tempfile import gettempdir -from typing import Any, Callable, Dict, List, Optional, Tuple -from uuid import uuid4 +from typing import Any, Callable, Dict, List, Optional, Union import requests from pydantic import BaseModel +from lightning_app.api.http_methods import Post +from lightning_app.api.request_types import APIRequest, CommandRequest from lightning_app.utilities.app_helpers import is_overridden from lightning_app.utilities.cloud import _get_project from lightning_app.utilities.network import LightningClient from lightning_app.utilities.state import AppState -_logger = logging.getLogger(__name__) - def makedirs(path: str): r"""Recursive directory creation function.""" @@ -31,31 +29,18 @@ def makedirs(path: str): raise e -class _ClientCommandConfig(BaseModel): - command: str - affiliation: str - params: Dict[str, str] - is_client_command: bool - cls_path: str - cls_name: str - owner: str - requirements: Optional[List[str]] - - class ClientCommand: def __init__(self, method: Callable, requirements: Optional[List[str]] = None) -> None: self.method = method flow = getattr(method, "__self__", None) self.owner = flow.name if flow else None self.requirements = requirements - self.metadata = None self.models: Optional[Dict[str, BaseModel]] = None self.app_url = None self._state = None - def _setup(self, metadata: Dict[str, Any], models: Dict[str, BaseModel], app_url: str) -> None: - self.metadata = metadata - self.models = models + def _setup(self, command_name: str, app_url: str) -> None: + self.command_name = command_name self.app_url = app_url @property @@ -72,67 +57,50 @@ def state(self): def run(self, **cli_kwargs) -> None: """Overrides with the logic to execute on the client side.""" - def invoke_handler(self, **kwargs: Any) -> Dict[str, Any]: - from lightning.app.utilities.state import headers_for - - assert kwargs.keys() == self.models.keys() - for k, v in kwargs.items(): - assert isinstance(v, self.models[k]) - json = { - "command_name": self.metadata["command"], - "command_arguments": {k: v.json() for k, v in kwargs.items()}, - "affiliation": self.metadata["affiliation"], - "id": str(uuid4()), - } - resp = requests.post(self.app_url + "/api/v1/commands", json=json, headers=headers_for({})) + def invoke_handler(self, config: BaseModel) -> Dict[str, Any]: + resp = requests.post(self.app_url + f"/command/{self.command_name}", data=config.json()) assert resp.status_code == 200, resp.json() return resp.json() def _to_dict(self): return {"owner": self.owner, "requirements": self.requirements} - def __call__(self, **kwargs: Any) -> Any: - assert self.models - input = {} - for k, v in kwargs.items(): - input[k] = self.models[k].parse_raw(v) - return self.method(**input) + def __call__(self, **kwargs): + return self.method(**kwargs) def _download_command( - command_metadata: Dict[str, Any], - app_id: Optional[str], + command_name: str, + cls_path: str, + 
cls_name: str, + app_id: Optional[str] = None, debug_mode: bool = False, -) -> Tuple[ClientCommand, Dict[str, BaseModel]]: +) -> ClientCommand: # TODO: This is a skateboard implementation and the final version will rely on versioned # immutable commands for security concerns - config = _ClientCommandConfig(**command_metadata) tmpdir = osp.join(gettempdir(), f"{getuser()}_commands") makedirs(tmpdir) - target_file = osp.join(tmpdir, f"{config.command}.py") + target_file = osp.join(tmpdir, f"{command_name}.py") if app_id: client = LightningClient() project_id = _get_project(client).project_id response = client.lightningapp_instance_service_list_lightningapp_instance_artifacts(project_id, app_id) for artifact in response.artifacts: - if f"commands/{config.command}.py" == artifact.filename: + if f"commands/{command_name}.py" == artifact.filename: r = requests.get(artifact.url, allow_redirects=True) with open(target_file, "wb") as f: f.write(r.content) else: if not debug_mode: - shutil.copy(config.cls_path, target_file) + shutil.copy(cls_path, target_file) - cls_name = config.cls_name - spec = spec_from_file_location(config.cls_name, config.cls_path if debug_mode else target_file) + spec = spec_from_file_location(cls_name, cls_path if debug_mode else target_file) mod = module_from_spec(spec) sys.modules[cls_name] = mod spec.loader.exec_module(mod) - command = getattr(mod, cls_name)(method=None, requirements=config.requirements) - models = {k: getattr(mod, v) for k, v in config.params.items()} - if debug_mode: - shutil.rmtree(tmpdir) - return command, models + command = getattr(mod, cls_name)(method=None, requirements=[]) + shutil.rmtree(tmpdir) + return command def _to_annotation(anno: str) -> str: @@ -142,7 +110,7 @@ def _to_annotation(anno: str) -> str: return anno -def _command_to_method_and_metadata(command: ClientCommand) -> Tuple[Callable, Dict[str, Any]]: +def _validate_client_command(command: ClientCommand): """Extract method and its metadata from a ClientCommand.""" params = inspect.signature(command.method).parameters command_metadata = { @@ -170,8 +138,6 @@ def _command_to_method_and_metadata(command: ClientCommand) -> Tuple[Callable, D raise Exception( f"The provided annotation for the argument {k} shouldn't an instance of pydantic BaseModel." ) - command.models[k] = config - return method, command_metadata def _upload_command(command_name: str, command: ClientCommand) -> Optional[str]: @@ -192,55 +158,68 @@ def _upload_command(command_name: str, command: ClientCommand) -> Optional[str]: return filepath -def _populate_commands_endpoint(app): +def _prepare_commands(app) -> List: if not is_overridden("configure_commands", app.root): - return + return [] - # 1: Populate commands metadata + # 1: Upload the command to s3. commands = app.root.configure_commands() - commands_metadata = [] - command_names = set() for command_mapping in commands: for command_name, command in command_mapping.items(): - is_client_command = isinstance(command, ClientCommand) - extras = {} - if is_client_command: + if isinstance(command, ClientCommand): _upload_command(command_name, command) - command, extras = _command_to_method_and_metadata(command) - if command_name in command_names: - raise Exception(f"The component name {command_name} has already been used. 
They need to be unique.") - command_names.add(command_name) - params = inspect.signature(command).parameters - commands_metadata.append( - { - "command": command_name, - "affiliation": command.__self__.name, - "params": list(params.keys()), - "is_client_command": is_client_command, - **extras, - } - ) - # 1.2: Pass the collected commands through the queue to the Rest API. - app.commands_metadata_queue.put(commands_metadata) + # 2: Cache the commands on the app. app.commands = commands + return commands -def _process_command_requests(app): - if not is_overridden("configure_commands", app.root): - return - - # 1: Populate commands metadata - commands = app.commands - - # 2: Collect requests metadata - command_query = app.get_state_changed_from_queue(app.commands_requests_queue) - if command_query: - for command in commands: - for command_name, method in command.items(): - if command_query["command_name"] == command_name: - # 2.1: Evaluate the method associated to a specific command. - # Validation is done on the CLI side. - response = method(**command_query["command_arguments"]) - app.commands_responses_queue.put({"response": response, "id": command_query["id"]}) - app._has_updated = True +def _process_api_request(app, request: APIRequest) -> None: + flow = app.get_component_by_name(request.name) + method = getattr(flow, request.method_name) + response = method(*request.args, **request.kwargs) + app.api_response_queue.put({"response": response, "id": request.id}) + + +def _process_command_requests(app, request: CommandRequest) -> None: + for command in app.commands: + for command_name, method in command.items(): + if request.method_name == command_name: + # 2.1: Evaluate the method associated to a specific command. + # Validation is done on the CLI side. 
+ response = method(*request.args, **request.kwargs) + app.api_response_queue.put({"response": response, "id": request.id}) + + +def _process_requests(app, request: Union[APIRequest, CommandRequest]) -> None: + """Convert user commands to API endpoint.""" + if isinstance(request, APIRequest): + _process_api_request(app, request) + else: + _process_command_requests(app, request) + + +def _collect_open_api_extras(command) -> Dict: + if not isinstance(command, ClientCommand): + return {} + return { + "cls_path": inspect.getfile(command.__class__), + "cls_name": command.__class__.__name__, + } + + +def _commands_to_api(commands: List[Dict[str, Union[Callable, ClientCommand]]]) -> List: + """Convert user commands to API endpoint.""" + api = [] + for command in commands: + for k, v in command.items(): + api.append( + Post( + f"/command/{k}", + v.method if isinstance(v, ClientCommand) else v, + method_name=k, + tags=["app_client_command"] if isinstance(v, ClientCommand) else ["app_command"], + openapi_extra=_collect_open_api_extras(v), + ) + ) + return api diff --git a/src/lightning_app/utilities/enum.py b/src/lightning_app/utilities/enum.py index dbf20413aa9d9..2b88d93169930 100644 --- a/src/lightning_app/utilities/enum.py +++ b/src/lightning_app/utilities/enum.py @@ -72,3 +72,9 @@ def make_status(stage: str, message: Optional[str] = None, reason: Optional[str] class CacheCallsKeys: LATEST_CALL_HASH = "latest_call_hash" + + +class OpenAPITags: + APP_CLIENT_COMMAND = "app_client_command" + APP_COMMAND = "app_command" + APP_API = "app_api" diff --git a/src/lightning_app/utilities/network.py b/src/lightning_app/utilities/network.py index 7fd03750a515d..050734723acc1 100644 --- a/src/lightning_app/utilities/network.py +++ b/src/lightning_app/utilities/network.py @@ -48,11 +48,12 @@ def _configure_session() -> Session: return http -def _check_service_url_is_ready(url: str, timeout: float = 100) -> bool: +def _check_service_url_is_ready(url: str, timeout: float = 5) -> bool: try: response = requests.get(url, timeout=timeout) return response.status_code in (200, 404) except (ConnectionError, ConnectTimeout, ReadTimeout): + logger.debug(f"The url {url} is not ready.") return False diff --git a/tests/tests_app/core/test_lightning_api.py b/tests/tests_app/core/test_lightning_api.py index edd2896d1951d..1b2bf2fb52fd9 100644 --- a/tests/tests_app/core/test_lightning_api.py +++ b/tests/tests_app/core/test_lightning_api.py @@ -2,15 +2,27 @@ import multiprocessing as mp import os from copy import deepcopy +from multiprocessing import Process +from time import sleep from unittest import mock import pytest +import requests from deepdiff import DeepDiff, Delta from httpx import AsyncClient +from pydantic import BaseModel from lightning_app import LightningApp, LightningFlow, LightningWork +from lightning_app.api.http_methods import Post from lightning_app.core import api -from lightning_app.core.api import fastapi_service, global_app_state_store, start_server, UIRefresher +from lightning_app.core.api import ( + fastapi_service, + global_app_state_store, + register_global_routes, + start_server, + UIRefresher, +) +from lightning_app.core.constants import APP_SERVER_PORT from lightning_app.runners import MultiProcessRuntime, SingleProcessRuntime from lightning_app.storage.drive import Drive from lightning_app.testing.helpers import MockQueue @@ -20,6 +32,8 @@ from lightning_app.utilities.redis import check_if_redis_running from lightning_app.utilities.state import AppState, headers_for +register_global_routes() + 
class WorkA(LightningWork): def __init__(self): @@ -161,12 +175,11 @@ def test_update_publish_state_and_maybe_refresh_ui(): app = AppStageTestingApp(FlowA(), debug=True) publish_state_queue = MockQueue("publish_state_queue") - commands_metadata_queue = MockQueue("commands_metadata_queue") - commands_responses_queue = MockQueue("commands_metadata_queue") + api_response_queue = MockQueue("api_response_queue") publish_state_queue.put(app.state_with_changes) - thread = UIRefresher(publish_state_queue, commands_metadata_queue, commands_responses_queue) + thread = UIRefresher(publish_state_queue, api_response_queue) thread.run_once() assert global_app_state_store.get_app_state("1234") == app.state_with_changes @@ -192,18 +205,14 @@ def get(self, timeout: int = 0): publish_state_queue = InfiniteQueue("publish_state_queue") change_state_queue = MockQueue("change_state_queue") has_started_queue = MockQueue("has_started_queue") - commands_requests_queue = MockQueue("commands_requests_queue") - commands_responses_queue = MockQueue("commands_responses_queue") - commands_metadata_queue = MockQueue("commands_metadata_queue") + api_response_queue = MockQueue("api_response_queue") state = app.state_with_changes publish_state_queue.put(state) spec = extract_metadata_from_app(app) ui_refresher = start_server( publish_state_queue, change_state_queue, - commands_requests_queue, - commands_responses_queue, - commands_metadata_queue, + api_response_queue, has_started_queue=has_started_queue, uvicorn_run=False, spec=spec, @@ -343,16 +352,12 @@ def test_start_server_started(): api_publish_state_queue = mp.Queue() api_delta_queue = mp.Queue() has_started_queue = mp.Queue() - commands_requests_queue = mp.Queue() - commands_responses_queue = mp.Queue() - commands_metadata_queue = mp.Queue() + api_response_queue = mp.Queue() kwargs = dict( api_publish_state_queue=api_publish_state_queue, api_delta_queue=api_delta_queue, has_started_queue=has_started_queue, - commands_requests_queue=commands_requests_queue, - commands_responses_queue=commands_responses_queue, - commands_metadata_queue=commands_metadata_queue, + api_response_queue=api_response_queue, port=1111, ) @@ -372,18 +377,14 @@ def test_start_server_info_message(ui_refresher, uvicorn_run, caplog, monkeypatc api_publish_state_queue = MockQueue() api_delta_queue = MockQueue() has_started_queue = MockQueue() - commands_requests_queue = MockQueue() - commands_responses_queue = MockQueue() - commands_metadata_queue = MockQueue() + api_response_queue = MockQueue() kwargs = dict( host=host, port=1111, api_publish_state_queue=api_publish_state_queue, api_delta_queue=api_delta_queue, has_started_queue=has_started_queue, - commands_requests_queue=commands_requests_queue, - commands_responses_queue=commands_responses_queue, - commands_metadata_queue=commands_metadata_queue, + api_response_queue=api_response_queue, ) monkeypatch.setattr(api, "logger", logging.getLogger()) @@ -395,3 +396,65 @@ def test_start_server_info_message(ui_refresher, uvicorn_run, caplog, monkeypatc ui_refresher.assert_called_once() uvicorn_run.assert_called_once_with(host="0.0.0.1", port=1111, log_level="error", app=mock.ANY) + + +class InputRequestModel(BaseModel): + name: str + + +class OutputRequestModel(BaseModel): + name: str + counter: int + + +class FlowAPI(LightningFlow): + def __init__(self): + super().__init__() + self.counter = 0 + + def run(self): + if self.counter == 2: + sleep(0.5) + self._exit() + + def request(self, config: InputRequestModel) -> OutputRequestModel: + self.counter += 1 
+ return OutputRequestModel(name=config.name, counter=self.counter) + + def configure_api(self): + return [Post("/api/v1/request", self.request)] + + +def target(): + app = LightningApp(FlowAPI()) + MultiProcessRuntime(app).dispatch() + + +def test_configure_api(): + + process = Process(target=target) + process.start() + time_left = 15 + while time_left > 0: + try: + requests.get(f"http://localhost:{APP_SERVER_PORT}/healthz") + break + except requests.exceptions.ConnectionError: + sleep(0.1) + time_left -= 0.1 + + response = requests.post( + f"http://localhost:{APP_SERVER_PORT}/api/v1/request", data=InputRequestModel(name="hello").json() + ) + assert response.json() == {"name": "hello", "counter": 1} + response = requests.post( + f"http://localhost:{APP_SERVER_PORT}/api/v1/request", data=InputRequestModel(name="hello").json() + ) + assert response.json() == {"name": "hello", "counter": 2} + time_left = 15 + while time_left > 0: + if process.exitcode == 0: + break + sleep(0.1) + time_left -= 0.1 + assert process.exitcode == 0 diff --git a/tests/tests_app/utilities/test_app_logs.py b/tests/tests_app/utilities/test_app_logs.py index e7384dd72d6e2..7a0fe087e7c29 100644 --- a/tests/tests_app/utilities/test_app_logs.py +++ b/tests/tests_app/utilities/test_app_logs.py @@ -1,4 +1,5 @@ from datetime import datetime +from time import sleep from unittest.mock import MagicMock from lightning_app.utilities.app_logs import _LogEvent @@ -6,6 +7,7 @@ def test_log_event(): event_1 = _LogEvent("", datetime.now(), MagicMock(), MagicMock()) + sleep(0.1) event_2 = _LogEvent("", datetime.now(), MagicMock(), MagicMock()) assert event_1 < event_2 assert event_1 <= event_2 diff --git a/tests/tests_app/utilities/test_commands.py b/tests/tests_app/utilities/test_commands.py index ed7f386395282..1be35a3a2e290 100644 --- a/tests/tests_app/utilities/test_commands.py +++ b/tests/tests_app/utilities/test_commands.py @@ -14,7 +14,7 @@ from lightning_app.core.constants import APP_SERVER_PORT from lightning_app.runners import MultiProcessRuntime from lightning_app.testing.helpers import RunIf -from lightning_app.utilities.commands.base import _command_to_method_and_metadata, _download_command, ClientCommand +from lightning_app.utilities.commands.base import _download_command, _validate_client_command, ClientCommand from lightning_app.utilities.state import AppState @@ -25,7 +25,6 @@ class SweepConfig(BaseModel): class SweepCommand(ClientCommand): def run(self) -> None: - print(sys.argv) parser = argparse.ArgumentParser() parser.add_argument("--sweep_name", type=str) parser.add_argument("--num_trials", type=int) @@ -91,15 +90,15 @@ def run_failure_2(name: CustomModel): @RunIf(skip_windows=True) -def test_command_to_method_and_metadata(): +def test_validate_client_command(): with pytest.raises(Exception, match="The provided annotation for the argument name"): - _command_to_method_and_metadata(ClientCommand(run_failure_0)) + _validate_client_command(ClientCommand(run_failure_0)) with pytest.raises(Exception, match="annotate your method"): - _command_to_method_and_metadata(ClientCommand(run_failure_1)) + _validate_client_command(ClientCommand(run_failure_1)) with pytest.raises(Exception, match="lightning_app/utilities/commands/base.py"): - _command_to_method_and_metadata(ClientCommand(run_failure_2)) + _validate_client_command(ClientCommand(run_failure_2)) def test_client_commands(monkeypatch): @@ -115,17 +114,13 @@ def test_client_commands(monkeypatch): url = "http//" kwargs = {"something": "1", "something_else": "1"} command 
= DummyCommand(run) - _, command_metadata = _command_to_method_and_metadata(command) - command_metadata.update( - { - "command": "dummy", - "affiliation": "root", - "is_client_command": True, - "owner": "root", - } + _validate_client_command(command) + client_command = _download_command( + command_name="something", + cls_path=__file__, + cls_name="DummyCommand", ) - client_command, models = _download_command(command_metadata, None) - client_command._setup(metadata=command_metadata, models=models, app_url=url) + client_command._setup("something", app_url=url) client_command.run(**kwargs) @@ -153,10 +148,12 @@ def test_configure_commands(monkeypatch): state = AppState() state._request_state() assert state.names == ["something"] - monkeypatch.setattr(sys, "argv", ["lightning", "sweep", "--sweep_name", "my_name", "--num_trials", "1"]) + monkeypatch.setattr(sys, "argv", ["lightning", "sweep", "--sweep_name=my_name", "--num_trials=1"]) app_command() time_left = 15 - while time_left > 0 and process.exitcode != 0: + while time_left > 0: + if process.exitcode == 0: + break sleep(0.1) time_left -= 0.1 assert process.exitcode == 0 diff --git a/tests/tests_app_examples/test_commands.py b/tests/tests_app_examples/test_commands.py deleted file mode 100644 index 236e587e23101..0000000000000 --- a/tests/tests_app_examples/test_commands.py +++ /dev/null @@ -1,32 +0,0 @@ -import os -from subprocess import Popen -from time import sleep -from unittest import mock - -import pytest -from tests_app import _PROJECT_ROOT - -from lightning_app.testing.testing import run_app_in_cloud - - -@mock.patch.dict(os.environ, {"SKIP_LIGHTING_UTILITY_WHEELS_BUILD": "0"}) -@pytest.mark.cloud -def test_commands_example_cloud() -> None: - with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "examples/app_commands")) as ( - admin_page, - _, - fetch_logs, - _, - ): - app_id = admin_page.url.split("/")[-1] - cmd = f"lightning trigger_with_client_command --name=something --app_id {app_id}" - Popen(cmd, shell=True).wait() - cmd = f"lightning trigger_without_client_command --name=else --app_id {app_id}" - Popen(cmd, shell=True).wait() - - has_logs = False - while not has_logs: - for log in fetch_logs(["flow"]): - if "['something', 'else']" in log: - has_logs = True - sleep(1) diff --git a/tests/tests_app_examples/test_commands_and_api.py b/tests/tests_app_examples/test_commands_and_api.py new file mode 100644 index 0000000000000..8d84cf4847ebd --- /dev/null +++ b/tests/tests_app_examples/test_commands_and_api.py @@ -0,0 +1,42 @@ +import os +from subprocess import Popen +from time import sleep + +import pytest +import requests +from tests_app import _PROJECT_ROOT + +from lightning_app.testing.testing import run_app_in_cloud + + +@pytest.mark.cloud +def test_commands_and_api_example_cloud() -> None: + with run_app_in_cloud(os.path.join(_PROJECT_ROOT, "examples/app_commands_and_api")) as ( + admin_page, + view_page, + fetch_logs, + _, + ): + # 1: Collect the app_id + app_id = admin_page.url.split("/")[-1] + + # 2: Send the first command with the client + cmd = f"lightning command_with_client --name=this --app_id {app_id}" + Popen(cmd, shell=True).wait() + + # 3: Send the second command without a client + cmd = f"lightning command_without_client --name=is --app_id {app_id}" + Popen(cmd, shell=True).wait() + + # 4: Send a request to the Rest API directly. 
+ base_url = view_page.url.replace("/view", "").replace("/child_flow", "") + resp = requests.post(base_url + "/user/command_without_client?name=awesome") + assert resp.status_code == 200, resp.json() + + # 5: Validate the logs. + has_logs = False + while not has_logs: + for log in fetch_logs(): + if "['this', 'is', 'awesome']" in log: + has_logs = True + sleep(1) From 7c8c996f6acbcd3f497b6274bc96e7767e5695b1 Mon Sep 17 00:00:00 2001 From: Rick Izzo Date: Thu, 11 Aug 2022 09:22:59 -0400 Subject: [PATCH 157/230] Feature GRID-9731: Update Lightning Cloud.py Backend to Accept Drive Specs (2/2) (#14106) initial work adding drives to create work API from framework cloud dispatcher --- requirements/app/base.txt | 2 +- src/lightning_app/runners/cloud.py | 43 ++++ src/lightning_app/storage/drive.py | 2 +- tests/tests_app/runners/test_cloud.py | 356 ++++++++++++++++++++++++++ 4 files changed, 401 insertions(+), 2 deletions(-) diff --git a/requirements/app/base.txt b/requirements/app/base.txt index 02eeb04bfa218..fcde2f18a300a 100644 --- a/requirements/app/base.txt +++ b/requirements/app/base.txt @@ -1,4 +1,4 @@ -lightning-cloud==0.5.0 +lightning-cloud==0.5.3 packaging deepdiff>=5.7.0, <=5.8.1 starsessions diff --git a/src/lightning_app/runners/cloud.py b/src/lightning_app/runners/cloud.py index 957b60b5d2ab5..2cd98ebe4cf68 100644 --- a/src/lightning_app/runners/cloud.py +++ b/src/lightning_app/runners/cloud.py @@ -18,15 +18,22 @@ Gridv1ImageSpec, V1BuildSpec, V1DependencyFileInfo, + V1Drive, + V1DriveSpec, + V1DriveStatus, + V1DriveType, V1EnvVar, V1Flowserver, V1LightningappInstanceSpec, V1LightningappInstanceState, + V1LightningworkDrives, V1LightningworkSpec, + V1Metadata, V1NetworkConfig, V1PackageManager, V1ProjectClusterBinding, V1PythonDependencyInfo, + V1SourceType, V1UserRequestedComputeConfig, V1Work, ) @@ -36,6 +43,7 @@ from lightning_app.runners.backends.cloud import CloudBackend from lightning_app.runners.runtime import Runtime from lightning_app.source_code import LocalSourceCodeDir +from lightning_app.storage import Drive from lightning_app.utilities.cloud import _get_project from lightning_app.utilities.dependency_caching import get_hash from lightning_app.utilities.packaging.app_config import AppConfig, find_config_file @@ -107,10 +115,45 @@ def dispatch( preemptible=work.cloud_compute.preemptible, shm_size=work.cloud_compute.shm_size, ) + + drive_specs: List[V1LightningworkDrives] = [] + for drive_attr_name, drive in [ + (k, getattr(work, k)) for k in work._state if isinstance(getattr(work, k), Drive) + ]: + if drive.protocol == "lit://": + drive_type = V1DriveType.NO_MOUNT_S3 + source_type = V1SourceType.S3 + elif drive.protocol == "s3://": + drive_type = V1DriveType.INDEXED_S3 + source_type = V1SourceType.S3 + else: + raise RuntimeError( + f"unknown drive protocol `{drive.protocol}`. Please verify this " + f"drive type has been configured for use in the cloud dispatcher." 
+ ) + + drive_specs.append( + V1LightningworkDrives( + drive=V1Drive( + metadata=V1Metadata( + name=f"{work.name}.{drive_attr_name}", + ), + spec=V1DriveSpec( + drive_type=drive_type, + source_type=source_type, + source=f"{drive.protocol}{drive.id}", + ), + status=V1DriveStatus(), + ), + mount_location=str(drive.root_folder), + ), + ) + random_name = "".join(random.choice(string.ascii_lowercase) for _ in range(5)) spec = V1LightningworkSpec( build_spec=build_spec, cluster_id=cluster_id, + drives=drive_specs, user_requested_compute_config=user_compute_config, network_config=[V1NetworkConfig(name=random_name, port=work.port)], ) diff --git a/src/lightning_app/storage/drive.py b/src/lightning_app/storage/drive.py index b69d2581851b8..f72ad38b6e130 100644 --- a/src/lightning_app/storage/drive.py +++ b/src/lightning_app/storage/drive.py @@ -59,7 +59,7 @@ def __init__( if self.protocol != "s3://" and "/" in self.id: raise Exception(f"The id should be unique to identify your drive. Found `{self.id}`.") - self.root_folder = pathlib.Path(root_folder).resolve() if root_folder else os.getcwd() + self.root_folder = pathlib.Path(root_folder).resolve() if root_folder else pathlib.Path(os.getcwd()) if not os.path.isdir(self.root_folder): raise Exception(f"The provided root_folder isn't a directory: {root_folder}") self.component_name = component_name diff --git a/tests/tests_app/runners/test_cloud.py b/tests/tests_app/runners/test_cloud.py index 4b1cf08e8554d..640eb9c114c2d 100644 --- a/tests/tests_app/runners/test_cloud.py +++ b/tests/tests_app/runners/test_cloud.py @@ -1,4 +1,5 @@ import logging +from copy import copy from pathlib import Path from unittest import mock from unittest.mock import MagicMock @@ -9,21 +10,29 @@ Gridv1ImageSpec, V1BuildSpec, V1DependencyFileInfo, + V1Drive, + V1DriveSpec, + V1DriveStatus, + V1DriveType, V1LightningappInstanceState, + V1LightningworkDrives, V1LightningworkSpec, V1ListLightningappInstancesResponse, V1ListMembershipsResponse, V1Membership, + V1Metadata, V1NetworkConfig, V1PackageManager, V1ProjectClusterBinding, V1PythonDependencyInfo, + V1SourceType, V1UserRequestedComputeConfig, V1Work, ) from lightning_app import LightningApp, LightningWork from lightning_app.runners import backends, cloud +from lightning_app.storage import Drive from lightning_app.utilities.cloud import _get_project from lightning_app.utilities.dependency_caching import get_hash @@ -33,6 +42,25 @@ def run(self): print("my run") +class WorkWithSingleDrive(LightningWork): + def __init__(self): + super().__init__() + self.drive = None + + def run(self): + pass + + +class WorkWithTwoDrives(LightningWork): + def __init__(self): + super().__init__() + self.lit_drive = None + self.s3_drive = None + + def run(self): + pass + + class TestAppCreationClient: """Testing the calls made using GridRestClient to create the app.""" @@ -250,6 +278,134 @@ def test_call_with_work_app(self, lightningapps, monkeypatch, tmpdir): ), image="random_base_public_image", ), + drives=[], + user_requested_compute_config=V1UserRequestedComputeConfig( + name="default", count=1, disk_size=0, preemptible=False, shm_size=0 + ), + network_config=[V1NetworkConfig(name=mock.ANY, host=None, port=8080)], + ), + ) + ], + ) + mock_client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with( + "test-project-id", mock.ANY, expected_body + ) + + # running dispatch with disabled dependency cache + mock_client.reset_mock() + monkeypatch.setattr(cloud, "DISABLE_DEPENDENCY_CACHE", True) + 
expected_body.dependency_cache_key = None + cloud_runtime.dispatch() + mock_client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with( + "test-project-id", mock.ANY, expected_body + ) + else: + mock_client.lightningapp_v2_service_create_lightningapp_release_instance.assert_called_once_with( + "test-project-id", mock.ANY, mock.ANY, mock.ANY + ) + + @mock.patch("lightning_app.runners.backends.cloud.LightningClient", mock.MagicMock()) + @pytest.mark.parametrize("lightningapps", [[], [MagicMock()]]) + def test_call_with_work_app_and_attached_drives(self, lightningapps, monkeypatch, tmpdir): + source_code_root_dir = Path(tmpdir / "src").absolute() + source_code_root_dir.mkdir() + Path(source_code_root_dir / ".lightning").write_text("name: myapp") + requirements_file = Path(source_code_root_dir / "requirements.txt") + Path(requirements_file).touch() + + mock_client = mock.MagicMock() + if lightningapps: + lightningapps[0].status.phase = V1LightningappInstanceState.STOPPED + mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = ( + V1ListLightningappInstancesResponse(lightningapps=lightningapps) + ) + lightning_app_instance = MagicMock() + mock_client.lightningapp_v2_service_create_lightningapp_release = MagicMock(return_value=lightning_app_instance) + mock_client.lightningapp_v2_service_create_lightningapp_release_instance = MagicMock( + return_value=lightning_app_instance + ) + existing_instance = MagicMock() + existing_instance.status.phase = V1LightningappInstanceState.STOPPED + mock_client.lightningapp_service_get_lightningapp = MagicMock(return_value=existing_instance) + cloud_backend = mock.MagicMock() + cloud_backend.client = mock_client + monkeypatch.setattr(backends, "CloudBackend", mock.MagicMock(return_value=cloud_backend)) + monkeypatch.setattr(cloud, "LocalSourceCodeDir", mock.MagicMock()) + monkeypatch.setattr(cloud, "_prepare_lightning_wheels_and_requirements", mock.MagicMock()) + app = mock.MagicMock() + flow = mock.MagicMock() + + mocked_drive = MagicMock(spec=Drive) + setattr(mocked_drive, "id", "foobar") + setattr(mocked_drive, "protocol", "lit://") + setattr(mocked_drive, "component_name", "test-work") + setattr(mocked_drive, "allow_duplicates", False) + setattr(mocked_drive, "root_folder", tmpdir) + # deepcopy on a MagicMock instance will return an empty magicmock instance. 
To + # overcome this we set the __deepcopy__ method `return_value` to equal what + # should be the results of the deepcopy operation (an instance of the original class) + mocked_drive.__deepcopy__.return_value = copy(mocked_drive) + + work = WorkWithSingleDrive() + monkeypatch.setattr(work, "drive", mocked_drive) + monkeypatch.setattr(work, "_state", {"_port", "drive"}) + monkeypatch.setattr(work, "_name", "test-work") + monkeypatch.setattr(work._cloud_build_config, "build_commands", lambda: ["echo 'start'"]) + monkeypatch.setattr(work._cloud_build_config, "requirements", ["torch==1.0.0", "numpy==1.0.0"]) + monkeypatch.setattr(work._cloud_build_config, "image", "random_base_public_image") + monkeypatch.setattr(work._cloud_compute, "disk_size", 0) + monkeypatch.setattr(work._cloud_compute, "preemptible", False) + monkeypatch.setattr(work, "_port", 8080) + + flow.works = lambda recurse: [work] + app.flows = [flow] + cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file=(source_code_root_dir / "entrypoint.py")) + monkeypatch.setattr( + "lightning_app.runners.cloud._get_project", + lambda x: V1Membership(name="test-project", project_id="test-project-id"), + ) + cloud_runtime.dispatch() + + if lightningapps: + expected_body = Body8( + description=None, + local_source=True, + app_entrypoint_file="entrypoint.py", + enable_app_server=True, + flow_servers=[], + dependency_cache_key=get_hash(requirements_file), + image_spec=Gridv1ImageSpec( + dependency_file_info=V1DependencyFileInfo( + package_manager=V1PackageManager.PIP, path="requirements.txt" + ) + ), + works=[ + V1Work( + name="test-work", + spec=V1LightningworkSpec( + build_spec=V1BuildSpec( + commands=["echo 'start'"], + python_dependencies=V1PythonDependencyInfo( + package_manager=V1PackageManager.PIP, packages="torch==1.0.0\nnumpy==1.0.0" + ), + image="random_base_public_image", + ), + drives=[ + V1LightningworkDrives( + drive=V1Drive( + metadata=V1Metadata( + name="test-work.drive", + ), + spec=V1DriveSpec( + drive_type=V1DriveType.NO_MOUNT_S3, + source_type=V1SourceType.S3, + source="lit://foobar", + ), + status=V1DriveStatus(), + ), + mount_location=str(tmpdir), + ), + ], user_requested_compute_config=V1UserRequestedComputeConfig( name="default", count=1, disk_size=0, preemptible=False, shm_size=0 ), @@ -275,6 +431,206 @@ def test_call_with_work_app(self, lightningapps, monkeypatch, tmpdir): "test-project-id", mock.ANY, mock.ANY, mock.ANY ) + @mock.patch("lightning_app.runners.backends.cloud.LightningClient", mock.MagicMock()) + @pytest.mark.parametrize("lightningapps", [[], [MagicMock()]]) + def test_call_with_work_app_and_multiple_attached_drives(self, lightningapps, monkeypatch, tmpdir): + source_code_root_dir = Path(tmpdir / "src").absolute() + source_code_root_dir.mkdir() + Path(source_code_root_dir / ".lightning").write_text("name: myapp") + requirements_file = Path(source_code_root_dir / "requirements.txt") + Path(requirements_file).touch() + + mock_client = mock.MagicMock() + if lightningapps: + lightningapps[0].status.phase = V1LightningappInstanceState.STOPPED + mock_client.lightningapp_instance_service_list_lightningapp_instances.return_value = ( + V1ListLightningappInstancesResponse(lightningapps=lightningapps) + ) + lightning_app_instance = MagicMock() + mock_client.lightningapp_v2_service_create_lightningapp_release = MagicMock(return_value=lightning_app_instance) + mock_client.lightningapp_v2_service_create_lightningapp_release_instance = MagicMock( + return_value=lightning_app_instance + ) + existing_instance = 
MagicMock() + existing_instance.status.phase = V1LightningappInstanceState.STOPPED + mock_client.lightningapp_service_get_lightningapp = MagicMock(return_value=existing_instance) + cloud_backend = mock.MagicMock() + cloud_backend.client = mock_client + monkeypatch.setattr(backends, "CloudBackend", mock.MagicMock(return_value=cloud_backend)) + monkeypatch.setattr(cloud, "LocalSourceCodeDir", mock.MagicMock()) + monkeypatch.setattr(cloud, "_prepare_lightning_wheels_and_requirements", mock.MagicMock()) + app = mock.MagicMock() + flow = mock.MagicMock() + + mocked_lit_drive = MagicMock(spec=Drive) + setattr(mocked_lit_drive, "id", "foobar") + setattr(mocked_lit_drive, "protocol", "lit://") + setattr(mocked_lit_drive, "component_name", "test-work") + setattr(mocked_lit_drive, "allow_duplicates", False) + setattr(mocked_lit_drive, "root_folder", tmpdir) + # deepcopy on a MagicMock instance will return an empty magicmock instance. To + # overcome this we set the __deepcopy__ method `return_value` to equal what + # should be the results of the deepcopy operation (an instance of the original class) + mocked_lit_drive.__deepcopy__.return_value = copy(mocked_lit_drive) + + mocked_s3_drive = MagicMock(spec=Drive) + setattr(mocked_s3_drive, "id", "some-bucket/path/") + setattr(mocked_s3_drive, "protocol", "s3://") + setattr(mocked_s3_drive, "component_name", "test-work") + setattr(mocked_s3_drive, "allow_duplicates", False) + setattr(mocked_s3_drive, "root_folder", "/hello/") + # deepcopy on a MagicMock instance will return an empty magicmock instance. To + # overcome this we set the __deepcopy__ method `return_value` to equal what + # should be the results of the deepcopy operation (an instance of the original class) + mocked_s3_drive.__deepcopy__.return_value = copy(mocked_s3_drive) + + work = WorkWithTwoDrives() + monkeypatch.setattr(work, "lit_drive", mocked_lit_drive) + monkeypatch.setattr(work, "s3_drive", mocked_s3_drive) + monkeypatch.setattr(work, "_state", {"_port", "_name", "lit_drive", "s3_drive"}) + monkeypatch.setattr(work, "_name", "test-work") + monkeypatch.setattr(work._cloud_build_config, "build_commands", lambda: ["echo 'start'"]) + monkeypatch.setattr(work._cloud_build_config, "requirements", ["torch==1.0.0", "numpy==1.0.0"]) + monkeypatch.setattr(work._cloud_build_config, "image", "random_base_public_image") + monkeypatch.setattr(work._cloud_compute, "disk_size", 0) + monkeypatch.setattr(work._cloud_compute, "preemptible", False) + monkeypatch.setattr(work, "_port", 8080) + + flow.works = lambda recurse: [work] + app.flows = [flow] + cloud_runtime = cloud.CloudRuntime(app=app, entrypoint_file=(source_code_root_dir / "entrypoint.py")) + monkeypatch.setattr( + "lightning_app.runners.cloud._get_project", + lambda x: V1Membership(name="test-project", project_id="test-project-id"), + ) + cloud_runtime.dispatch() + + if lightningapps: + s3_drive_spec = V1LightningworkDrives( + drive=V1Drive( + metadata=V1Metadata( + name="test-work.s3_drive", + ), + spec=V1DriveSpec( + drive_type=V1DriveType.INDEXED_S3, + source_type=V1SourceType.S3, + source="s3://some-bucket/path/", + ), + status=V1DriveStatus(), + ), + mount_location="/hello/", + ) + lit_drive_spec = V1LightningworkDrives( + drive=V1Drive( + metadata=V1Metadata( + name="test-work.lit_drive", + ), + spec=V1DriveSpec( + drive_type=V1DriveType.NO_MOUNT_S3, + source_type=V1SourceType.S3, + source="lit://foobar", + ), + status=V1DriveStatus(), + ), + mount_location=str(tmpdir), + ) + + # order of drives in the spec is non-deterministic, so 
there are two options + # depending for the expected body value on which drive is ordered in the list first. + + expected_body_option_1 = Body8( + description=None, + local_source=True, + app_entrypoint_file="entrypoint.py", + enable_app_server=True, + flow_servers=[], + dependency_cache_key=get_hash(requirements_file), + image_spec=Gridv1ImageSpec( + dependency_file_info=V1DependencyFileInfo( + package_manager=V1PackageManager.PIP, path="requirements.txt" + ) + ), + works=[ + V1Work( + name="test-work", + spec=V1LightningworkSpec( + build_spec=V1BuildSpec( + commands=["echo 'start'"], + python_dependencies=V1PythonDependencyInfo( + package_manager=V1PackageManager.PIP, packages="torch==1.0.0\nnumpy==1.0.0" + ), + image="random_base_public_image", + ), + drives=[lit_drive_spec, s3_drive_spec], + user_requested_compute_config=V1UserRequestedComputeConfig( + name="default", count=1, disk_size=0, preemptible=False, shm_size=0 + ), + network_config=[V1NetworkConfig(name=mock.ANY, host=None, port=8080)], + ), + ) + ], + ) + + expected_body_option_2 = Body8( + description=None, + local_source=True, + app_entrypoint_file="entrypoint.py", + enable_app_server=True, + flow_servers=[], + dependency_cache_key=get_hash(requirements_file), + image_spec=Gridv1ImageSpec( + dependency_file_info=V1DependencyFileInfo( + package_manager=V1PackageManager.PIP, path="requirements.txt" + ) + ), + works=[ + V1Work( + name="test-work", + spec=V1LightningworkSpec( + build_spec=V1BuildSpec( + commands=["echo 'start'"], + python_dependencies=V1PythonDependencyInfo( + package_manager=V1PackageManager.PIP, packages="torch==1.0.0\nnumpy==1.0.0" + ), + image="random_base_public_image", + ), + drives=[s3_drive_spec, lit_drive_spec], + user_requested_compute_config=V1UserRequestedComputeConfig( + name="default", count=1, disk_size=0, preemptible=False, shm_size=0 + ), + network_config=[V1NetworkConfig(name=mock.ANY, host=None, port=8080)], + ), + ) + ], + ) + + # try both options for the expected body to avoid false + # positive test failures depending on system randomness + + expected_body = expected_body_option_1 + try: + mock_client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with( + "test-project-id", mock.ANY, expected_body + ) + except Exception: + expected_body = expected_body_option_2 + mock_client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with( + "test-project-id", mock.ANY, expected_body + ) + + # running dispatch with disabled dependency cache + mock_client.reset_mock() + monkeypatch.setattr(cloud, "DISABLE_DEPENDENCY_CACHE", True) + expected_body.dependency_cache_key = None + cloud_runtime.dispatch() + mock_client.lightningapp_v2_service_create_lightningapp_release.assert_called_once_with( + "test-project-id", mock.ANY, expected_body + ) + else: + mock_client.lightningapp_v2_service_create_lightningapp_release_instance.assert_called_once_with( + "test-project-id", mock.ANY, mock.ANY, mock.ANY + ) + @mock.patch("lightning_app.core.queues.QueuingSystem", MagicMock()) @mock.patch("lightning_app.runners.backends.cloud.LightningClient", MagicMock()) From 98ded4524f373d906aac475b6a7599b6f1661c39 Mon Sep 17 00:00:00 2001 From: Raphael Randschau Date: Thu, 11 Aug 2022 08:19:21 -0700 Subject: [PATCH 158/230] [CLI] change cluster creation cost savings mode default (#14132) * [CLI] change cluster creation cost savings mode default instead of having customers opt-into cost savings mode, we'll ask them to opt-out of cost savings mode. 
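Illustration (not part of the patch, added for clarity): with this change, `lightning create cluster` stays in cost-savings mode by default, and the new `--enable-performance` flag opts out of it; the CLI translates the flag to `cost_savings=not enable_performance` before calling `AWSClusterManager.create`. The sketch below mirrors the updated test in tests/tests_app/cli/test_cli.py; the cluster name and role ARN are placeholder values, and authentication plus the cluster manager are mocked out, so nothing is actually created.

from unittest import mock
from unittest.mock import MagicMock

from click.testing import CliRunner

from lightning_app.cli.lightning_cli_create import create_cluster

common_args = [
    "my-cluster",  # placeholder cluster id
    "--provider", "aws",
    "--external-id", "dummy",
    "--role-arn", "arn:aws:iam::1234567890:role/lai-byoc",  # placeholder ARN
]

with mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()), mock.patch(
    "lightning_app.cli.cmd_clusters.AWSClusterManager.create"
) as create:
    # default invocation: cost-savings mode remains enabled
    result = CliRunner().invoke(create_cluster, common_args)
    assert result.exit_code == 0
    assert create.call_args[1]["cost_savings"] is True

    create.reset_mock()
    # opting out of cost savings with the new flag
    result = CliRunner().invoke(create_cluster, common_args + ["--enable-performance"])
    assert result.exit_code == 0
    assert create.call_args[1]["cost_savings"] is False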
--- src/lightning_app/CHANGELOG.md | 2 +- src/lightning_app/cli/lightning_cli_create.py | 14 +++++++------- tests/tests_app/cli/test_cli.py | 17 ++++++++++------- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 7158d1ff7a2da..2aa5c7cdd837c 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -28,7 +28,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Add support for `Lightning API` through the `configure_api` hook on the Lightning Flow and the `Post`, `Get`, `Delete`, `Put` HttpMethods ([#13945](https://github.com/Lightning-AI/lightning/pull/13945)) ### Changed -- +- Default values and parameter names for Lightning AI BYOC cluster management ([#14132](https://github.com/Lightning-AI/lightning/pull/14132)) ### Changed diff --git a/src/lightning_app/cli/lightning_cli_create.py b/src/lightning_app/cli/lightning_cli_create.py index d400db4b6f337..c9cea2a5676f9 100644 --- a/src/lightning_app/cli/lightning_cli_create.py +++ b/src/lightning_app/cli/lightning_cli_create.py @@ -33,14 +33,14 @@ def create(): help="Instance types that you want to support, for computer jobs within the cluster.", ) @click.option( - "--cost-savings", - "cost_savings", + "--enable-performance", + "enable_performance", type=bool, required=False, default=False, is_flag=True, - help=""""Use this flag to ensure that the cluster is created with a profile that is optimized for cost savings. - This makes runs cheaper but start-up times may increase.""", + help=""""Use this flag to ensure that the cluster is created with a profile that is optimized for performance. + This makes runs more expensive but start-up times decrease.""", ) @click.option( "--edit-before-creation", @@ -65,12 +65,12 @@ def create_cluster( provider: str, instance_types: str, edit_before_creation: bool, - cost_savings: bool, + enable_performance: bool, wait: bool, **kwargs, ): """Create a Lightning AI BYOC compute cluster with your cloud provider credentials.""" - if provider != "aws": + if provider.lower() != "aws": click.echo("Only AWS is supported for now. 
But support for more providers is coming soon.") return cluster_manager = AWSClusterManager() @@ -81,6 +81,6 @@ def create_cluster( external_id=external_id, instance_types=instance_types.split(",") if instance_types is not None else None, edit_before_creation=edit_before_creation, - cost_savings=cost_savings, + cost_savings=not enable_performance, wait=wait, ) diff --git a/tests/tests_app/cli/test_cli.py b/tests/tests_app/cli/test_cli.py index 8cc5dd50f836e..48e1a26bb6f2b 100644 --- a/tests/tests_app/cli/test_cli.py +++ b/tests/tests_app/cli/test_cli.py @@ -71,14 +71,17 @@ def test_main_lightning_cli_help(): @mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) @mock.patch("lightning_app.cli.cmd_clusters.AWSClusterManager.create") @pytest.mark.parametrize( - "instance_types,expected_instance_types", + "extra_arguments,expected_instance_types,expected_cost_savings_mode", [ - (["--instance-types", "t3.xlarge"], ["t3.xlarge"]), - (["--instance-types", "t3.xlarge,t3.2xlarge"], ["t3.xlarge", "t3.2xlarge"]), - ([], None), + (["--instance-types", "t3.xlarge"], ["t3.xlarge"], True), + (["--instance-types", "t3.xlarge,t3.2xlarge"], ["t3.xlarge", "t3.2xlarge"], True), + ([], None, True), + (["--enable-performance"], None, False), ], ) -def test_create_cluster(create_command: mock.MagicMock, instance_types, expected_instance_types): +def test_create_cluster( + create_command: mock.MagicMock, extra_arguments, expected_instance_types, expected_cost_savings_mode +): runner = CliRunner() runner.invoke( create_cluster, @@ -91,7 +94,7 @@ def test_create_cluster(create_command: mock.MagicMock, instance_types, expected "--role-arn", "arn:aws:iam::1234567890:role/lai-byoc", ] - + instance_types, + + extra_arguments, ) create_command.assert_called_once_with( @@ -101,7 +104,7 @@ def test_create_cluster(create_command: mock.MagicMock, instance_types, expected external_id="dummy", instance_types=expected_instance_types, edit_before_creation=False, - cost_savings=False, + cost_savings=expected_cost_savings_mode, wait=False, ) From 3b18da3eafa8ece27cde46ad978f765a1390d72d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 11 Aug 2022 17:49:46 +0200 Subject: [PATCH 159/230] Fix saving hyperparameters in a composition where parent is not a LM or LDM (#14151) Co-authored-by: Rohit Gupta --- src/pytorch_lightning/CHANGELOG.md | 4 ++++ src/pytorch_lightning/utilities/parsing.py | 17 ++++++++++++----- tests/tests_pytorch/models/test_hparams.py | 19 +++++++++++++++++++ 3 files changed, 35 insertions(+), 5 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 97bb317b02a14..04db3d1908bb2 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -70,6 +70,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061)) +- Fixed saving hyperparameters in a composition where the parent class is not a `LightningModule` or `LightningDataModule` ([#14151](https://github.com/Lightning-AI/lightning/pull/14151)) + + + ## [1.7.1] - 2022-08-09 ### Fixed diff --git a/src/pytorch_lightning/utilities/parsing.py b/src/pytorch_lightning/utilities/parsing.py index 81877f1dffba7..073423ab60773 100644 --- a/src/pytorch_lightning/utilities/parsing.py +++ b/src/pytorch_lightning/utilities/parsing.py @@ -162,7 +162,10 @@ def get_init_args(frame: types.FrameType) -> Dict[str, Any]: def collect_init_args( - frame: types.FrameType, path_args: List[Dict[str, Any]], inside: bool = False + frame: types.FrameType, + path_args: List[Dict[str, Any]], + inside: bool = False, + classes: Tuple[Type, ...] = (), ) -> List[Dict[str, Any]]: """Recursively collects the arguments passed to the child constructors in the inheritance tree. @@ -170,6 +173,7 @@ def collect_init_args( frame: the current stack frame path_args: a list of dictionaries containing the constructor args in all parent classes inside: track if we are inside inheritance path, avoid terminating too soon + classes: the classes in which to inspect the frames Return: A list of dictionaries where each dictionary contains the arguments passed to the @@ -181,13 +185,13 @@ def collect_init_args( if not isinstance(frame.f_back, types.FrameType): return path_args - if "__class__" in local_vars: + if "__class__" in local_vars and (not classes or issubclass(local_vars["__class__"], classes)): local_args = get_init_args(frame) # recursive update path_args.append(local_args) - return collect_init_args(frame.f_back, path_args, inside=True) + return collect_init_args(frame.f_back, path_args, inside=True, classes=classes) if not inside: - return collect_init_args(frame.f_back, path_args, inside) + return collect_init_args(frame.f_back, path_args, inside, classes=classes) return path_args @@ -225,7 +229,10 @@ def save_hyperparameters( init_args = {f.name: getattr(obj, f.name) for f in fields(obj)} else: init_args = {} - for local_args in collect_init_args(frame, []): + + from pytorch_lightning.core.mixins import HyperparametersMixin + + for local_args in collect_init_args(frame, [], classes=(HyperparametersMixin,)): init_args.update(local_args) if ignore is None: diff --git a/tests/tests_pytorch/models/test_hparams.py b/tests/tests_pytorch/models/test_hparams.py index c130381c7832d..84311d6f780fb 100644 --- a/tests/tests_pytorch/models/test_hparams.py +++ b/tests/tests_pytorch/models/test_hparams.py @@ -29,6 +29,7 @@ from pytorch_lightning import LightningModule, Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.core.datamodule import LightningDataModule +from pytorch_lightning.core.mixins import HyperparametersMixin from pytorch_lightning.core.saving import load_hparams_from_yaml, save_hparams_to_yaml from pytorch_lightning.demos.boring_classes import BoringDataModule, BoringModel, RandomDataset from pytorch_lightning.utilities import _OMEGACONF_AVAILABLE, AttributeDict, is_picklable @@ -399,6 +400,24 @@ def _raw_checkpoint_path(trainer) -> str: return raw_checkpoint_path +@pytest.mark.parametrize("base_class", (HyperparametersMixin, LightningModule, LightningDataModule)) +def test_save_hyperparameters_under_composition(base_class): + """Test that in a composition where the parent is not a Lightning-like module, the 
parent's arguments don't get + collected.""" + + class ChildInComposition(base_class): + def __init__(self, same_arg): + super().__init__() + self.save_hyperparameters() + + class NotPLSubclass: # intentionally not subclassing LightningModule/LightningDataModule + def __init__(self, same_arg="parent_default", other_arg="other"): + self.child = ChildInComposition(same_arg="cocofruit") + + parent = NotPLSubclass() + assert parent.child.hparams == dict(same_arg="cocofruit") + + class LocalVariableModelSuperLast(BoringModel): """This model has the super().__init__() call at the end.""" From 56533368afe14407867dc999a65b799d0f4bd89b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 11 Aug 2022 18:17:56 +0200 Subject: [PATCH 160/230] Remove DeepSpeed version restriction from Lite (#13967) --- .azure/gpu-tests.yml | 2 +- requirements/pytorch/strategies.txt | 2 +- src/pytorch_lightning/CHANGELOG.md | 3 +++ src/pytorch_lightning/lite/lite.py | 15 --------------- tests/tests_pytorch/lite/test_lite.py | 13 +------------ 5 files changed, 6 insertions(+), 29 deletions(-) diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index 68ba6974a3527..8ae670d265ced 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -75,7 +75,7 @@ jobs: CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))") pip install "bagua-cuda$CUDA_VERSION_MM>=0.9.0" pip install -e .[strategies] - pip install deepspeed>0.6.4 # TODO: remove when docker images are upgraded + pip install -U deepspeed # TODO: remove when docker images are upgraded pip install --requirement requirements/pytorch/devel.txt pip list env: diff --git a/requirements/pytorch/strategies.txt b/requirements/pytorch/strategies.txt index 4e916fbc6c61f..c5fc92a67a837 100644 --- a/requirements/pytorch/strategies.txt +++ b/requirements/pytorch/strategies.txt @@ -2,7 +2,7 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment fairscale>=0.4.5, <=0.4.6 -deepspeed>=0.6.0, <0.7.0 +deepspeed>=0.6.0, <=0.7.0 # no need to install with [pytorch] as pytorch is already installed horovod>=0.21.2, !=0.24.0, <0.25.1 hivemind>=1.0.1, <=1.0.1; sys_platform == 'linux' diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 04db3d1908bb2..6d67d2d58643a 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -22,6 +22,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Raised a `MisconfigurationException` if batch transfer hooks are overriden with `IPUAccelerator` ([13961](https://github.com/Lightning-AI/lightning/pull/13961)) +- Updated compatibility for LightningLite to run with the latest DeepSpeed 0.7.0 ([13967](https://github.com/Lightning-AI/lightning/pull/13967)) + + ### Deprecated - Deprecated `LightningDeepSpeedModule` ([#14000](https://github.com/Lightning-AI/lightning/pull/14000)) diff --git a/src/pytorch_lightning/lite/lite.py b/src/pytorch_lightning/lite/lite.py index 5125bf4486a9d..981eed30635f6 100644 --- a/src/pytorch_lightning/lite/lite.py +++ b/src/pytorch_lightning/lite/lite.py @@ -40,7 +40,6 @@ has_iterable_dataset, ) from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _RequirementAvailable from pytorch_lightning.utilities.seed import seed_everything @@ -106,8 +105,6 @@ def __init__( self._precision_plugin = self._strategy.precision_plugin self._models_setup: int = 0 - self._check_deepspeed_support() - # wrap the run method so we can inject setup logic or spawn processes for the user setattr(self, "run", partial(self._run_impl, self.run)) @@ -459,18 +456,6 @@ def _check_strategy_support(self, strategy: Optional[Union[str, Strategy]]) -> N f" Choose one of {supported} or pass in a `Strategy` instance." ) - def _check_deepspeed_support(self) -> None: - if ( - isinstance(self._strategy, DeepSpeedStrategy) - and self._strategy.zero_stage_3 - and _RequirementAvailable("deepspeed>=0.6.5") - ): - # https://github.com/microsoft/DeepSpeed/issues/2139 - raise RuntimeError( - "DeepSpeed ZeRO-3 is not supported with this version of Lightning Lite and `deepspeed>=0.6.5`." - " Please downgrade deepspeed to 0.6.4 or check if a newer version of Lightning is available." - ) - @staticmethod def _supported_device_types() -> Sequence[_AcceleratorType]: return ( diff --git a/tests/tests_pytorch/lite/test_lite.py b/tests/tests_pytorch/lite/test_lite.py index 2215ab3129780..86a0a5a82195a 100644 --- a/tests/tests_pytorch/lite/test_lite.py +++ b/tests/tests_pytorch/lite/test_lite.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import contextlib import os from copy import deepcopy from unittest import mock @@ -30,7 +29,6 @@ from pytorch_lightning.strategies import DeepSpeedStrategy, Strategy from pytorch_lightning.utilities import _StrategyType from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _RequirementAvailable from pytorch_lightning.utilities.seed import pl_worker_init_function from tests_pytorch.helpers.runif import RunIf @@ -480,13 +478,4 @@ def run(self): assert self.broadcast(True) assert self.is_global_zero == (self.local_rank == 0) - if _RequirementAvailable("deepspeed>=0.6.5"): - # https://github.com/microsoft/DeepSpeed/issues/2139 - raise_if_deepspeed_incompatible = pytest.raises( - RuntimeError, match="DeepSpeed ZeRO-3 is not supported with this version of Lightning Lite" - ) - else: - raise_if_deepspeed_incompatible = contextlib.suppress() - - with raise_if_deepspeed_incompatible: - Lite(strategy=DeepSpeedStrategy(stage=3, logging_batch_size_per_gpu=1), devices=2, accelerator="gpu").run() + Lite(strategy=DeepSpeedStrategy(stage=3, logging_batch_size_per_gpu=1), devices=2, accelerator="gpu").run() From d0f82abe35c271247d58da35442719e01a54604c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 11 Aug 2022 18:55:01 +0200 Subject: [PATCH 161/230] Configure the check-group app (#14165) Co-authored-by: Jirka --- .github/checkgroup.yml | 165 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 165 insertions(+) create mode 100644 .github/checkgroup.yml diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml new file mode 100644 index 0000000000000..8f1d3c6fb5e86 --- /dev/null +++ b/.github/checkgroup.yml @@ -0,0 +1,165 @@ +custom_service_name: "Lightning CI required checker" +subprojects: + - id: "CI: CircleCI" + paths: + - ".circleci/**" + checks: + - "test-on-tpus" + + - id: "CI: Azure" + paths: + - ".azure/**" + checks: + - "pytorch-lightning (GPUs)" + - "pytorch-lightning (GPUs) (testing PyTorch - stable)" + - "pytorch-lightning (HPUs)" + - "pytorch-lightning (IPUs)" + + - id: "pytorch_lightning" + paths: + # all examples don't need to be added because they aren't used in CI, but these are + - "examples/run_ddp_examples.sh" + - "examples/convert_from_pt_to_pl/**" + - "examples/run_pl_examples.sh" + - "examples/pl_basics/backbone_image_classifier.py" + - "examples/pl_basics/autoencoder.py" + - "examples/pl_loops/mnist_lite.py" + - "examples/pl_fault_tolerant/automatic.py" + - "examples/test_pl_examples.py" + - "examples/pl_integrations/dali_image_classifier.py" + - "requirements/pytorch/**" + - "src/pytorch_lightning/**" + - "tests/tests_pytorch/**" + - "setup.cfg" # includes pytest config + - ".github/workflows/ci-pytorch*.yml" + - ".github/workflows/docs-*.yml" + checks: + - "conda (3.8, 1.10)" + - "conda (3.8, 1.9)" + - "conda (3.9, 1.11)" + - "conda (3.9, 1.12)" + - "cpu (macOS-11, 3.10, latest, stable)" + - "cpu (macOS-11, 3.7, latest, stable)" + - "cpu (macOS-11, 3.7, oldest, stable)" + - "cpu (ubuntu-20.04, 3.10, latest, stable)" + - "cpu (ubuntu-20.04, 3.7, latest, stable)" + - "cpu (ubuntu-20.04, 3.7, oldest, stable)" + - "cpu (windows-2022, 3.10, latest, stable)" + - "cpu (windows-2022, 3.7, latest, stable)" + - "cpu (windows-2022, 3.7, oldest, stable)" + - "doctest (pytorch)" + - "make-docs (pytorch)" + - "mypy" + - "PR Gatekeeper (pytorch)" + - "pytorch-lightning (GPUs)" + - "pytorch-lightning (GPUs) (testing PyTorch - stable)" + - "pytorch-lightning (HPUs)" + - "pytorch-lightning 
(IPUs)" + - "slow (macOS-11, 3.7, 1.11)" + - "slow (ubuntu-20.04, 3.7, 1.11)" + - "slow (windows-2022, 3.7, 1.11)" + - "test-on-tpus" + + - id: "pytorch_lightning: Docs" + paths: + - "docs/source-pytorch/**" + - ".github/workflows/docs-*.yml" + - "requirements/pytorch/**" + checks: + - "doctest (pytorch)" + - "make-docs (pytorch)" + + - id: "pytorch_lightning: Docker" + paths: + - "dockers/**" + checks: + - "build-conda (3.8, 1.10)" + - "build-conda (3.8, 1.9)" + - "build-conda (3.9, 1.11)" + - "build-conda (3.9, 1.12)" + - "build-cuda (3.8, 1.9, 11.1.1)" + - "build-cuda (3.9, 1.10, 11.3.1)" + - "build-cuda (3.9, 1.11, 11.3.1)" + - "build-cuda (3.9, 1.12, 11.3.1)" + - "build-cuda (3.9, 1.9, 11.1.1)" + - "build-hpu (1.5.0, 1.11.0)" + - "build-ipu (3.9, 1.9)" + - "build-NGC" + - "build-pl (3.9, 1.10, 11.3.1)" + - "build-pl (3.9, 1.11, 11.3.1)" + - "build-pl (3.9, 1.12, 11.3.1)" + - "build-pl (3.9, 1.9, 11.1.1)" + - "build-xla (3.7, 1.12)" + + - id: "pytorch_lightning: mypy" + paths: + - ".github/workflows/code-checks.yml" + - "pyproject.toml" # includes mypy config + checks: + - "mypy" + + - id: "lightning_app" + paths: + - ".github/workflows/ci-app*.yml" + - "examples/app_**" + - "requirements/app/**" + - "src/lightning_app/**" + - "tests/tests_app/**" + - "tests/tests_app_examples/**" + - "tests/tests_clusters/**" + # the examples are used in the app CI + - "examples/app_*" + checks: + - "Cloud Test (boring_app)" + - "Cloud Test (collect_failures)" + - "Cloud Test (commands_and_api)" + - "Cloud Test (custom_work_dependencies)" + - "Cloud Test (drive)" + - "Cloud Test (idle_timeout)" + - "Cloud Test (payload)" + - "Cloud Test (template_jupyterlab)" + - "Cloud Test (template_react_ui)" + - "Cloud Test (template_streamlit_ui)" + - "Cloud Test (v0_app)" + - "doctest (app)" + - "make-docs (app)" + - "pytest (macOS-11, 3.8, latest)" + - "pytest (macOS-11, 3.8, oldest)" + - "pytest (ubuntu-20.04, 3.8, latest)" + - "pytest (ubuntu-20.04, 3.8, oldest)" + - "pytest (windows-2022, 3.8, latest)" + - "pytest (windows-2022, 3.8, oldest)" + + - id: "lightning_app: Docs" + paths: + - "docs/source-app/**" + - ".github/workflows/docs-*.yml" + - "requirements/app/**" + checks: + - "doctest (app)" + - "make-docs (app)" + + - id: "install" + paths: + - ".actions/setup_tools.py" + - ".github/workflows/ci-pkg-install.yml" + - "setup.py" + - "src/lightning/**" + # all __about__, __version__, __setup__ + - "src/*/__*.py" + checks: + - "install-meta-pypi (macOS-11, 3.8)" + - "install-meta-pypi (ubuntu-20.04, 3.8)" + - "install-meta-pypi (windows-2022, 3.8)" + - "install-meta-src (macOS-11, 3.8)" + - "install-meta-src (macOS-11, lightning, 3.8)" + - "install-meta-src (ubuntu-20.04, 3.8)" + - "install-meta-src (ubuntu-20.04, lightning, 3.8)" + - "install-meta-src (windows-2022, 3.8)" + - "install-meta-src (windows-2022, lightning, 3.8)" + - "install-standalone (macOS-11, app, 3.8)" + - "install-standalone (macOS-11, pytorch, 3.8)" + - "install-standalone (ubuntu-20.04, app, 3.8)" + - "install-standalone (ubuntu-20.04, pytorch, 3.8)" + - "install-standalone (windows-2022, app, 3.8)" + - "install-standalone (windows-2022, pytorch, 3.8)" From 31ecf9bfac32e226eb670e743c79dbceb4f88345 Mon Sep 17 00:00:00 2001 From: Raphael Randschau Date: Thu, 11 Aug 2022 11:34:24 -0700 Subject: [PATCH 162/230] [CLI] adjust command description (#14130) * adjust CLI copy Co-authored-by: RobertLaurella <99420295+RobertLaurella@users.noreply.github.com> --- src/lightning_app/cli/lightning_cli.py | 10 +++++----- 
src/lightning_app/cli/lightning_cli_create.py | 2 +- src/lightning_app/cli/lightning_cli_delete.py | 2 +- src/lightning_app/cli/lightning_cli_list.py | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/lightning_app/cli/lightning_cli.py b/src/lightning_app/cli/lightning_cli.py index 6a6e41df57026..81d2a773b4619 100644 --- a/src/lightning_app/cli/lightning_cli.py +++ b/src/lightning_app/cli/lightning_cli.py @@ -143,7 +143,7 @@ def logs(app_name: str, components: List[str], follow: bool) -> None: @_main.command() def login(): - """Log in to your Lightning.ai account.""" + """Log in to your lightning.ai account.""" auth = Auth() auth.clear() @@ -156,7 +156,7 @@ def login(): @_main.command() def logout(): - """Log out of your Lightning.ai account.""" + """Log out of your lightning.ai account.""" Auth().clear() @@ -215,7 +215,7 @@ def on_before_run(*args): @_main.group() def run(): - """Run your application.""" + """Run a Lightning application locally or on the cloud.""" @run.command("app") @@ -321,7 +321,7 @@ def stop(): @_main.group() def install(): - """Install Lightning apps and components.""" + """Install a Lightning App and/or component.""" @install.command("app") @@ -379,7 +379,7 @@ def install_component(name, yes, version): @_main.group() def init(): - """Init a Lightning app and component.""" + """Init a Lightning App and/or component.""" @init.command("app") diff --git a/src/lightning_app/cli/lightning_cli_create.py b/src/lightning_app/cli/lightning_cli_create.py index c9cea2a5676f9..7e9a6b9d2143b 100644 --- a/src/lightning_app/cli/lightning_cli_create.py +++ b/src/lightning_app/cli/lightning_cli_create.py @@ -5,7 +5,7 @@ @click.group("create") def create(): - """Create Lightning AI BYOC managed resources.""" + """Create Lightning AI self-managed resources (clusters, etc…)""" pass diff --git a/src/lightning_app/cli/lightning_cli_delete.py b/src/lightning_app/cli/lightning_cli_delete.py index c304b130bdf5d..366f4aa01e995 100644 --- a/src/lightning_app/cli/lightning_cli_delete.py +++ b/src/lightning_app/cli/lightning_cli_delete.py @@ -5,7 +5,7 @@ @click.group("delete") def delete(): - """Delete Lightning AI BYOC managed resources.""" + """Delete Lightning AI self-managed resources (clusters, etc…)""" pass diff --git a/src/lightning_app/cli/lightning_cli_list.py b/src/lightning_app/cli/lightning_cli_list.py index d0d1d34a6dd4d..7d38b5b57760f 100644 --- a/src/lightning_app/cli/lightning_cli_list.py +++ b/src/lightning_app/cli/lightning_cli_list.py @@ -6,7 +6,7 @@ @click.group(name="list") def get_list(): - """List your Lightning AI BYOC managed resources.""" + """List Lightning AI self-managed resources (clusters, etc…)""" pass From e53c4e8e6c14c92968df9bed8861e578bfe731aa Mon Sep 17 00:00:00 2001 From: Krishna Kalyan Date: Thu, 11 Aug 2022 22:10:05 +0100 Subject: [PATCH 163/230] Fix mypy errors attributed to `pytorch_lightning. 
strategies.sharded_spawn` (#14102) Co-authored-by: rohitgr7 Co-authored-by: Jirka Borovec Co-authored-by: awaelchli --- pyproject.toml | 1 - src/pytorch_lightning/overrides/base.py | 1 + src/pytorch_lightning/strategies/sharded_spawn.py | 14 +++++++++----- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b5e806bc69900..9f7cc28d0b002 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,7 +58,6 @@ module = [ "pytorch_lightning.profilers.base", "pytorch_lightning.profilers.pytorch", "pytorch_lightning.strategies.sharded", - "pytorch_lightning.strategies.sharded_spawn", "pytorch_lightning.trainer.callback_hook", "pytorch_lightning.trainer.connectors.data_connector", "pytorch_lightning.trainer.supporters", diff --git a/src/pytorch_lightning/overrides/base.py b/src/pytorch_lightning/overrides/base.py index 26c2837bda7e3..3e9fda2f966f5 100644 --- a/src/pytorch_lightning/overrides/base.py +++ b/src/pytorch_lightning/overrides/base.py @@ -75,6 +75,7 @@ def forward(self, *inputs: Any, **kwargs: Any) -> Any: trainer = pl_module._trainer if trainer is not None: + assert isinstance(self.module, (pl.LightningModule, _LightningPrecisionModuleWrapperBase)) if trainer.training: output = self.module.training_step(*inputs, **kwargs) # In manual_optimization, we need to prevent DDP reducer as diff --git a/src/pytorch_lightning/strategies/sharded_spawn.py b/src/pytorch_lightning/strategies/sharded_spawn.py index 4550e397ded80..882302e101cb6 100644 --- a/src/pytorch_lightning/strategies/sharded_spawn.py +++ b/src/pytorch_lightning/strategies/sharded_spawn.py @@ -12,13 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. from contextlib import contextmanager -from typing import Dict, Generator, List, Optional, Tuple +from typing import Any, Dict, Generator, List, Optional, Tuple from torch import Tensor from torch.nn import Module from torch.optim import Optimizer import pytorch_lightning as pl +from pytorch_lightning.overrides.base import _LightningPrecisionModuleWrapperBase from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy from pytorch_lightning.trainer.states import TrainerFn @@ -42,7 +43,9 @@ class DDPSpawnShardedStrategy(DDPSpawnStrategy): def configure_ddp(self) -> None: # set up optimizers after the wrapped module has been moved to the device + assert self.lightning_module is not None self.setup_optimizers(self.lightning_module.trainer) + assert isinstance(self.model, (pl.LightningModule, _LightningPrecisionModuleWrapperBase)) self.model, self.optimizers = self._setup_model_and_optimizers( model=LightningShardedDataParallel(self.model), optimizers=self.optimizers ) @@ -69,12 +72,13 @@ def _reinit_optimizers_with_oss(self, optimizers: List[Optimizer]) -> List["OSS" return optimizers def _wrap_optimizers(self, optimizers: List[Optimizer]) -> List["OSS"]: - if self.model is not None and self.model.trainer.state.fn != TrainerFn.FITTING: + assert self.lightning_module + if self.model is not None and self.lightning_module.trainer.state.fn != TrainerFn.FITTING: return optimizers return self._reinit_optimizers_with_oss(optimizers) - def optimizer_state(self, optimizer: "OSS") -> Optional[dict]: + def optimizer_state(self, optimizer: "OSS") -> Dict[str, Any]: if isinstance(optimizer, OSS): optimizer.consolidate_state_dict() return self._optim_state_dict(optimizer) @@ -93,7 +97,7 @@ def block_backward_sync(self) -> 
Generator: yield None @rank_zero_only - def _optim_state_dict(self, optimizer): + def _optim_state_dict(self, optimizer: Optimizer) -> Dict[str, Any]: """ Retrieves state dict only on rank 0, which contains the entire optimizer state after calling :meth:`consolidate_state_dict`. @@ -112,7 +116,7 @@ def lightning_module(self) -> Optional["pl.LightningModule"]: def pre_backward(self, closure_loss: Tensor) -> None: pass - def post_training_step(self): + def post_training_step(self) -> None: pass @classmethod From 2d9e00fab64c8b19a8646f755a95bcb092aa710f Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Fri, 12 Aug 2022 04:51:53 +0530 Subject: [PATCH 164/230] Profile batch transfer and gradient clipping hooks (#14069) --- src/pytorch_lightning/CHANGELOG.md | 3 +++ src/pytorch_lightning/core/module.py | 27 ++++++++++++------- .../plugins/precision/precision_plugin.py | 4 ++- .../trainer/connectors/data_connector.py | 22 +++++++-------- .../logger_connector/fx_validator.py | 5 ++++ .../trainer/connectors/test_data_connector.py | 26 +++++++++--------- .../trainer/logging_/test_logger_connector.py | 10 +++---- 7 files changed, 57 insertions(+), 40 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 6d67d2d58643a..409d3f51bd46f 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -11,6 +11,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added prefix to log message in `seed_everything` with rank info ([#13290](https://github.com/Lightning-AI/lightning/issues/13290)) +- Added profiling to these hooks: `on_before_batch_transfer`, `transfer_batch_to_device`, `on_after_batch_transfer`, `configure_gradient_clipping`, `clip_gradients` ([#14069](https://github.com/Lightning-AI/lightning/pull/14069)) + + - diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py index f58503edd88cb..612bcc72d2806 100644 --- a/src/pytorch_lightning/core/module.py +++ b/src/pytorch_lightning/core/module.py @@ -37,7 +37,6 @@ from pytorch_lightning.core.optimizer import LightningOptimizer from pytorch_lightning.core.saving import ModelIO from pytorch_lightning.loggers import Logger, LoggerCollection -from pytorch_lightning.trainer.connectors.data_connector import _DataHookSelector from pytorch_lightning.trainer.connectors.logger_connector.fx_validator import _FxValidator from pytorch_lightning.utilities import _IS_WINDOWS, _TORCH_GREATER_EQUAL_1_10, GradClipAlgorithmType, warnings from pytorch_lightning.utilities.apply_func import apply_to_collection, convert_to_tensors @@ -291,16 +290,24 @@ def _apply_batch_transfer_handler( self, batch: Any, device: Optional[torch.device] = None, dataloader_idx: int = 0 ) -> Any: device = device or self.device - datahook_selector = ( - _DataHookSelector(self, None) if self._trainer is None else self.trainer._data_connector._datahook_selector - ) - hook = datahook_selector.get_hook("on_before_batch_transfer") - batch = hook(batch, dataloader_idx) - hook = datahook_selector.get_hook("transfer_batch_to_device") - batch = hook(batch, device, dataloader_idx) - hook = datahook_selector.get_hook("on_after_batch_transfer") - batch = hook(batch, dataloader_idx) + def call_hook(hook_name, *args): + if self._trainer: + datahook_selector = self._trainer._data_connector._datahook_selector + obj = datahook_selector.get_instance(hook_name) + trainer_method = ( + self._trainer._call_lightning_module_hook + if isinstance(obj, self.__class__) + else 
self._trainer._call_lightning_datamodule_hook + ) + return trainer_method(hook_name, *args) + else: + hook = getattr(self, hook_name) + return hook(*args) + + batch = call_hook("on_before_batch_transfer", batch, dataloader_idx) + batch = call_hook("transfer_batch_to_device", batch, device, dataloader_idx) + batch = call_hook("on_after_batch_transfer", batch, dataloader_idx) return batch def print(self, *args, **kwargs) -> None: diff --git a/src/pytorch_lightning/plugins/precision/precision_plugin.py b/src/pytorch_lightning/plugins/precision/precision_plugin.py index 60dfb1ab6c92f..285a0f31e3955 100644 --- a/src/pytorch_lightning/plugins/precision/precision_plugin.py +++ b/src/pytorch_lightning/plugins/precision/precision_plugin.py @@ -182,7 +182,9 @@ def _clip_gradients( if not isinstance(model, pl.LightningModule) or not model.automatic_optimization: # the configuration validator disallows clipping on manual return - model.configure_gradient_clipping( + + model.trainer._call_lightning_module_hook( + "configure_gradient_clipping", optimizer, optimizer_idx, gradient_clip_val=clip_val, diff --git a/src/pytorch_lightning/trainer/connectors/data_connector.py b/src/pytorch_lightning/trainer/connectors/data_connector.py index e1aca404722db..1de8bee90d18f 100644 --- a/src/pytorch_lightning/trainer/connectors/data_connector.py +++ b/src/pytorch_lightning/trainer/connectors/data_connector.py @@ -14,7 +14,7 @@ import multiprocessing import os from dataclasses import dataclass, field -from typing import Any, Callable, Collection, List, Optional, Tuple, Union +from typing import Any, Collection, List, Optional, Tuple, Union from weakref import proxy from torch.utils.data import BatchSampler, DataLoader, Sampler, SequentialSampler @@ -527,16 +527,16 @@ def is_module(self) -> bool: @dataclass class _DataHookSelector: - """Stores the info about the shared DataHooks within LightningModule and LightningDataModule. + """Stores the info about the shared DataHooks within ``LightningModule`` and ``LightningDataModule``. - The hook source can be + The hook source can be: - 1. a method from the :class:`~pytorch_lightning.core.module.LightningModule`, - 2. a method from the :class:`~pytorch_lightning.core.datamodule.LightningDataModule`, + 1. the :class:`~pytorch_lightning.core.module.LightningModule`, + 2. the :class:`~pytorch_lightning.core.datamodule.LightningDataModule`, Arguments: - model: A LightningModule - datamodule: A LightningDataModule + model: A ``LightningModule`` + datamodule: A ``LightningDataModule`` """ model: "pl.LightningModule" @@ -545,7 +545,7 @@ class _DataHookSelector: default=("on_before_batch_transfer", "transfer_batch_to_device", "on_after_batch_transfer") ) - def get_hook(self, hook_name: str) -> Callable: + def get_instance(self, hook_name: str) -> Union["pl.LightningModule", "pl.LightningDataModule"]: if hook_name not in self._valid_hooks: raise ValueError( f"`{hook_name}` is not a shared hook within `LightningModule` and `LightningDataModule`." @@ -553,7 +553,7 @@ def get_hook(self, hook_name: str) -> Callable: ) if self.datamodule is None: - return getattr(self.model, hook_name) + return self.model if is_overridden(hook_name, self.datamodule): if is_overridden(hook_name, self.model): @@ -561,11 +561,11 @@ def get_hook(self, hook_name: str) -> Callable: f"You have overridden `{hook_name}` in both `LightningModule` and `LightningDataModule`." " It will use the implementation from `LightningDataModule` instance." 
) - return getattr(self.datamodule, hook_name) + return self.datamodule if is_overridden(hook_name, self.model): warning_cache.warn( f"You have overridden `{hook_name}` in `LightningModule` but have passed in a" " `LightningDataModule`. It will use the implementation from `LightningModule` instance." ) - return getattr(self.model, hook_name) + return self.model diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py b/src/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py index 6f60ba6f1aa2f..56ad53ef4ba04 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py @@ -44,6 +44,8 @@ class _LogOptions(TypedDict): allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False ), "lr_scheduler_step": None, + "configure_gradient_clipping": None, + "clip_gradients": None, "on_before_zero_grad": _LogOptions( allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False ), @@ -98,6 +100,9 @@ class _LogOptions(TypedDict): "on_epoch_end": _LogOptions( allowed_on_step=(False,), allowed_on_epoch=(True,), default_on_step=False, default_on_epoch=True ), + "on_before_batch_transfer": None, + "transfer_batch_to_device": None, + "on_after_batch_transfer": None, "on_batch_start": _LogOptions( allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False ), diff --git a/tests/tests_pytorch/trainer/connectors/test_data_connector.py b/tests/tests_pytorch/trainer/connectors/test_data_connector.py index 2650e46b7fa60..7273d7719834e 100644 --- a/tests/tests_pytorch/trainer/connectors/test_data_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_data_connector.py @@ -471,34 +471,34 @@ def test_no_datamodule_no_overridden(self, hook_name): model, _, trainer = self.reset_instances() trainer._data_connector.attach_datamodule(model, datamodule=None) with no_warning_call(match=f"have overridden `{hook_name}` in"): - hook = trainer._data_connector._datahook_selector.get_hook(hook_name) + instance = trainer._data_connector._datahook_selector.get_instance(hook_name) - assert hook == getattr(model, hook_name) + assert instance is model def test_with_datamodule_no_overridden(self, hook_name): model, dm, trainer = self.reset_instances() trainer._data_connector.attach_datamodule(model, datamodule=dm) with no_warning_call(match=f"have overridden `{hook_name}` in"): - hook = trainer._data_connector._datahook_selector.get_hook(hook_name) + instance = trainer._data_connector._datahook_selector.get_instance(hook_name) - assert hook == getattr(model, hook_name) + assert instance is model def test_override_model_hook(self, hook_name): model, dm, trainer = self.reset_instances() trainer._data_connector.attach_datamodule(model, datamodule=dm) with no_warning_call(match=f"have overridden `{hook_name}` in"): - hook = trainer._data_connector._datahook_selector.get_hook(hook_name) + instance = trainer._data_connector._datahook_selector.get_instance(hook_name) - assert hook == getattr(model, hook_name) + assert instance is model def test_override_datamodule_hook(self, hook_name): model, dm, trainer = self.reset_instances() trainer._data_connector.attach_datamodule(model, datamodule=dm) setattr(dm, hook_name, self.overridden_func) with no_warning_call(match=f"have overridden `{hook_name}` in"): - hook = 
trainer._data_connector._datahook_selector.get_hook(hook_name) + instance = trainer._data_connector._datahook_selector.get_instance(hook_name) - assert hook == getattr(dm, hook_name) + assert instance is dm def test_override_both_model_and_datamodule(self, hook_name): model, dm, trainer = self.reset_instances() @@ -506,24 +506,24 @@ def test_override_both_model_and_datamodule(self, hook_name): setattr(model, hook_name, self.overridden_func) setattr(dm, hook_name, self.overridden_func) with pytest.warns(UserWarning, match=f"have overridden `{hook_name}` in both"): - hook = trainer._data_connector._datahook_selector.get_hook(hook_name) + instance = trainer._data_connector._datahook_selector.get_instance(hook_name) - assert hook == getattr(dm, hook_name) + assert instance is dm def test_with_datamodule_override_model(self, hook_name): model, dm, trainer = self.reset_instances() trainer._data_connector.attach_datamodule(model, datamodule=dm) setattr(model, hook_name, self.overridden_func) with pytest.warns(UserWarning, match=f"have overridden `{hook_name}` in `LightningModule`"): - hook = trainer._data_connector._datahook_selector.get_hook(hook_name) + instance = trainer._data_connector._datahook_selector.get_instance(hook_name) - assert hook == getattr(model, hook_name) + assert instance is model def test_invalid_hook_passed_in_datahook_selector(): dh_selector = _DataHookSelector(BoringModel(), None) with pytest.raises(ValueError, match="is not a shared hook"): - dh_selector.get_hook("setup") + dh_selector.get_instance("setup") def test_eval_distributed_sampler_warning(tmpdir): diff --git a/tests/tests_pytorch/trainer/logging_/test_logger_connector.py b/tests/tests_pytorch/trainer/logging_/test_logger_connector.py index 760e8eea2a85c..c2be22c61244b 100644 --- a/tests/tests_pytorch/trainer/logging_/test_logger_connector.py +++ b/tests/tests_pytorch/trainer/logging_/test_logger_connector.py @@ -187,11 +187,6 @@ def __init__(self, not_supported): { "log", "log_dict", - # the following are problematic as they do have `self._current_fx_name` defined some times but - # not others depending on where they were called. 
So we cannot reliably `self.log` in them - "on_before_batch_transfer", - "transfer_batch_to_device", - "on_after_batch_transfer", } ) # remove `nn.Module` hooks @@ -227,6 +222,9 @@ def test_fx_validator_integration(tmpdir): "on_pretrain_routine_end": "You can't", "train_dataloader": "You can't", "val_dataloader": "You can't", + "on_before_batch_transfer": "You can't", + "transfer_batch_to_device": "You can't", + "on_after_batch_transfer": "You can't", "on_validation_end": "You can't", "on_train_end": "You can't", "on_fit_end": "You can't", @@ -238,6 +236,8 @@ def test_fx_validator_integration(tmpdir): "on_validation_model_eval": "You can't", "on_validation_model_train": "You can't", "lr_scheduler_step": "You can't", + "configure_gradient_clipping": "You can't", + "clip_gradients": "You can't", "on_save_checkpoint": "You can't", "on_load_checkpoint": "You can't", "on_exception": "You can't", From 6789a066b52ebbcc94048cac72c0de9b350e74e5 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Fri, 12 Aug 2022 12:52:24 +0530 Subject: [PATCH 165/230] Avoid false positive warning about using `sync_dist` when using torchmetrics (#14143) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- src/pytorch_lightning/CHANGELOG.md | 4 +++- .../connectors/logger_connector/result.py | 2 +- .../core/test_metric_result_integration.py | 22 ++++++++++++++----- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 409d3f51bd46f..714d4340f1ba1 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -70,6 +70,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed the device placement when `LightningModule.cuda()` gets called without specifying a device index and the current cuda device was not 0 ([#14128](https://github.com/Lightning-AI/lightning/pull/14128)) +- Avoided false positive warning about using `sync_dist` when using torchmetrics ([#14143](https://github.com/Lightning-AI/lightning/pull/14143)) + + - Avoid `metadata.entry_points` deprecation warning on Python 3.10 ([#14052](https://github.com/Lightning-AI/lightning/pull/14052)) @@ -79,7 +82,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed saving hyperparameters in a composition where the parent class is not a `LightningModule` or `LightningDataModule` ([#14151](https://github.com/Lightning-AI/lightning/pull/14151)) - ## [1.7.1] - 2022-08-09 ### Fixed diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py index 9eb88fda4891e..a28599b5f20be 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -525,7 +525,7 @@ def _get_cache(result_metric: _ResultMetric, on_step: bool) -> Optional[Tensor]: elif not on_step and result_metric.meta.on_epoch: if result_metric._computed is None: should = result_metric.meta.sync.should - if not result_metric.meta.sync.should and distributed_available(): + if not should and distributed_available() and result_metric.is_tensor: # ensure sync happens for FT since during a failure, the metrics are synced and saved to the # checkpoint, so during restart, metrics on rank 0 are from the accumulated ones from the previous # run, and on other ranks, they are 0. 
So we need to make sure they are synced in further training diff --git a/tests/tests_pytorch/core/test_metric_result_integration.py b/tests/tests_pytorch/core/test_metric_result_integration.py index cb8a51c5bf9ba..9672bb75b51f1 100644 --- a/tests/tests_pytorch/core/test_metric_result_integration.py +++ b/tests/tests_pytorch/core/test_metric_result_integration.py @@ -21,9 +21,11 @@ import torch import torch.distributed as dist import torch.multiprocessing as mp +import torchmetrics from torch.nn import ModuleDict, ModuleList from torchmetrics import Metric, MetricCollection +import pytorch_lightning as pl import tests_pytorch.helpers.utils as tutils from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint @@ -666,14 +668,22 @@ def on_train_start(self): @pytest.mark.parametrize("distributed_env", [True, False]) -def test_logger_sync_dist(distributed_env): - # self.log('bar', 7, ..., sync_dist=False) +@pytest.mark.parametrize("log_val", [torch.tensor(0.5), torchmetrics.Accuracy()]) +def test_logger_sync_dist(distributed_env, log_val): + pl.trainer.connectors.logger_connector.result.warning_cache.clear() + + # self.log('bar', 0.5, ..., sync_dist=False) meta = _Metadata("foo", "bar") meta.sync = _Sync(_should=False) - result_metric = _ResultMetric(metadata=meta, is_tensor=True) - result_metric.update(torch.tensor(7.0), 10) + is_tensor = isinstance(log_val, torch.Tensor) + + if not is_tensor: + log_val.update(torch.tensor([0, 1]), torch.tensor([0, 0], dtype=torch.long)) + + result_metric = _ResultMetric(metadata=meta, is_tensor=is_tensor) + result_metric.update(log_val, 10) - warning_ctx = pytest.warns if distributed_env else no_warning_call + warning_ctx = pytest.warns if distributed_env and is_tensor else no_warning_call with mock.patch( "pytorch_lightning.trainer.connectors.logger_connector.result.distributed_available", @@ -681,4 +691,4 @@ def test_logger_sync_dist(distributed_env): ): with warning_ctx(PossibleUserWarning, match=r"recommended to use `self.log\('bar', ..., sync_dist=True\)`"): value = _ResultCollection._get_cache(result_metric, on_step=False) - assert value == 7.0 + assert value == 0.5 From 807f9d8c9652f989edda80de226cf2078cd141c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Fri, 12 Aug 2022 10:24:04 +0200 Subject: [PATCH 166/230] Replace unwrapping logic in strategies (#13738) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí Co-authored-by: Rohit Gupta --- src/pytorch_lightning/CHANGELOG.md | 10 +++ src/pytorch_lightning/overrides/base.py | 78 +++++++++++++++---- .../overrides/data_parallel.py | 24 ++++-- .../overrides/distributed.py | 11 ++- src/pytorch_lightning/overrides/fairscale.py | 29 +++++-- .../plugins/precision/sharded_native_amp.py | 2 +- src/pytorch_lightning/strategies/bagua.py | 27 +++---- src/pytorch_lightning/strategies/ddp.py | 8 +- src/pytorch_lightning/strategies/deepspeed.py | 18 ++--- src/pytorch_lightning/strategies/ipu.py | 8 +- src/pytorch_lightning/strategies/parallel.py | 5 -- src/pytorch_lightning/strategies/sharded.py | 25 +++--- .../strategies/sharded_spawn.py | 26 +++---- src/pytorch_lightning/strategies/strategy.py | 11 +-- src/pytorch_lightning/strategies/tpu_spawn.py | 10 +-- src/pytorch_lightning/trainer/trainer.py | 8 +- tests/tests_pytorch/accelerators/test_ipu.py | 50 ++++++------ .../deprecated_api/test_remove_1-10.py | 44 +++++++++++ tests/tests_pytorch/helpers/runif.py | 2 +- 
tests/tests_pytorch/models/test_amp.py | 9 --- tests/tests_pytorch/overrides/test_base.py | 3 +- .../precision/test_sharded_precision.py | 2 +- .../strategies/test_sharded_strategy.py | 10 +-- .../connectors/test_callback_connector.py | 8 +- .../trainer/flags/test_overfit_batches.py | 2 +- tests/tests_pytorch/utilities/test_imports.py | 3 +- 26 files changed, 274 insertions(+), 159 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 714d4340f1ba1..4f986257f33ed 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -28,6 +28,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Updated compatibility for LightningLite to run with the latest DeepSpeed 0.7.0 ([13967](https://github.com/Lightning-AI/lightning/pull/13967)) +- Replaced the unwrapping logic in strategies with direct access to unwrapped `LightningModule` ([#13738](https://github.com/Lightning-AI/lightning/pull/13738)) + + ### Deprecated - Deprecated `LightningDeepSpeedModule` ([#14000](https://github.com/Lightning-AI/lightning/pull/14000)) @@ -39,6 +42,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Deprecated the calls to `pytorch_lightning.utiltiies.meta` functions in favor of built-in https://github.com/pytorch/torchdistx support ([#13868](https://github.com/Lightning-AI/lightning/pull/13868)) +- Deprecated the `unwrap_lightning_module` and `unwrap_lightning_module_sharded` utility functions in favor of accessing the unwrapped `LightningModule` on the strategy directly ([#13738](https://github.com/Lightning-AI/lightning/pull/13738)) + + +- Deprecated the `pl_module` argument in `LightningParallelModule`, `LightningDistributedModule`, `LightningShardedDataParallel`, `LightningBaguaModule` and `LightningDeepSpeedModule` wrapper classes ([#13738](https://github.com/Lightning-AI/lightning/pull/13738)) + + + ### Removed - Removed the deprecated `Trainer.training_type_plugin` property in favor of `Trainer.strategy` ([#14011](https://github.com/Lightning-AI/lightning/pull/14011)) diff --git a/src/pytorch_lightning/overrides/base.py b/src/pytorch_lightning/overrides/base.py index 3e9fda2f966f5..07f30c271b207 100644 --- a/src/pytorch_lightning/overrides/base.py +++ b/src/pytorch_lightning/overrides/base.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from typing import Any, Union +from typing import Any, Optional, Union import torch import torch.nn as nn @@ -20,6 +20,7 @@ import pytorch_lightning as pl from pytorch_lightning.core.mixins import DeviceDtypeModuleMixin +from pytorch_lightning.utilities import rank_zero_deprecation class _LightningPrecisionModuleWrapperBase(DeviceDtypeModuleMixin, torch.nn.Module): @@ -54,30 +55,47 @@ def forward(self, *args: Any, **kwargs: Any) -> Any: class _LightningModuleWrapperBase(DeviceDtypeModuleMixin, torch.nn.Module): - def __init__(self, pl_module: Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]) -> None: + def __init__( + self, forward_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]] + ) -> None: """Wraps the user's LightningModule and redirects the forward call to the appropriate method, either ``training_step``, ``validation_step``, ``test_step``, or ``predict_step``. 
Inheriting classes may also modify the inputs or outputs of forward. Args: - pl_module: the model to wrap + forward_module: The module to wrap. If it's not a LightningModule, it must have an attribute ``.module`` + pointing to a LightningModule reference. """ super().__init__() - self.module = pl_module + if not isinstance(forward_module, pl.LightningModule) and ( + not isinstance(getattr(forward_module, "module", None), pl.LightningModule) + ): + raise ValueError( + "`forward_module` must be a `LightningModule` instance or have an attribute `.module` pointing to one," + f" got: {forward_module.__class__.__qualname__}" + ) + # TODO: In v1.10.0, remove the Optional type from forward_module and remove the assertion + assert forward_module is not None + self._forward_module = forward_module # set the parameters_to_ignore from LightningModule. - _ddp_params_and_buffers_to_ignore = getattr(pl_module, "_ddp_params_and_buffers_to_ignore", []) + _ddp_params_and_buffers_to_ignore = getattr(self._forward_module, "_ddp_params_and_buffers_to_ignore", []) self._ddp_params_and_buffers_to_ignore = [f"module.{p}" for p in _ddp_params_and_buffers_to_ignore] + @property + def lightning_module(self) -> "pl.LightningModule": + if isinstance(self._forward_module, pl.LightningModule): + return self._forward_module + return self._forward_module.module + def forward(self, *inputs: Any, **kwargs: Any) -> Any: - pl_module = unwrap_lightning_module(self.module) + pl_module = self.lightning_module trainer = pl_module._trainer if trainer is not None: - assert isinstance(self.module, (pl.LightningModule, _LightningPrecisionModuleWrapperBase)) if trainer.training: - output = self.module.training_step(*inputs, **kwargs) + output = self._forward_module.training_step(*inputs, **kwargs) # In manual_optimization, we need to prevent DDP reducer as # it is done manually in `LightningModule.manual_backward` # `require_backward_grad_sync` will be reset in the @@ -86,27 +104,53 @@ def forward(self, *inputs: Any, **kwargs: Any) -> Any: trainer.model.require_backward_grad_sync = False # type: ignore[assignment] return output if trainer.testing: - return self.module.test_step(*inputs, **kwargs) + return self._forward_module.test_step(*inputs, **kwargs) if trainer.sanity_checking or trainer.validating: - return self.module.validation_step(*inputs, **kwargs) + return self._forward_module.validation_step(*inputs, **kwargs) if trainer.predicting: - return self.module.predict_step(*inputs, **kwargs) - return self.module(*inputs, **kwargs) - - -def unwrap_lightning_module(wrapped_model: nn.Module) -> "pl.LightningModule": + return self._forward_module.predict_step(*inputs, **kwargs) + return self._forward_module(*inputs, **kwargs) + + @classmethod + def _validate_init_arguments( + cls, + pl_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]] = None, + forward_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]] = None, + ) -> None: + # TODO: In v1.10.0, remove this method and mark the forward_module init argument in all subclasses as required + if pl_module is not None: + rank_zero_deprecation( + f"The argument `pl_module` in `{cls.__name__}` is deprecated in v1.8.0 and will be removed in" + " v1.10.0. Please use `forward_module` instead." 
+ ) + elif forward_module is None: + raise ValueError("Argument `forward_module` is required.") + + +def unwrap_lightning_module(wrapped_model: nn.Module, _suppress_warning: bool = False) -> "pl.LightningModule": """Recursively unwraps a :class:`~pytorch_lightning.core.module.LightningModule` by following the ``.module`` attributes on the wrapper. + .. deprecated:: v1.8.0 + The function ``unwrap_lightning_module`` is deprecated in v1.8.0 and will be removed in v1.10.0. Access the + ``LightningModule`` directly through the strategy attribute ``Strategy.lightning_module``. + Raises: TypeError: If the unwrapping leads to a module that is not a LightningModule and that cannot be unwrapped further. """ + if not _suppress_warning: + rank_zero_deprecation( + "The function `unwrap_lightning_module` is deprecated in v1.8.0 and will be removed in v1.10.0. Access the" + " `LightningModule` directly through the strategy attribute `Strategy.lightning_module`." + ) model = wrapped_model if isinstance(model, (DistributedDataParallel, DataParallel)): model = unwrap_lightning_module(model.module) - if isinstance(model, (_LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase)): - model = unwrap_lightning_module(model.module) + if isinstance(model, _LightningModuleWrapperBase): + model = model.lightning_module + if isinstance(model, _LightningPrecisionModuleWrapperBase): + model = model.module if not isinstance(model, pl.LightningModule): raise TypeError(f"Unwrapping the module did not yield a `LightningModule`, got {type(model)} instead.") return model diff --git a/src/pytorch_lightning/overrides/data_parallel.py b/src/pytorch_lightning/overrides/data_parallel.py index 9fa253b9d8321..98d23cee391bc 100644 --- a/src/pytorch_lightning/overrides/data_parallel.py +++ b/src/pytorch_lightning/overrides/data_parallel.py @@ -13,7 +13,7 @@ # limitations under the License. import numbers import warnings -from typing import Any, cast, Union +from typing import Any, cast, Optional, Union import torch from torch import Tensor @@ -52,11 +52,23 @@ class LightningParallelModule(_LightningModuleWrapperBase): ) Args: - pl_module: the model to wrap + pl_module: The module to wrap. See description for `forward_module`. + + .. deprecated:: v1.8.0 + The argument ``pl_module`` is deprecated in v1.8.0 and will be removed in v1.10.0. Please use + ``forward_module`` instead. + + forward_module: The module to wrap. If it's not a ``LightningModule``, it must have an attribute ``.module`` + pointing to a ``LightningModule`` reference. 
""" - def __init__(self, pl_module: Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]) -> None: - super().__init__(pl_module) + def __init__( + self, + forward_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]] = None, + pl_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]] = None, + ) -> None: + self._validate_init_arguments(pl_module, forward_module) + super().__init__(forward_module=(pl_module or forward_module)) _ignore_scalar_return_in_dp() def forward(self, *inputs: Any, **kwargs: Any) -> Any: @@ -65,7 +77,7 @@ def forward(self, *inputs: Any, **kwargs: Any) -> Any: output = super().forward(*inputs, **kwargs) def output_transform(data: Any) -> Any: - device = cast(torch.device, self.module.device) + device = cast(torch.device, self.lightning_module.device) data = python_scalar_to_tensor(data, device) data = unsqueeze_scalar_tensor(data) return data @@ -95,7 +107,7 @@ def find_tensor_with_device(tensor: Tensor) -> Tensor: if replica_device is not None: # by calling .to() we force the update to the self.device property - self.module.to(device=replica_device) + self._forward_module.to(device=replica_device) else: rank_zero_warn( "Could not determine on which device the inputs are." diff --git a/src/pytorch_lightning/overrides/distributed.py b/src/pytorch_lightning/overrides/distributed.py index 929d1ed486f4a..3ecac8c1eea04 100644 --- a/src/pytorch_lightning/overrides/distributed.py +++ b/src/pytorch_lightning/overrides/distributed.py @@ -19,12 +19,19 @@ from torch.nn.parallel import DistributedDataParallel from torch.utils.data import BatchSampler, Dataset, DistributedSampler, Sampler -from pytorch_lightning.overrides.base import _LightningModuleWrapperBase +import pytorch_lightning as pl +from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase from pytorch_lightning.utilities.exceptions import MisconfigurationException class LightningDistributedModule(_LightningModuleWrapperBase): - ... + def __init__( + self, + forward_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]] = None, + pl_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]] = None, + ) -> None: + self._validate_init_arguments(pl_module, forward_module) + super().__init__(forward_module=(pl_module or forward_module)) def _find_tensors( diff --git a/src/pytorch_lightning/overrides/fairscale.py b/src/pytorch_lightning/overrides/fairscale.py index f48fa8dcf9ccf..d9fd2e60aff61 100644 --- a/src/pytorch_lightning/overrides/fairscale.py +++ b/src/pytorch_lightning/overrides/fairscale.py @@ -11,27 +11,44 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+from typing import Optional, Union + import torch.nn as nn import pytorch_lightning as pl -from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, unwrap_lightning_module -from pytorch_lightning.utilities import _IS_WINDOWS, _module_available +from pytorch_lightning.overrides.base import ( + _LightningModuleWrapperBase, + _LightningPrecisionModuleWrapperBase, + unwrap_lightning_module, +) +from pytorch_lightning.utilities import rank_zero_deprecation +from pytorch_lightning.utilities.imports import _IS_WINDOWS, _module_available _FAIRSCALE_AVAILABLE = not _IS_WINDOWS and _module_available("fairscale.nn") -if _FAIRSCALE_AVAILABLE: + +if _FAIRSCALE_AVAILABLE: # pragma: no-cover from fairscale.nn.data_parallel.sharded_ddp import ShardedDataParallel class LightningShardedDataParallel(_LightningModuleWrapperBase): - # Just do this for later docstrings - pass + def __init__( + self, + forward_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]] = None, + pl_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]] = None, + ) -> None: + self._validate_init_arguments(pl_module, forward_module) + super().__init__(forward_module=(pl_module or forward_module)) def unwrap_lightning_module_sharded(wrapped_model: nn.Module) -> "pl.LightningModule": + rank_zero_deprecation( + "The function `unwrap_lightning_module_sharded` is deprecated in v1.8.0 and will be removed in v1.10.0." + " Access the `LightningModule` directly through the strategy attribute `Strategy.lightning_module`." + ) model = wrapped_model if isinstance(model, ShardedDataParallel): model = model.module - return unwrap_lightning_module(model) + return unwrap_lightning_module(model, _suppress_warning=True) else: LightningShardedDataParallel = ... 
# type: ignore[assignment,misc] diff --git a/src/pytorch_lightning/plugins/precision/sharded_native_amp.py b/src/pytorch_lightning/plugins/precision/sharded_native_amp.py index f5646c2094253..570e25bd85caa 100644 --- a/src/pytorch_lightning/plugins/precision/sharded_native_amp.py +++ b/src/pytorch_lightning/plugins/precision/sharded_native_amp.py @@ -15,9 +15,9 @@ import torch -from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.imports import _FAIRSCALE_AVAILABLE if _FAIRSCALE_AVAILABLE: from fairscale.optim import OSS diff --git a/src/pytorch_lightning/strategies/bagua.py b/src/pytorch_lightning/strategies/bagua.py index d100d1aa97adc..f08d1aebf1b7c 100644 --- a/src/pytorch_lightning/strategies/bagua.py +++ b/src/pytorch_lightning/strategies/bagua.py @@ -7,11 +7,7 @@ from torch.nn import Module import pytorch_lightning as pl -from pytorch_lightning.overrides.base import ( - _LightningModuleWrapperBase, - _LightningPrecisionModuleWrapperBase, - unwrap_lightning_module, -) +from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin @@ -54,10 +50,16 @@ class LightningBaguaModule(_LightningModuleWrapperBase): - def __init__(self, pl_module: Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]) -> None: - super().__init__(pl_module) + def __init__( + self, + forward_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]] = None, + pl_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]] = None, + ) -> None: + self._validate_init_arguments(pl_module, forward_module) + forward_module = pl_module or forward_module + super().__init__(forward_module=forward_module) # Bagua use `bagua_module_name` to distinguish different modules - self._bagua_module_name = f"{pl_module.__class__.__name__}{id(pl_module)}" + self._bagua_module_name = f"{forward_module.__class__.__name__}{id(forward_module)}" class BaguaStrategy(DDPStrategy): @@ -109,13 +111,6 @@ def __init__( self._bagua_flatten = flatten self._bagua_kwargs = bagua_kwargs - @property - def lightning_module(self) -> Optional["pl.LightningModule"]: - model = self.model - if isinstance(model, BaguaDistributedDataParallel): - model = model.module - return unwrap_lightning_module(model) if model is not None else None - def setup_distributed(self) -> None: reset_seed() @@ -190,7 +185,7 @@ def _check_qadam_optimizer(self) -> None: def _configure_bagua_model(self, trainer: "pl.Trainer") -> None: model = LightningBaguaModule(self.model) # type: ignore[arg-type] - self._model = self._setup_model(model) + self.model = self._setup_model(model) # start the background communication for async algorithm if trainer.training and self._bagua_algorithm == "async": diff --git a/src/pytorch_lightning/strategies/ddp.py b/src/pytorch_lightning/strategies/ddp.py index 57ab3a151b011..f4f5397a78bca 100644 --- a/src/pytorch_lightning/strategies/ddp.py +++ b/src/pytorch_lightning/strategies/ddp.py @@ -34,7 +34,6 @@ from pytorch_lightning.overrides import LightningDistributedModule from pytorch_lightning.overrides.base 
import _LightningPrecisionModuleWrapperBase from pytorch_lightning.overrides.distributed import prepare_for_backward -from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin @@ -55,7 +54,12 @@ sync_ddp_if_available, ) from pytorch_lightning.utilities.exceptions import DeadlockDetectedException -from pytorch_lightning.utilities.imports import _IS_WINDOWS, _TORCH_GREATER_EQUAL_1_10, _TORCH_GREATER_EQUAL_1_11 +from pytorch_lightning.utilities.imports import ( + _FAIRSCALE_AVAILABLE, + _IS_WINDOWS, + _TORCH_GREATER_EQUAL_1_10, + _TORCH_GREATER_EQUAL_1_11, +) from pytorch_lightning.utilities.optimizer import optimizers_to_device from pytorch_lightning.utilities.rank_zero import rank_zero_info, rank_zero_only, rank_zero_warn from pytorch_lightning.utilities.seed import reset_seed diff --git a/src/pytorch_lightning/strategies/deepspeed.py b/src/pytorch_lightning/strategies/deepspeed.py index 8acbc80257bd1..4a70eb983fd86 100644 --- a/src/pytorch_lightning/strategies/deepspeed.py +++ b/src/pytorch_lightning/strategies/deepspeed.py @@ -77,10 +77,14 @@ class LightningDeepSpeedModule(_LightningModuleWrapperBase): """ def __init__( - self, pl_module: Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase], precision: Union[str, int] + self, + forward_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]] = None, + precision: Union[str, int] = 32, + pl_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]] = None, ) -> None: rank_zero_deprecation("`LightningDeepSpeedModule` has been deprecated in v1.7.1 and will be removed in v1.9.0") - super().__init__(pl_module) + self._validate_init_arguments(pl_module, forward_module) + super().__init__(forward_module=(pl_module or forward_module)) self.precision = precision def forward(self, *inputs: Any, **kwargs: Any) -> Any: @@ -485,7 +489,7 @@ def init_deepspeed(self) -> None: ) assert isinstance(self.model, (pl.LightningModule, _LightningPrecisionModuleWrapperBase)) - model = _LightningModuleWrapperBase(pl_module=self.model) + model = _LightningModuleWrapperBase(forward_module=self.model) if self.lightning_module.trainer and self.lightning_module.trainer.training: self._initialize_deepspeed_train(model) @@ -611,14 +615,6 @@ def _initialize_deepspeed_inference(self, model: Module) -> None: ) self.model = model - @property - def lightning_module(self) -> Optional["pl.LightningModule"]: - # the model may not be wrapped with DeepEngine & _LightningModuleWrapperBase if calling this too early - module = getattr(self.model, "module", self.model) - module = module.module if isinstance(module, _LightningModuleWrapperBase) else module - assert isinstance(module, pl.LightningModule) or module is None - return module - @property def distributed_sampler_kwargs(self) -> Dict[str, int]: distributed_sampler_kwargs = dict(num_replicas=self.world_size, rank=self.global_rank) diff --git a/src/pytorch_lightning/strategies/ipu.py b/src/pytorch_lightning/strategies/ipu.py index 4bedbfd6d70fc..f56c095dc12c1 100644 --- a/src/pytorch_lightning/strategies/ipu.py +++ b/src/pytorch_lightning/strategies/ipu.py @@ -51,10 +51,14 @@ class LightningIPUModule(_LightningModuleWrapperBase): """ def __init__( - self, pl_module: Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase], 
precision: Union[str, int] + self, + forward_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]] = None, + precision: Union[str, int] = 32, + pl_module: Optional[Union["pl.LightningModule", _LightningPrecisionModuleWrapperBase]] = None, ) -> None: rank_zero_deprecation("`LightningIPUModule` has been deprecated in v1.7.0 and will be removed in v1.8.0") - super().__init__(pl_module) + self._validate_init_arguments(pl_module, forward_module) + super().__init__(forward_module=(pl_module or forward_module)) self.precision = precision def forward(self, *inputs: Any, **kwargs: Any) -> Any: diff --git a/src/pytorch_lightning/strategies/parallel.py b/src/pytorch_lightning/strategies/parallel.py index 2517848274e3d..9d469313103a1 100644 --- a/src/pytorch_lightning/strategies/parallel.py +++ b/src/pytorch_lightning/strategies/parallel.py @@ -19,7 +19,6 @@ from torch import Tensor import pytorch_lightning as pl -from pytorch_lightning.overrides.base import unwrap_lightning_module from pytorch_lightning.plugins import LayerSync from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO @@ -55,10 +54,6 @@ def __init__( def root_device(self) -> torch.device: """Return the root device.""" - @property - def lightning_module(self) -> Optional["pl.LightningModule"]: - return unwrap_lightning_module(self.model) if self.model is not None else None - @property def global_rank(self) -> int: return self.cluster_environment.global_rank() if self.cluster_environment is not None else 0 diff --git a/src/pytorch_lightning/strategies/sharded.py b/src/pytorch_lightning/strategies/sharded.py index 01401bd53bb56..ce1e4cd96b961 100644 --- a/src/pytorch_lightning/strategies/sharded.py +++ b/src/pytorch_lightning/strategies/sharded.py @@ -20,20 +20,18 @@ import pytorch_lightning as pl from pytorch_lightning.core.optimizer import LightningOptimizer -from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE +from pytorch_lightning.overrides.base import _LightningModuleWrapperBase from pytorch_lightning.strategies.ddp import DDPStrategy from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE +from pytorch_lightning.utilities.imports import _FAIRSCALE_AVAILABLE, _FAIRSCALE_OSS_FP16_BROADCAST_AVAILABLE from pytorch_lightning.utilities.optimizer import optimizers_to_device from pytorch_lightning.utilities.rank_zero import rank_zero_only if _FAIRSCALE_AVAILABLE: from fairscale.nn.data_parallel.sharded_ddp import ShardedDataParallel from fairscale.optim import OSS - - from pytorch_lightning.overrides.fairscale import LightningShardedDataParallel, unwrap_lightning_module_sharded else: OSS = ShardedDataParallel = object @@ -44,6 +42,14 @@ class DDPShardedStrategy(DDPStrategy): strategy_name = "ddp_sharded" _REDUCE_BUFFER_SIZE_DEFAULT: int = 2**23 # 8M + def connect(self, model: "pl.LightningModule") -> None: + if not _FAIRSCALE_AVAILABLE: # pragma: no cover + raise MisconfigurationException( + "`DDPShardedStrategy` requires `fairscale` to be installed." + " Install it by running `pip install fairscale`." 
+ ) + return super().connect(model) + def setup(self, trainer: "pl.Trainer") -> None: # share ddp pids to all processes self._rank_0_will_call_children_scripts = self.broadcast(self._rank_0_will_call_children_scripts) @@ -70,7 +76,7 @@ def configure_ddp(self) -> None: self._set_ddp_kwargs() self.setup_optimizers(self.model.trainer) self.model, self.optimizers = self._setup_model_and_optimizers( - model=LightningShardedDataParallel(self.model), + model=_LightningModuleWrapperBase(self.model), optimizers=self.optimizers, ) optimizers_to_device(self.optimizers, self.root_device) @@ -128,15 +134,6 @@ def _optim_state_dict(self, optimizer): """ return optimizer.state_dict() - @property - def lightning_module(self) -> Optional["pl.LightningModule"]: - if not _FAIRSCALE_AVAILABLE: # pragma: no cover - raise MisconfigurationException( - "`DDPShardedStrategy` requires `fairscale` to be installed." - " Install it by running `pip install fairscale`." - ) - return unwrap_lightning_module_sharded(self.model) if self.model is not None else None - def pre_backward(self, closure_loss: Tensor) -> None: pass diff --git a/src/pytorch_lightning/strategies/sharded_spawn.py b/src/pytorch_lightning/strategies/sharded_spawn.py index 882302e101cb6..f19aae7302eea 100644 --- a/src/pytorch_lightning/strategies/sharded_spawn.py +++ b/src/pytorch_lightning/strategies/sharded_spawn.py @@ -12,18 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. from contextlib import contextmanager -from typing import Any, Dict, Generator, List, Optional, Tuple +from typing import Any, Dict, Generator, List, Tuple from torch import Tensor from torch.nn import Module from torch.optim import Optimizer import pytorch_lightning as pl -from pytorch_lightning.overrides.base import _LightningPrecisionModuleWrapperBase -from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE +from pytorch_lightning.overrides.base import _LightningModuleWrapperBase, _LightningPrecisionModuleWrapperBase from pytorch_lightning.strategies.ddp_spawn import DDPSpawnStrategy from pytorch_lightning.trainer.states import TrainerFn from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.imports import _FAIRSCALE_AVAILABLE from pytorch_lightning.utilities.optimizer import optimizers_to_device from pytorch_lightning.utilities.rank_zero import rank_zero_only @@ -31,7 +31,6 @@ from fairscale.nn.data_parallel.sharded_ddp import ShardedDataParallel from fairscale.optim import OSS - from pytorch_lightning.overrides.fairscale import LightningShardedDataParallel, unwrap_lightning_module_sharded else: OSS = ShardedDataParallel = object @@ -41,13 +40,21 @@ class DDPSpawnShardedStrategy(DDPSpawnStrategy): strategy_name = "ddp_sharded_spawn" + def connect(self, model: "pl.LightningModule") -> None: + if not _FAIRSCALE_AVAILABLE: # pragma: no cover + raise MisconfigurationException( + "`DDPSpawnShardedStrategy` requires `fairscale` to be installed." + " Install it by running `pip install fairscale`." 
+ ) + return super().connect(model) + def configure_ddp(self) -> None: # set up optimizers after the wrapped module has been moved to the device assert self.lightning_module is not None self.setup_optimizers(self.lightning_module.trainer) assert isinstance(self.model, (pl.LightningModule, _LightningPrecisionModuleWrapperBase)) self.model, self.optimizers = self._setup_model_and_optimizers( - model=LightningShardedDataParallel(self.model), optimizers=self.optimizers + model=_LightningModuleWrapperBase(self.model), optimizers=self.optimizers ) optimizers_to_device(self.optimizers, self.root_device) @@ -104,15 +111,6 @@ def _optim_state_dict(self, optimizer: Optimizer) -> Dict[str, Any]: """ return optimizer.state_dict() - @property - def lightning_module(self) -> Optional["pl.LightningModule"]: - if not _FAIRSCALE_AVAILABLE: # pragma: no cover - raise MisconfigurationException( - "`DDPSpawnShardedStrategy` requires `fairscale` to be installed." - " Install it by running `pip install fairscale`." - ) - return unwrap_lightning_module_sharded(self.model) if self.model is not None else None - def pre_backward(self, closure_loss: Tensor) -> None: pass diff --git a/src/pytorch_lightning/strategies/strategy.py b/src/pytorch_lightning/strategies/strategy.py index 59f1e37095e60..c09e7eae8c586 100644 --- a/src/pytorch_lightning/strategies/strategy.py +++ b/src/pytorch_lightning/strategies/strategy.py @@ -24,7 +24,6 @@ import pytorch_lightning as pl from pytorch_lightning.core.optimizer import _init_optimizers_and_lr_schedulers, LightningOptimizer -from pytorch_lightning.overrides.base import unwrap_lightning_module from pytorch_lightning.plugins import TorchCheckpointIO from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.io.wrapper import _WrappingCheckpointIO @@ -62,8 +61,9 @@ def __init__( self._accelerator: Optional["pl.accelerators.accelerator.Accelerator"] = accelerator self._checkpoint_io: Optional[CheckpointIO] = checkpoint_io self._precision_plugin: Optional[PrecisionPlugin] = precision_plugin - self._launcher: Optional[_Launcher] = None + self._lightning_module: Optional[pl.LightningModule] = None self._model: Optional[Module] = None + self._launcher: Optional[_Launcher] = None self._optimizers: List[Optimizer] = [] self._lightning_optimizers: Dict[int, LightningOptimizer] = {} self.lr_scheduler_configs: List[LRSchedulerConfig] = [] @@ -113,8 +113,9 @@ def optimizers(self, optimizers: List[Optimizer]) -> None: idx: LightningOptimizer._to_lightning_optimizer(opt, self, idx) for idx, opt in enumerate(self.optimizers) } - def connect(self, model: Module) -> None: + def connect(self, model: "pl.LightningModule") -> None: """Called by the accelerator to connect the accelerator and the model with this plugin.""" + self._lightning_module = model self.model = model def _configure_launcher(self) -> None: @@ -328,7 +329,7 @@ def post_backward(self, closure_loss: Tensor) -> None: @property def model(self) -> Optional[Module]: """Returns the potentially wrapped LightningModule.""" - return self._model + return self._model if self._model is not None else self._lightning_module @model.setter def model(self, new_model: Optional[Module]) -> None: @@ -337,7 +338,7 @@ def model(self, new_model: Optional[Module]) -> None: @property def lightning_module(self) -> Optional["pl.LightningModule"]: """Returns the pure LightningModule without potential wrappers.""" - return unwrap_lightning_module(self.model) if self.model is not None else None + return 
self._lightning_module def load_checkpoint(self, checkpoint_path: _PATH) -> Dict[str, Any]: torch.cuda.empty_cache() diff --git a/src/pytorch_lightning/strategies/tpu_spawn.py b/src/pytorch_lightning/strategies/tpu_spawn.py index 62bb1c308480b..5ca8db74c4620 100644 --- a/src/pytorch_lightning/strategies/tpu_spawn.py +++ b/src/pytorch_lightning/strategies/tpu_spawn.py @@ -124,7 +124,7 @@ def _validate_patched_dataloaders(model: "pl.LightningModule") -> None: assert not isinstance(source.instance, (pl.LightningModule, pl.LightningDataModule)) TPUSpawnStrategy._validate_dataloader(source.instance) - def connect(self, model: "pl.LightningModule") -> None: # type: ignore + def connect(self, model: "pl.LightningModule") -> None: TPUSpawnStrategy._validate_patched_dataloaders(model) self.wrapped_model = xmp.MpModelWrapper(LightningDistributedModule(model)) return super().connect(model) @@ -139,11 +139,11 @@ def setup(self, trainer: "pl.Trainer") -> None: if self.debug: os.environ["PT_XLA_DEBUG"] = "1" - assert self.model - shared_params = find_shared_parameters(self.model) + assert self.lightning_module + shared_params = find_shared_parameters(self.lightning_module) self.model_to_device() - assert isinstance(self.model.module, Module) - set_shared_parameters(self.model.module, shared_params) + + set_shared_parameters(self.lightning_module, shared_params) self.setup_precision_plugin() if trainer.state.fn == TrainerFn.FITTING: diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index 6853c4328af46..5983324f2f62d 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -696,7 +696,7 @@ def fit( """ if not isinstance(model, pl.LightningModule): raise TypeError(f"`Trainer.fit()` requires a `LightningModule`, got: {model.__class__.__qualname__}") - self.strategy.model = model + self.strategy._lightning_module = model self._call_and_handle_interrupt( self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path ) @@ -778,7 +778,7 @@ def validate( """ if model is not None and not isinstance(model, pl.LightningModule): raise TypeError(f"`Trainer.validate()` requires a `LightningModule`, got: {model.__class__.__qualname__}") - self.strategy.model = model or self.lightning_module + self.strategy._lightning_module = model or self.lightning_module return self._call_and_handle_interrupt(self._validate_impl, model, dataloaders, ckpt_path, verbose, datamodule) def _validate_impl( @@ -868,7 +868,7 @@ def test( """ if model is not None and not isinstance(model, pl.LightningModule): raise TypeError(f"`Trainer.test()` requires a `LightningModule`, got: {model.__class__.__qualname__}") - self.strategy.model = model or self.lightning_module + self.strategy._lightning_module = model or self.lightning_module return self._call_and_handle_interrupt(self._test_impl, model, dataloaders, ckpt_path, verbose, datamodule) def _test_impl( @@ -957,7 +957,7 @@ def predict( """ if model is not None and not isinstance(model, pl.LightningModule): raise TypeError(f"`Trainer.predict()` requires a `LightningModule`, got: {model.__class__.__qualname__}") - self.strategy.model = model or self.lightning_module + self.strategy._lightning_module = model or self.lightning_module return self._call_and_handle_interrupt( self._predict_impl, model, dataloaders, datamodule, return_predictions, ckpt_path ) diff --git a/tests/tests_pytorch/accelerators/test_ipu.py b/tests/tests_pytorch/accelerators/test_ipu.py index 
db3b9d1f91952..470cb4a028bed 100644 --- a/tests/tests_pytorch/accelerators/test_ipu.py +++ b/tests/tests_pytorch/accelerators/test_ipu.py @@ -99,7 +99,7 @@ def test_epoch_end(self, outputs) -> None: @pytest.mark.skipif(_IPU_AVAILABLE, reason="test requires non-IPU machine") @mock.patch("pytorch_lightning.accelerators.ipu.IPUAccelerator.is_available", return_value=True) -def test_fail_if_no_ipus(mock_ipu_acc_avail, tmpdir): +def test_fail_if_no_ipus(_, tmpdir): with pytest.raises(MisconfigurationException, match="IPU Accelerator requires IPU devices to run"): Trainer(default_root_dir=tmpdir, accelerator="ipu", devices=1) @@ -118,7 +118,7 @@ def test_warning_if_ipus_not_used(): @RunIf(ipu=True) -def test_no_warning_plugin(tmpdir): +def test_no_warning_strategy(tmpdir): with pytest.warns(None) as record: Trainer(default_root_dir=tmpdir, max_epochs=1, strategy=IPUStrategy(training_opts=poptorch.Options())) assert len(record) == 0 @@ -235,7 +235,7 @@ def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: @RunIf(ipu=True) -def test_device_iterations_ipu_plugin(tmpdir): +def test_device_iterations_ipu_strategy(tmpdir): class TestCallback(Callback): def on_train_start(self, trainer: Trainer, pl_module: LightningModule) -> None: assert trainer.strategy.device_iterations == 2 @@ -442,10 +442,10 @@ def test_manual_poptorch_opts_custom(tmpdir): class TestCallback(Callback): def on_fit_end(self, trainer: Trainer, pl_module: LightningModule) -> None: # ensure dataloaders were correctly set up during training. - plugin = trainer.strategy - assert isinstance(plugin, IPUStrategy) - assert plugin.training_opts.replication_factor == 2 - assert plugin.inference_opts.replication_factor == 1 + strategy = trainer.strategy + assert isinstance(strategy, IPUStrategy) + assert strategy.training_opts.replication_factor == 2 + assert strategy.inference_opts.replication_factor == 1 val_dataloader = trainer.val_dataloaders[0] train_dataloader = trainer.train_dataloader @@ -456,21 +456,21 @@ def on_fit_end(self, trainer: Trainer, pl_module: LightningModule) -> None: assert train_dataloader.options.replication_factor == 2 assert val_dataloader.options.replication_factor == 1 - plugin = IPUStrategy(inference_opts=inference_opts, training_opts=training_opts) + strategy = IPUStrategy(inference_opts=inference_opts, training_opts=training_opts) # ensure we default to the training options replication factor - assert plugin.replication_factor == 2 - trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, strategy=plugin, callbacks=TestCallback()) + assert strategy.replication_factor == 2 + trainer = Trainer(default_root_dir=tmpdir, fast_dev_run=True, strategy=strategy, callbacks=TestCallback()) trainer.fit(model) - plugin = trainer.strategy - assert isinstance(plugin, IPUStrategy) + strategy = trainer.strategy + assert isinstance(strategy, IPUStrategy) - training_opts = plugin.training_opts + training_opts = strategy.training_opts assert training_opts.device_iterations == 8 assert training_opts.replication_factor == 2 assert training_opts.Training.gradient_accumulation == 2 - inference_opts = plugin.inference_opts + inference_opts = strategy.inference_opts assert inference_opts.device_iterations == 16 assert inference_opts.replication_factor == 1 assert inference_opts.Training.gradient_accumulation == 1 @@ -481,8 +481,8 @@ def test_replication_factor(tmpdir): """Ensure if the user passes manual poptorch Options with custom parameters set, we set them correctly in the dataloaders.""" - plugin = 
IPUStrategy() - trainer = Trainer(accelerator="ipu", devices=2, default_root_dir=tmpdir, fast_dev_run=True, strategy=plugin) + strategy = IPUStrategy() + trainer = Trainer(accelerator="ipu", devices=2, default_root_dir=tmpdir, fast_dev_run=True, strategy=strategy) assert isinstance(trainer.accelerator, IPUAccelerator) assert trainer.num_devices == 2 assert trainer.strategy.replication_factor == 2 @@ -492,11 +492,11 @@ def test_replication_factor(tmpdir): inference_opts = poptorch.Options() training_opts.replicationFactor(8) inference_opts.replicationFactor(7) - plugin = IPUStrategy(inference_opts=inference_opts, training_opts=training_opts) + strategy = IPUStrategy(inference_opts=inference_opts, training_opts=training_opts) - trainer = Trainer(default_root_dir=tmpdir, accelerator="ipu", devices=1, strategy=plugin) + trainer = Trainer(default_root_dir=tmpdir, accelerator="ipu", devices=1, strategy=strategy) trainer.optimizers = model.configure_optimizers()[0] - plugin.model = model + strategy._lightning_module = model model.trainer = trainer trainer.state.fn = TrainerFn.FITTING trainer.strategy.setup(trainer) @@ -551,7 +551,7 @@ def configure_optimizers(self): @RunIf(ipu=True) -def test_precision_plugin(tmpdir): +def test_precision_plugin(): """Ensure precision plugin value is set correctly.""" plugin = IPUPrecisionPlugin(precision=16) @@ -606,13 +606,13 @@ def test_set_devices_if_none_ipu(): @RunIf(ipu=True) -def test_strategy_choice_ipu_plugin(tmpdir): +def test_strategy_choice_ipu_strategy(): trainer = Trainer(strategy=IPUStrategy(), accelerator="ipu", devices=8) assert isinstance(trainer.strategy, IPUStrategy) @RunIf(ipu=True) -def test_device_type_when_ipu_strategy_passed(tmpdir): +def test_device_type_when_ipu_strategy_passed(): trainer = Trainer(strategy=IPUStrategy(), accelerator="ipu", devices=8) assert isinstance(trainer.strategy, IPUStrategy) assert isinstance(trainer.accelerator, IPUAccelerator) @@ -620,11 +620,11 @@ def test_device_type_when_ipu_strategy_passed(tmpdir): @RunIf(ipu=True) def test_poptorch_models_at_different_stages(tmpdir): - plugin = IPUStrategy() - trainer = Trainer(default_root_dir=tmpdir, strategy=plugin, accelerator="ipu", devices=8) + strategy = IPUStrategy() + trainer = Trainer(default_root_dir=tmpdir, strategy=strategy, accelerator="ipu", devices=8) model = BoringModel() model.trainer = trainer - plugin.model = model + strategy._lightning_module = model trainer.optimizers = model.configure_optimizers()[0] trainer.state.fn = TrainerFn.FITTING diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-10.py b/tests/tests_pytorch/deprecated_api/test_remove_1-10.py index 6a0a458c6c041..186e526313bba 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-10.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-10.py @@ -11,11 +11,55 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
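Usage sketch for the migration the deprecation tests below exercise (illustrative, not part of the diff; only `Trainer`, `BoringModel`, and the `Strategy.lightning_module` attribute are taken from this patch series):

    from pytorch_lightning import Trainer
    from pytorch_lightning.demos.boring_classes import BoringModel

    model = BoringModel()
    trainer = Trainer(fast_dev_run=True)
    trainer.fit(model)
    # Access the unwrapped LightningModule through the strategy instead of
    # the deprecated `unwrap_lightning_module(trainer.strategy.model)`.
    assert trainer.strategy.lightning_module is model
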
+"""Test deprecated functionality which will be removed in v1.10.0.""" import pytest from pytorch_lightning import Trainer +from pytorch_lightning.demos.boring_classes import BoringModel +from pytorch_lightning.overrides import LightningDistributedModule, LightningParallelModule +from pytorch_lightning.overrides.base import unwrap_lightning_module +from pytorch_lightning.overrides.fairscale import LightningShardedDataParallel, unwrap_lightning_module_sharded +from pytorch_lightning.strategies.bagua import LightningBaguaModule +from pytorch_lightning.strategies.deepspeed import LightningDeepSpeedModule +from pytorch_lightning.strategies.ipu import LightningIPUModule +from tests_pytorch.helpers.runif import RunIf +from tests_pytorch.helpers.utils import no_warning_call def test_deprecated_amp_level(): with pytest.deprecated_call(match="Setting `amp_level` inside the `Trainer` is deprecated in v1.8.0"): Trainer(amp_level="O3", amp_backend="apex") + + +@pytest.mark.parametrize( + "wrapper_class", + [ + LightningParallelModule, + LightningDistributedModule, + LightningBaguaModule, + LightningDeepSpeedModule, + pytest.param(LightningShardedDataParallel, marks=RunIf(fairscale=True)), + LightningIPUModule, + ], +) +def test_v1_10_deprecated_pl_module_init_parameter(wrapper_class): + with no_warning_call( + DeprecationWarning, match=rf"The argument `pl_module` in `{wrapper_class.__name__}` is deprecated in v1.8.0" + ): + wrapper_class(BoringModel()) + + with pytest.deprecated_call( + match=rf"The argument `pl_module` in `{wrapper_class.__name__}` is deprecated in v1.8.0" + ): + wrapper_class(pl_module=BoringModel()) + + +def test_v1_10_deprecated_unwrap_lightning_module(): + with pytest.deprecated_call(match=r"The function `unwrap_lightning_module` is deprecated in v1.8.0"): + unwrap_lightning_module(BoringModel()) + + +@RunIf(fairscale=True) +def test_v1_10_deprecated_unwrap_lightning_module_sharded(): + with pytest.deprecated_call(match=r"The function `unwrap_lightning_module_sharded` is deprecated in v1.8.0"): + unwrap_lightning_module_sharded(BoringModel()) diff --git a/tests/tests_pytorch/helpers/runif.py b/tests/tests_pytorch/helpers/runif.py index abbca75f626ad..4074eaf725e1f 100644 --- a/tests/tests_pytorch/helpers/runif.py +++ b/tests/tests_pytorch/helpers/runif.py @@ -22,11 +22,11 @@ from pytorch_lightning.accelerators.mps import _MPS_AVAILABLE from pytorch_lightning.callbacks.progress.rich_progress import _RICH_AVAILABLE -from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.strategies.bagua import _BAGUA_AVAILABLE from pytorch_lightning.strategies.deepspeed import _DEEPSPEED_AVAILABLE from pytorch_lightning.utilities.imports import ( _APEX_AVAILABLE, + _FAIRSCALE_AVAILABLE, _FAIRSCALE_FULLY_SHARDED_AVAILABLE, _HIVEMIND_AVAILABLE, _HOROVOD_AVAILABLE, diff --git a/tests/tests_pytorch/models/test_amp.py b/tests/tests_pytorch/models/test_amp.py index 159a3767c1df2..786de99f59714 100644 --- a/tests/tests_pytorch/models/test_amp.py +++ b/tests/tests_pytorch/models/test_amp.py @@ -96,8 +96,6 @@ def test_amp_cpus(tmpdir, strategy, precision, devices): trainer.test(model) trainer.predict(model, DataLoader(RandomDataset(32, 64))) - assert trainer.state.finished, f"Training failed with {trainer.state}" - @RunIf(min_cuda_gpus=2, min_torch="1.10") @pytest.mark.parametrize("strategy", [None, "dp", "ddp_spawn"]) @@ -121,8 +119,6 @@ def test_amp_gpus(tmpdir, strategy, precision, devices): trainer.test(model) trainer.predict(model, DataLoader(RandomDataset(32, 
64))) - assert trainer.state.finished, f"Training failed with {trainer.state}" - @RunIf(min_cuda_gpus=2) @mock.patch.dict( @@ -162,9 +158,6 @@ def test_amp_gpu_ddp_slurm_managed(tmpdir): ) trainer.fit(model) - # correct result and ok accuracy - assert trainer.state.finished, "amp + ddp model failed to complete" - # test root model address assert isinstance(trainer.strategy.cluster_environment, SLURMEnvironment) assert trainer.strategy.cluster_environment.resolve_root_node_address("abc") == "abc" @@ -185,7 +178,6 @@ def test_amp_without_apex(bwd_mock, tmpdir): trainer = Trainer(default_root_dir=tmpdir, max_epochs=1, amp_backend="apex") assert trainer.amp_backend is None trainer.fit(model) - assert trainer.state.finished, f"Training failed with {trainer.state}" assert not bwd_mock.called @@ -213,7 +205,6 @@ def configure_optimizers(self): ) assert str(trainer.amp_backend) == "AMPType.APEX" trainer.fit(model) - assert trainer.state.finished, f"Training failed with {trainer.state}" # `max_steps` is fulfilled in the third batch first optimizer, but we don't check the loop # `done` condition until all optimizers have run, so the number of backwards is higher than `max_steps` assert bwd_mock.call_count == 6 diff --git a/tests/tests_pytorch/overrides/test_base.py b/tests/tests_pytorch/overrides/test_base.py index fa07912d0d44e..27d2db688d7ae 100644 --- a/tests/tests_pytorch/overrides/test_base.py +++ b/tests/tests_pytorch/overrides/test_base.py @@ -38,4 +38,5 @@ def test_unwrap_lightning_module(): wrapped_model = _LightningModuleWrapperBase(wrapped_model) wrapped_model = DataParallel(wrapped_model) - assert unwrap_lightning_module(wrapped_model) == model + with pytest.deprecated_call(match="The function `unwrap_lightning_module` is deprecated in v1.8.0"): + assert unwrap_lightning_module(wrapped_model) == model diff --git a/tests/tests_pytorch/plugins/precision/test_sharded_precision.py b/tests/tests_pytorch/plugins/precision/test_sharded_precision.py index 0c08c8e9540eb..ab7a4a432a2c6 100644 --- a/tests/tests_pytorch/plugins/precision/test_sharded_precision.py +++ b/tests/tests_pytorch/plugins/precision/test_sharded_precision.py @@ -15,8 +15,8 @@ import pytest import torch -from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.plugins import ShardedNativeMixedPrecisionPlugin +from pytorch_lightning.utilities.imports import _FAIRSCALE_AVAILABLE from tests_pytorch.helpers.runif import RunIf ShardedGradScaler = None diff --git a/tests/tests_pytorch/strategies/test_sharded_strategy.py b/tests/tests_pytorch/strategies/test_sharded_strategy.py index ad0673ed1a5fa..a0abfb3f73ec0 100644 --- a/tests/tests_pytorch/strategies/test_sharded_strategy.py +++ b/tests/tests_pytorch/strategies/test_sharded_strategy.py @@ -7,9 +7,9 @@ from pytorch_lightning import LightningModule, Trainer from pytorch_lightning.demos.boring_classes import BoringModel -from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.strategies import DDPShardedStrategy, DDPSpawnShardedStrategy from pytorch_lightning.trainer.states import TrainerFn +from pytorch_lightning.utilities.imports import _FAIRSCALE_AVAILABLE from tests_pytorch.helpers.runif import RunIf if _FAIRSCALE_AVAILABLE: @@ -256,8 +256,8 @@ def test_configure_ddp(tmpdir): def test_custom_kwargs_sharded(_, cls): """Tests to ensure that if custom kwargs are passed, they are set correctly.""" strategy = cls(reduce_fp16=True) - strategy.model = Mock(spec=LightningModule) - strategy.model.trainer = Mock() 
+ strategy._lightning_module = Mock(spec=LightningModule) + strategy._lightning_module.trainer = Mock() strategy.parallel_devices = [Mock()] class_name = "sharded" if isinstance(strategy, DDPShardedStrategy) else "sharded_spawn" @@ -276,8 +276,8 @@ def test_custom_kwargs_sharded_reduce_buffer_size(_, params, expected_buffer_siz """Tests to ensure that ``reduce_buffer_size`` is correctly set based on user kwargs.""" strategy = DDPShardedStrategy(**params) strategy.num_nodes = num_nodes - strategy.model = Mock(spec=LightningModule) - strategy.model.trainer = Mock() + strategy._lightning_module = Mock(spec=LightningModule) + strategy._lightning_module.trainer = Mock() strategy.parallel_devices = [Mock()] with mock.patch("pytorch_lightning.strategies.sharded.ShardedDataParallel", autospec=True) as mock_sharded: diff --git a/tests/tests_pytorch/trainer/connectors/test_callback_connector.py b/tests/tests_pytorch/trainer/connectors/test_callback_connector.py index 02e846425a2a0..c56f3fb4d988d 100644 --- a/tests/tests_pytorch/trainer/connectors/test_callback_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_callback_connector.py @@ -56,7 +56,7 @@ def test_checkpoint_callbacks_are_last(tmpdir): # no model callbacks model = LightningModule() model.configure_callbacks = lambda: [] - trainer.model = model + trainer.strategy._lightning_module = model cb_connector = CallbackConnector(trainer) cb_connector._attach_model_callbacks() assert trainer.callbacks == [ @@ -72,7 +72,7 @@ def test_checkpoint_callbacks_are_last(tmpdir): model = LightningModule() model.configure_callbacks = lambda: [checkpoint1, early_stopping, model_summary, checkpoint2] trainer = Trainer(callbacks=[progress_bar, lr_monitor, ModelCheckpoint(tmpdir)]) - trainer.model = model + trainer.strategy._lightning_module = model cb_connector = CallbackConnector(trainer) cb_connector._attach_model_callbacks() assert trainer.callbacks == [ @@ -154,7 +154,7 @@ def _attach_callbacks(trainer_callbacks, model_callbacks): enable_model_summary=False, callbacks=trainer_callbacks, ) - trainer.model = model + trainer.strategy._lightning_module = model cb_connector = CallbackConnector(trainer) cb_connector._attach_model_callbacks() return trainer @@ -212,7 +212,7 @@ def test_attach_model_callbacks_override_info(caplog): trainer = Trainer( enable_checkpointing=False, callbacks=[EarlyStopping(monitor="foo"), LearningRateMonitor(), TQDMProgressBar()] ) - trainer.model = model + trainer.strategy._lightning_module = model cb_connector = CallbackConnector(trainer) with caplog.at_level(logging.INFO): cb_connector._attach_model_callbacks() diff --git a/tests/tests_pytorch/trainer/flags/test_overfit_batches.py b/tests/tests_pytorch/trainer/flags/test_overfit_batches.py index 32f0b8938caf6..da3e154349e1b 100644 --- a/tests/tests_pytorch/trainer/flags/test_overfit_batches.py +++ b/tests/tests_pytorch/trainer/flags/test_overfit_batches.py @@ -142,7 +142,7 @@ def test_distributed_sampler_with_overfit_batches(): strategy="ddp_spawn", ) model.trainer = trainer - trainer.model = model + trainer.strategy._lightning_module = model trainer._data_connector.attach_dataloaders(model) trainer.reset_train_dataloader() train_sampler = trainer.train_dataloader.loaders.sampler diff --git a/tests/tests_pytorch/utilities/test_imports.py b/tests/tests_pytorch/utilities/test_imports.py index 25995bb029f3a..c673716c457f2 100644 --- a/tests/tests_pytorch/utilities/test_imports.py +++ b/tests/tests_pytorch/utilities/test_imports.py @@ -13,7 +13,6 @@ # limitations under the 
License. import operator -from pytorch_lightning.overrides.fairscale import _FAIRSCALE_AVAILABLE from pytorch_lightning.strategies.bagua import _BAGUA_AVAILABLE from pytorch_lightning.strategies.deepspeed import _DEEPSPEED_AVAILABLE from pytorch_lightning.utilities import ( @@ -23,7 +22,7 @@ _OMEGACONF_AVAILABLE, _POPTORCH_AVAILABLE, ) -from pytorch_lightning.utilities.imports import _compare_version, _RequirementAvailable, torch +from pytorch_lightning.utilities.imports import _compare_version, _FAIRSCALE_AVAILABLE, _RequirementAvailable, torch def test_module_exists(): From c8e22b4572277dec395eb5cd7ce6c3451e410847 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Fri, 12 Aug 2022 14:14:21 +0530 Subject: [PATCH 167/230] Avoid raising the sampler warning if num_replicas=1 (#14097) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí Co-authored-by: otaj <6065855+otaj@users.noreply.github.com> --- src/pytorch_lightning/CHANGELOG.md | 3 +++ .../trainer/connectors/data_connector.py | 10 +++++++--- .../trainer/connectors/test_data_connector.py | 9 +++++---- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 4f986257f33ed..d7c4350373e9a 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -89,6 +89,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061)) +- Avoid raising the sampler warning if num_replicas=1 ([#14097](https://github.com/Lightning-AI/lightning/pull/14097)) + + - Fixed saving hyperparameters in a composition where the parent class is not a `LightningModule` or `LightningDataModule` ([#14151](https://github.com/Lightning-AI/lightning/pull/14151)) diff --git a/src/pytorch_lightning/trainer/connectors/data_connector.py b/src/pytorch_lightning/trainer/connectors/data_connector.py index 1de8bee90d18f..6e592b9f6d310 100644 --- a/src/pytorch_lightning/trainer/connectors/data_connector.py +++ b/src/pytorch_lightning/trainer/connectors/data_connector.py @@ -298,10 +298,14 @@ def _resolve_sampler(self, dataloader: DataLoader, shuffle: bool, mode: Optional # update docs too once this is resolved trainer_fn = self.trainer.state.fn - if isinstance(sampler, DistributedSampler) and trainer_fn in (TrainerFn.VALIDATING, TrainerFn.TESTING): + if ( + isinstance(sampler, DistributedSampler) + and sampler.num_replicas > 1 + and trainer_fn in (TrainerFn.VALIDATING, TrainerFn.TESTING) + ): rank_zero_warn( - f"Using `DistributedSampler` with the dataloaders. During `trainer.{trainer_fn.value}()`," - " it is recommended to use `Trainer(devices=1)` to ensure each sample/batch gets evaluated" + f"Using `DistributedSampler` with the dataloaders. During `trainer.{trainer_fn.value}()`, it is" + " recommended to use `Trainer(devices=1, num_nodes=1)` to ensure each sample/batch gets evaluated" " exactly once. 
Otherwise, multi-device settings use `DistributedSampler` that replicates" " some samples to make sure all devices have same batch size in case of uneven inputs.", category=PossibleUserWarning, diff --git a/tests/tests_pytorch/trainer/connectors/test_data_connector.py b/tests/tests_pytorch/trainer/connectors/test_data_connector.py index 7273d7719834e..379a3248a1535 100644 --- a/tests/tests_pytorch/trainer/connectors/test_data_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_data_connector.py @@ -526,19 +526,20 @@ def test_invalid_hook_passed_in_datahook_selector(): dh_selector.get_instance("setup") -def test_eval_distributed_sampler_warning(tmpdir): +@pytest.mark.parametrize("devices, warn_context", [(1, no_warning_call), (2, pytest.warns)]) +def test_eval_distributed_sampler_warning(devices, warn_context): """Test that a warning is raised when `DistributedSampler` is used with evaluation.""" model = BoringModel() - trainer = Trainer(strategy="ddp", devices=2, accelerator="cpu", fast_dev_run=True) + trainer = Trainer(strategy="ddp", devices=devices, accelerator="cpu") trainer._data_connector.attach_data(model) trainer.state.fn = TrainerFn.VALIDATING - with pytest.warns(PossibleUserWarning, match="multi-device settings use `DistributedSampler`"): + with warn_context(PossibleUserWarning, match="multi-device settings use `DistributedSampler`"): trainer.reset_val_dataloader(model) trainer.state.fn = TrainerFn.TESTING - with pytest.warns(PossibleUserWarning, match="multi-device settings use `DistributedSampler`"): + with warn_context(PossibleUserWarning, match="multi-device settings use `DistributedSampler`"): trainer.reset_test_dataloader(model) From 208512e6e93979e4e653e127b0f43b6d703de455 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Fri, 12 Aug 2022 14:44:07 +0530 Subject: [PATCH 168/230] Add docs for `fsdp_native` (#14108) --- .../advanced/model_parallel.rst | 114 +++++++++++++++--- docs/source-pytorch/api_references.rst | 1 + docs/source-pytorch/common/trainer.rst | 2 +- docs/source-pytorch/extensions/strategy.rst | 9 +- .../strategies/fully_sharded.py | 15 ++- .../strategies/fully_sharded_native.py | 73 +++++------ 6 files changed, 148 insertions(+), 66 deletions(-) diff --git a/docs/source-pytorch/advanced/model_parallel.rst b/docs/source-pytorch/advanced/model_parallel.rst index 49da7e2614419..d1db58b3f1869 100644 --- a/docs/source-pytorch/advanced/model_parallel.rst +++ b/docs/source-pytorch/advanced/model_parallel.rst @@ -89,24 +89,24 @@ Sharded Training can work across all DDP variants by adding the additional ``--s Internally we re-initialize your optimizers and shard them across your machines and processes. We handle all communication using PyTorch distributed, so no code changes are required. ----------- +---- .. _fully-sharded-training: -Fully Sharded Training -^^^^^^^^^^^^^^^^^^^^^^ +FairScale Fully Sharded Training +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ .. warning:: - Fully Sharded Training is in beta and the API is subject to change. Please create an `issue `_ if you run into any issues. + FairScale Fully Sharded Training is in BETA and the API is subject to change. Please create an `issue `_ if you run into any problems. -`Fully Sharded `__ shards optimizer state, gradients and parameters across data parallel workers. This allows you to fit much larger models onto multiple GPUs into memory. +`Fully Sharded `_ shards optimizer state, gradients, and parameters across data parallel workers. This allows you to fit much larger models onto multiple GPUs into memory. 
Fully Sharded Training alleviates the need to worry about balancing layers onto specific devices using some form of pipe parallelism, and optimizes for distributed communication with minimal effort. Shard Parameters to Reach 10+ Billion Parameters """""""""""""""""""""""""""""""""""""""""""""""" -To reach larger parameter sizes and be memory efficient, we have to shard parameters. There are various ways to enable this. +To reach larger parameter sizes and to be memory efficient, we have to shard parameters. There are various ways to enable this. .. note:: Currently Fully Sharded Training relies on the user to wrap the model with Fully Sharded within the ``LightningModule``. @@ -116,18 +116,18 @@ To reach larger parameter sizes and be memory efficient, we have to shard parame Enabling Module Sharding for Maximum Memory Efficiency """""""""""""""""""""""""""""""""""""""""""""""""""""" -To activate parameter sharding, you must wrap your model using provided ``wrap`` or ``auto_wrap`` functions as described below. Internally in Lightning, we enable a context manager around the ``configure_sharded_model`` function to make sure the ``wrap`` and ``auto_wrap`` parameters are passed correctly. +To activate parameter sharding, you must wrap your model using the ``wrap`` or ``auto_wrap`` functions. Internally in Lightning, we enable a context manager around the ``configure_sharded_model`` function to make sure the ``wrap`` and ``auto_wrap`` parameters are passed correctly. -When not using Fully Sharded these wrap functions are a no-op. This means once the changes have been made, there is no need to remove the changes for other strategies. +When not using Fully Sharded Training these wrap functions are a no-op. That means once the changes have been made, there is no need to remove the changes for other strategies. -``auto_wrap`` will recursively wrap :class:`~torch.nn.Module` within the ``LightningModule`` with nested Fully Sharded Wrappers, +``auto_wrap`` recursively wraps :class:`~torch.nn.Module` within the ``LightningModule`` with nested Fully Sharded Wrappers, signalling that we'd like to partition these modules across data parallel devices, discarding the full weights when not required (information :class:`here `). -``auto_wrap`` can have varying level of success based on the complexity of your model. **Auto Wrap does not support models with shared parameters**. +``auto_wrap`` can have varying levels of success based on the complexity of your model. **Auto Wrap does not support models with shared parameters**. -``wrap`` will simply wrap the module with a Fully Sharded Parallel class with the correct parameters from the Lightning context manager. +``wrap`` simply wraps the module with a Fully Sharded Parallel class with the correct parameters from the Lightning context manager. -Below is an example of using both ``wrap`` and ``auto_wrap`` to create your model. +Here's an example using both ``wrap`` and ``auto_wrap`` to create your model: .. code-block:: python @@ -147,7 +147,7 @@ Below is an example of using both ``wrap`` and ``auto_wrap`` to create your mode def configure_sharded_model(self): # modules are sharded across processes - # as soon as they are wrapped with ``wrap`` or ``auto_wrap``. + # as soon as they are wrapped with `wrap` or `auto_wrap`. # During the forward/backward passes, weights get synced across processes # and de-allocated once computation is complete, saving memory. 
@@ -174,23 +174,22 @@ Below is an example of using both ``wrap`` and ``auto_wrap`` to create your mode trainer.test() trainer.predict() - ----------- +---- .. _fairscale-activation-checkpointing: FairScale Activation Checkpointing -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +"""""""""""""""""""""""""""""""""" Activation checkpointing frees activations from memory as soon as they are not needed during the forward pass. They are then re-computed for the backwards pass as needed. Activation checkpointing is very useful when you have intermediate layers that produce large activations. -FairScales' checkpointing wrapper also handles batch norm layers correctly unlike the PyTorch implementation, ensuring stats are tracked correctly due to the multiple forward passes. +FairScale's checkpointing wrapper also handles batch norm layers correctly, unlike the PyTorch implementation, ensuring stats are tracked correctly due to the multiple forward passes. -This saves memory when training larger models however requires wrapping modules you'd like to use activation checkpointing on. See :class:`here ` for more information. +This saves memory when training larger models, however it requires wrapping modules you'd like to use activation checkpointing on. See :class:`here ` for more information. .. warning:: - Ensure to not wrap the entire model with activation checkpointing. This is not the intended usage of activation checkpointing, and will lead to failures as seen in `this discussion `__. + Do not wrap the entire model with activation checkpointing. This is not the intended use of activation checkpointing, and will lead to failures as seen in `this discussion `_. .. code-block:: python @@ -205,6 +204,83 @@ This saves memory when training larger models however requires wrapping modules self.block_1 = checkpoint_wrapper(nn.Sequential(nn.Linear(32, 32), nn.ReLU())) self.block_2 = nn.Linear(32, 2) +---- + +.. _fully-sharded-native-training: + +PyTorch Fully Sharded Training +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +PyTorch has it's own version of `FSDP `_ which is upstreamed from their `fairscale `__ project. +It was introduced in their `v1.11.0 release `_. The API is pretty similar to that of FairScale. + +.. note:: + Currently Fully Sharded Training relies on the user to wrap the model with Fully Sharded within the ``LightningModule``. + This means you must create a single model that is treated as a ``torch.nn.Module`` within the ``LightningModule``. + This is a limitation of Fully Sharded Training that will be resolved in the future. + +To activate parameter sharding, you must wrap your model using the``wrap`` function. Internally in Lightning, we enable a context manager around the ``configure_sharded_model`` function to make sure the ``wrap`` parameters are passed correctly. + +When not using Fully Sharded these wrap functions are a no-op. This means once the changes have been made, there is no need to remove the changes for other strategies. + +``wrap`` simply wraps the module with a Fully Sharded Parallel class with the correct parameters from the Lightning context manager. + +Here's an example using that uses ``wrap`` to create your model: + +.. 
code-block:: python + + import torch + import torch.nn as nn + import pytorch_lightning as pl + from pytorch_lightning import Trainer + from torch.distributed.fsdp.wrap import wrap + + + class MyModel(pl.LightningModule): + def __init__(self): + super().__init__() + self.linear_layer = nn.Linear(32, 32) + self.block = nn.Sequential(nn.Linear(32, 32), nn.Linear(32, 32)) + + def configure_sharded_model(self): + # modules are sharded across processes + # as soon as they are wrapped with `wrap`. + # During the forward/backward passes, weights get synced across processes + # and de-allocated once computation is complete, saving memory. + + # Wraps the layer in a Fully Sharded Wrapper automatically + linear_layer = wrap(self.linear_layer) + + for i, layer in enumerate(self.block): + self.block[i] = wrap(layer) + + self.model = nn.Sequential(linear_layer, nn.ReLU(), self.block) + + def configure_optimizers(self): + return torch.optim.AdamW(self.model.parameters()) + + + model = MyModel() + trainer = Trainer(accelerator="gpu", devices=4, strategy="fsdp_native", precision=16) + trainer.fit(model) + + +You can customize the strategy configuration by adjusting the arguments of :class:`~pytorch_lightning.strategies.fully_sharded_native.DDPFullyShardedNativeStrategy` and pass that to the ``strategy`` argument inside the ``Trainer``. + +.. code-block:: python + + from pytorch_lightning import Trainer + from pytorch_lightning.strategies import DDPFullyShardedNativeStrategy + from torch.distributed.fsdp.fully_sharded_data_parallel import CPUOffload + + + native_fsdp = DDPFullyShardedNativeStrategy(cpu_offload=CPUOffload(offload_params=True)) + trainer = pl.Trainer(strategy=native_fsdp, accelerator="gpu", device=4) + + +Check out `this tutorial `__ to learn more about the native support. + +---- .. _deepspeed_advanced: diff --git a/docs/source-pytorch/api_references.rst b/docs/source-pytorch/api_references.rst index 8daed5ddcaf41..f7e2c4bd58b43 100644 --- a/docs/source-pytorch/api_references.rst +++ b/docs/source-pytorch/api_references.rst @@ -285,6 +285,7 @@ strategies BaguaStrategy HivemindStrategy + DDPFullyShardedNativeStrategy DDPFullyShardedStrategy DDPShardedStrategy DDPSpawnShardedStrategy diff --git a/docs/source-pytorch/common/trainer.rst b/docs/source-pytorch/common/trainer.rst index 290d3aefb8524..fe76ebea481aa 100644 --- a/docs/source-pytorch/common/trainer.rst +++ b/docs/source-pytorch/common/trainer.rst @@ -468,7 +468,7 @@ callbacks | -Add a list of :class:`~pytorch_lightning.callbacks.Callback`. Callbacks run sequentially in the order defined here +Add a list of :class:`~pytorch_lightning.callbacks.callback.Callback`. Callbacks run sequentially in the order defined here with the exception of :class:`~pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint` callbacks which run after all others to ensure all states are saved to the checkpoints. diff --git a/docs/source-pytorch/extensions/strategy.rst b/docs/source-pytorch/extensions/strategy.rst index 0cc426225ca36..ed39f68d45e23 100644 --- a/docs/source-pytorch/extensions/strategy.rst +++ b/docs/source-pytorch/extensions/strategy.rst @@ -37,7 +37,6 @@ Built-in strategies can be selected in two ways. The latter allows you to configure further options on the specifc strategy. Here are some examples: - .. 
code-block:: python # Training with the DistributedDataParallel strategy on 4 GPUs @@ -61,10 +60,8 @@ Here are some examples: # Training with the default IPU strategy on 8 IPUs trainer = Trainer(accelerator="ipu", devices=8) - The below table lists all relevant strategies available in Lightning with their corresponding short-hand name: - .. list-table:: Strategy Classes and Nicknames :widths: 20 20 20 :header-rows: 1 @@ -78,9 +75,12 @@ The below table lists all relevant strategies available in Lightning with their * - collaborative - :class:`~pytorch_lightning.strategies.HivemindStrategy` - Strategy for training collaboratively on local machines or unreliable GPUs across the internet. :ref:`Learn more. ` + * - fsdp_native + - :class:`~pytorch_lightning.strategies.DDPFullyShardedNativeStrategy` + - Strategy for Fully Sharded Data Parallel provided by PyTorch. :ref:`Learn more. ` * - fsdp - :class:`~pytorch_lightning.strategies.DDPFullyShardedStrategy` - - Strategy for Fully Sharded Data Parallel provided by FairScale. :ref:`Learn more. ` + - Strategy for Fully Sharded Data Parallel provided by FairScale. :ref:`Learn more. ` * - ddp_sharded - :class:`~pytorch_lightning.strategies.DDPShardedStrategy` - Optimizer and gradient sharded training provided by FairScale. :ref:`Learn more. ` @@ -118,6 +118,7 @@ The below table lists all relevant strategies available in Lightning with their - :class:`~pytorch_lightning.strategies.SingleTPUStrategy` - Strategy for training on a single TPU device. :doc:`Learn more. <../accelerators/tpu>` +---- ************************ Create a Custom Strategy diff --git a/src/pytorch_lightning/strategies/fully_sharded.py b/src/pytorch_lightning/strategies/fully_sharded.py index 283e5e6a868cc..239e4844b146e 100644 --- a/src/pytorch_lightning/strategies/fully_sharded.py +++ b/src/pytorch_lightning/strategies/fully_sharded.py @@ -60,19 +60,22 @@ def __init__( ): """Plugin for Fully Sharded Data Parallel provided by FairScale. + .. warning:: ``DDPFullyShardedStrategy`` is in beta and subject to change. + Full Sharded Training shards the entire model across all available GPUs, allowing you to scale model size, whilst using efficient communication to reduce overhead. In practice, this means we can remain at parity with PyTorch DDP, whilst scaling our model sizes dramatically. The technique is similar to ZeRO-Stage 3 but has been built for upstreaming to PyTorch. - `For more information: https://fairscale.readthedocs.io/en/latest/api/nn/fsdp.html`. - .. warning:: ``FullyShardedPlugin`` is in beta and subject to change. + + For more information + `check out FairScale's docs `__. Defaults have been set and options have been exposed, but may require configuration - based on your level of memory/speed efficiency. We suggest having a look at this PR for more information. - `https://github.com/facebookresearch/fairscale/pull/413` + based on your level of memory/speed efficiency. We suggest having a look at + `this PR for more information `__. - Many of the helpful doc strings below came from the original FairScale documentation: - `https://fairscale.readthedocs.io/en/latest/api/nn/fsdp.html` + Many of the helpful doc strings below came from the original + `FairScale documentation `__. Arguments: cpu_offload: Offload FP32 params to CPU. Only usable in precision=16 mode. 
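
As a quick orientation for the two fully sharded flavors documented above, here is a minimal usage sketch. It is illustrative only and not part of the diffs; it assumes a CUDA machine and, for the ``fsdp`` nickname, that ``fairscale`` is installed:

.. code-block:: python

    from pytorch_lightning import Trainer
    from pytorch_lightning.strategies import DDPFullyShardedNativeStrategy

    # select the FairScale-based strategy by its registered nickname
    trainer = Trainer(strategy="fsdp", accelerator="gpu", devices=4, precision=16)

    # or the PyTorch-native variant, either by nickname ...
    trainer = Trainer(strategy="fsdp_native", accelerator="gpu", devices=4, precision=16)

    # ... or as a configured instance for finer control over its arguments
    trainer = Trainer(strategy=DDPFullyShardedNativeStrategy(), accelerator="gpu", devices=4, precision=16)
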
diff --git a/src/pytorch_lightning/strategies/fully_sharded_native.py b/src/pytorch_lightning/strategies/fully_sharded_native.py index d92931fb5cdb2..cf34aa9738f88 100644 --- a/src/pytorch_lightning/strategies/fully_sharded_native.py +++ b/src/pytorch_lightning/strategies/fully_sharded_native.py @@ -59,6 +59,43 @@ class DDPFullyShardedNativeStrategy(ParallelStrategy): + r"""Strategy for Fully Sharded Data Parallel provided by torch.distributed. + + .. warning:: ``DDPFullyShardedNativeStrategy`` is in BETA and subject to change. The interface can + bring breaking changes and new features with the next release of PyTorch. + + Fully Sharded Training shards the entire model across all available GPUs, allowing you to scale model + size, whilst using efficient communication to reduce overhead. In practice, this means we can remain + at parity with PyTorch DDP, whilst scaling our model sizes dramatically. The technique is similar + to ZeRO-Stage 3. + + For more information `check out `__. + + Defaults have been set and options have been exposed, but may require configuration + based on your level of memory/speed efficiency. We suggest having a look at + `this tutorial `__ for more information. + + Arguments: + cpu_offload: + CPU offloading config. Currently, only parameter and gradient CPU + offload is supported. It can be enabled via passing in + ``cpu_offload=CPUOffload(offload_params=True)``. Note that this + currently implicitly enables gradient offloading to CPU in order for + params and grads to be on same device to work with optimizer. This + API is subject to change. Default is ``None`` in which case there + will be no offloading. + backward_prefetch: + This is an experimental feature that is subject to change in the + the near future. It allows users to enable two different backward_prefetch + algorithms to help backward communication and computation overlapping. + The pros and cons of each algorithm is explained in the class ``BackwardPrefetch``. + mixed_precision: + Mixed Precision config. By default, Lightning will enable FP16 if ``precision=16`` + or BF16 if ``precision=bf16`` unless a config is passed in. + This is only available in PyTorch 1.12 and later. + \**kwargs: Passed to the FSDP context manager which will configure the FSDP class when wrapping modules. + + """ strategy_name = "fsdp_native" _registered_strategies: List[str] = [] @@ -76,42 +113,6 @@ def __init__( mixed_precision: Optional[MixedPrecision] = None, **kwargs: Any, ) -> None: - r"""Strategy for Fully Sharded Data Parallel provided by torch.Distributed. - - Fully Sharded Training shards the entire model across all available GPUs, allowing you to scale model - size, whilst using efficient communication to reduce overhead. In practice, this means we can remain - at parity with PyTorch DDP, whilst scaling our model sizes dramatically. The technique is similar - to ZeRO-Stage 3. - `For more information: https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/`. - - .. warning:: ``DDPFullyShardedNativeStrategy`` is in beta and subject to change. The interface can - bring breaking changes and new features with the next release of PyTorch. - - Defaults have been set and options have been exposed, but may require configuration - based on your level of memory/speed efficiency. We suggest having a look at this tutorial for - more information. - `https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html` - - Arguments: - cpu_offload: - CPU offloading config. 
Currently, only parameter and gradient CPU - offload is supported. It can be enabled via passing in - ``cpu_offload=CPUOffload(offload_params=True)``. Note that this - currently implicitly enables gradient offloading to CPU in order for - params and grads to be on same device to work with optimizer. This - API is subject to change. Default is ``None`` in which case there - will be no offloading. - backward_prefetch: - This is an experimental feature that is subject to change in the - the near future. It allows users to enable two different backward_prefetch - algorithms to help backward communication and computation overlapping. - Pros and cons of each algorithm is explained in the class ``BackwardPrefetch``. - mixed_precision: - Mixed Precision config. By default, Lightning will enable FP16 if ``precision=16` - or BF16 if ``precision=bf16`` unless a config is passed in. - This is only available in PyTorch 1.12 and later. - \**kwargs: Passed to the FSDP Context manager which will configure the FSDP class when wrapping modules. - """ if not _TORCH_GREATER_EQUAL_1_12: raise MisconfigurationException( "`DDPFullyShardedNativeStrategy` is supported from PyTorch v1.12.0 onwards." From fe9e5d55bf7991ba36b76d6adae9075b93dfcaa0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 12 Aug 2022 13:24:35 +0200 Subject: [PATCH 169/230] Remove skipping logic in favor of path filtering (#14170) --- .azure/gpu-tests.yml | 53 ++++++++------------ .github/checkgroup.yml | 1 - .github/file-filters.yml | 9 ---- .github/workflows/ci-app-cloud-e2e-test.yml | 28 ++--------- .github/workflows/ci-app-examples.yml | 7 +++ .github/workflows/ci-app-tests.yml | 4 +- .github/workflows/ci-pytorch-test-conda.yml | 35 +++---------- .github/workflows/ci-pytorch-test-full.yml | 54 +++++---------------- .github/workflows/ci-pytorch-test-slow.yml | 38 ++++----------- 9 files changed, 65 insertions(+), 164 deletions(-) delete mode 100644 .github/file-filters.yml diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index 8ae670d265ced..8444468c0c58a 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -12,15 +12,31 @@ trigger: - "master" - "release/*" - "refs/tags/*" + paths: + include: + - ".azure/**" + - "examples/run_ddp_examples.sh" + - "examples/convert_from_pt_to_pl/**" + - "examples/run_pl_examples.sh" + - "examples/pl_basics/backbone_image_classifier.py" + - "examples/pl_basics/autoencoder.py" + - "examples/pl_loops/mnist_lite.py" + - "examples/pl_fault_tolerant/automatic.py" + - "examples/test_pl_examples.py" + - "examples/pl_integrations/dali_image_classifier.py" + - "requirements/pytorch/**" + - "src/pytorch_lightning/**" + - "tests/tests_pytorch/**" + - "setup.cfg" + - "pyproject.toml" + - ".github/workflows/ci-pytorch*.yml" + - ".github/workflows/docs-*.yml" + pr: - "master" - "release/*" -variables: - - name: continue - value: '1' - jobs: - job: testing strategy: @@ -41,22 +57,6 @@ jobs: clean: all steps: - - - bash: | - CHANGED_FILES=$(git diff --name-status origin/master -- . 
| awk '{print $2}') - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.azure/gpu-tests.yml' - echo $CHANGED_FILES > changed_files.txt - MATCHES=$(cat changed_files.txt | grep -E $FILTER) - echo $MATCHES - if [ -z "$MATCHES" ]; then - echo "Skip" - echo "##vso[task.setvariable variable=continue]0" - else - echo "Continue" - echo "##vso[task.setvariable variable=continue]1" - fi - displayName: Skipper - - bash: | lspci | egrep 'VGA|3D' whereis nvidia @@ -66,7 +66,6 @@ jobs: pip --version pip list displayName: 'Image info & NVIDIA' - condition: eq(variables['continue'], '1') - bash: | set -e @@ -82,7 +81,6 @@ jobs: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 displayName: 'Install dependencies' - condition: eq(variables['continue'], '1') - bash: | set -e @@ -91,16 +89,13 @@ jobs: python requirements/pytorch/check-avail-strategies.py python requirements/pytorch/check-avail-extras.py displayName: 'Env details' - condition: eq(variables['continue'], '1') - bash: bash .actions/pull_legacy_checkpoints.sh displayName: 'Get legacy checkpoints' - condition: eq(variables['continue'], '1') - bash: python -m coverage run --source pytorch_lightning -m pytest workingDirectory: src/pytorch_lightning displayName: 'Testing: PyTorch doctests' - condition: eq(variables['continue'], '1') - bash: python -m coverage run --source pytorch_lightning -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 env: @@ -108,7 +103,6 @@ jobs: workingDirectory: tests/tests_pytorch displayName: 'Testing: PyTorch standard' timeoutInMinutes: "35" - condition: eq(variables['continue'], '1') - bash: bash run_standalone_tests.sh workingDirectory: tests/tests_pytorch @@ -117,7 +111,6 @@ jobs: PL_RUN_CUDA_TESTS: "1" displayName: 'Testing: PyTorch standalone tests' timeoutInMinutes: "35" - condition: eq(variables['continue'], '1') - bash: bash run_standalone_tasks.sh workingDirectory: tests/tests_pytorch @@ -126,7 +119,6 @@ jobs: PL_RUN_CUDA_TESTS: "1" displayName: 'Testing: PyTorch standalone tasks' timeoutInMinutes: "10" - condition: eq(variables['continue'], '1') - bash: | python -m coverage report @@ -136,14 +128,13 @@ jobs: ls -l workingDirectory: tests/tests_pytorch displayName: 'Statistics' - condition: eq(variables['continue'], '1') - task: PublishTestResults@2 displayName: 'Publish test results' inputs: testResultsFiles: '$(Build.StagingDirectory)/test-results.xml' testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)' - condition: and(succeededOrFailed(), eq(variables['continue'], '1')) + condition: succeededOrFailed() - script: | set -e @@ -155,11 +146,9 @@ jobs: env: PL_USE_MOCKED_MNIST: "1" displayName: 'Testing: PyTorch examples' - condition: eq(variables['continue'], '1') - bash: python -m pytest benchmarks -v --maxfail=2 --durations=0 workingDirectory: tests/tests_pytorch env: PL_RUN_CUDA_TESTS: "1" displayName: 'Testing: PyTorch benchmarks' - condition: eq(variables['continue'], '1') diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 8f1d3c6fb5e86..0cb80d6e34bd8 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -101,7 +101,6 @@ subprojects: - id: "lightning_app" paths: - ".github/workflows/ci-app*.yml" - - "examples/app_**" - "requirements/app/**" - "src/lightning_app/**" - "tests/tests_app/**" diff --git a/.github/file-filters.yml b/.github/file-filters.yml deleted file mode 100644 index e621cd83881e4..0000000000000 --- a/.github/file-filters.yml +++ /dev/null @@ -1,9 +0,0 @@ 
-# This file contains filters to be used in the CI to detect file changes and run the required CI jobs. - -app_examples: - - "src/lightning_app/**" - - "tests/tests_app_examples/**" - - "requirements/app/**" - - "examples/app_*" - - "setup.py" - - "src/pytorch_lightning/__version__.py" diff --git a/.github/workflows/ci-app-cloud-e2e-test.yml b/.github/workflows/ci-app-cloud-e2e-test.yml index 9a5a10a95cd33..707c506e89e5a 100644 --- a/.github/workflows/ci-app-cloud-e2e-test.yml +++ b/.github/workflows/ci-app-cloud-e2e-test.yml @@ -7,37 +7,19 @@ on: # Trigger the workflow on push or pull request, but only for the master bran branches: [master, "release/*"] pull_request: branches: [master, "release/*"] + paths: + - ".github/workflows/ci-app-cloud-e2e-test.yml" + - "requirements/app/**" + - "src/lightning_app/**" + - "examples/app_*" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} cancel-in-progress: ${{ github.ref != 'refs/heads/master' }} jobs: - # This is job should once only once per PR to detect file changes so run required jobs. - # see .github/file-filters.yml to define file filters and run the jobs based on the output of each filter. - # More info: https://github.com/marketplace/actions/paths-changes-filter - - changes: - runs-on: ubuntu-latest - # Set job outputs to the values from filter step - outputs: - app_examples: ${{ steps.filter.outputs.app_examples }} - steps: - - uses: actions/checkout@v2 - - name: Set up Python 3.8 - uses: actions/setup-python@v4 - with: - python-version: "3.8" - - - uses: dorny/paths-filter@v2 - id: filter - with: - filters: .github/file-filters.yml - cloud-test: name: Cloud Test - needs: changes - if: ${{ needs.changes.outputs.app_examples == 'true' }} runs-on: ubuntu-20.04 strategy: fail-fast: false diff --git a/.github/workflows/ci-app-examples.yml b/.github/workflows/ci-app-examples.yml index 01570f59c2c77..8114f59b01aaa 100644 --- a/.github/workflows/ci-app-examples.yml +++ b/.github/workflows/ci-app-examples.yml @@ -6,6 +6,13 @@ on: # Trigger the workflow on push or pull request, but only for the master bran branches: [master, "release/*"] pull_request: branches: [master, "release/*"] + paths: + - ".github/workflows/ci-app-examples.yml" + - "requirements/app/**" + - "src/lightning_app/**" + - "tests/tests_app_examples/**" + # the examples are used in the app CI + - "examples/app_*" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} diff --git a/.github/workflows/ci-app-tests.yml b/.github/workflows/ci-app-tests.yml index fe3cc36dc16d3..fb2cdbda69079 100644 --- a/.github/workflows/ci-app-tests.yml +++ b/.github/workflows/ci-app-tests.yml @@ -6,10 +6,10 @@ on: # Trigger the workflow on push or pull request, but only for the master bran branches: [master, "release/*"] pull_request: paths: + - ".github/workflows/ci-app-tests.yml" + - "requirements/app/**" - "src/lightning_app/**" - "tests/tests_app/**" - - "requirements/app/**" - - "setup.py" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} diff --git a/.github/workflows/ci-pytorch-test-conda.yml b/.github/workflows/ci-pytorch-test-conda.yml index 2bbdb699c2c1e..d314a742bbdcb 100644 --- a/.github/workflows/ci-pytorch-test-conda.yml +++ b/.github/workflows/ci-pytorch-test-conda.yml @@ -6,6 +6,12 @@ on: # Trigger the workflow on push or pull request, but only for the master bra branches: [master, "release/*"] pull_request: branches: [master, "release/*"] + paths: + - "requirements/pytorch/**" + - 
"src/pytorch_lightning/**" + - "tests/tests_pytorch/**" + - "setup.cfg" # includes pytest config + - ".github/workflows/ci-pytorch-test-conda.yml" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} @@ -35,28 +41,7 @@ jobs: - uses: actions/checkout@v2 - - name: Get changed files - id: changed-files - uses: tj-actions/changed-files@v24 - - - name: Decide if the test should be skipped - id: skip - shell: bash -l {0} - run: | - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.github/workflows/ci-pytorch-test-conda.yml' - echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt - MATCHES=$(cat changed_files.txt | grep -E $FILTER) - echo $MATCHES - if [ -z "$MATCHES" ]; then - echo "Skip" - echo "::set-output name=continue::0" - else - echo "Continue" - echo "::set-output name=continue::1" - fi - - name: Update base dependencies - if: ${{ (steps.skip.outputs.continue == '1') }} env: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 @@ -70,12 +55,10 @@ jobs: run: pip install "Pillow<9.0" # It messes with torchvision - name: DocTests - if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: ./src run: pytest pytorch_lightning --cov=pytorch_lightning - name: Update all dependencies - if: ${{ (steps.skip.outputs.continue == '1') }} env: HOROVOD_BUILD_ARCH_FLAGS: "-mfma" HOROVOD_WITHOUT_MXNET: 1 @@ -95,11 +78,9 @@ jobs: python requirements/pytorch/check-avail-extras.py - name: Pull legacy checkpoints - if: ${{ (steps.skip.outputs.continue == '1') }} run: bash .actions/pull_legacy_checkpoints.sh - name: Testing PyTorch - if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: tests/tests_pytorch run: coverage run --source pytorch_lightning -m pytest -v --timeout 150 --durations=50 --junitxml=results-${{ runner.os }}-torch${{ matrix.pytorch-version }}.xml @@ -111,7 +92,7 @@ jobs: if: failure() - name: Statistics - if: ${{ success() && (steps.skip.outputs.continue == '1') }} + if: success() working-directory: tests/tests_pytorch run: | coverage report @@ -119,7 +100,7 @@ jobs: - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 - if: ${{ success() && (steps.skip.outputs.continue == '1') }} + if: success() # see: https://github.com/actions/toolkit/issues/399 continue-on-error: true with: diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 7409ce25a5128..386bb012b8cc6 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -7,6 +7,12 @@ on: # Trigger the workflow on push or pull request, but only for the master bra pull_request: branches: [master, "release/*"] types: [opened, reopened, ready_for_review, synchronize] + paths: + - "requirements/pytorch/**" + - "src/pytorch_lightning/**" + - "tests/tests_pytorch/**" + - "setup.cfg" # includes pytest config + - ".github/workflows/ci-pytorch-test-full.yml" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} @@ -37,67 +43,42 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Get changed files - id: changed-files - uses: tj-actions/changed-files@v24 - - - name: Decide if the test should be skipped - id: skip - shell: bash -l {0} - run: | - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.github/workflows/ci-pytorch_test-full.yml' - echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt - MATCHES=$(cat 
changed_files.txt | grep -E $FILTER) - echo $MATCHES - if [ -z "$MATCHES" ]; then - echo "Skip" - echo "::set-output name=continue::0" - else - echo "Continue" - echo "::set-output name=continue::1" - fi - - name: Set up Python ${{ matrix.python-version }} - if: ${{ (steps.skip.outputs.continue == '1') }} uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Reset caching - if: ${{ (steps.skip.outputs.continue == '1') }} run: python -c "import time; days = time.time() / 60 / 60 / 24; print(f'TIME_PERIOD=d{int(days / 2) * 2}')" >> $GITHUB_ENV - name: basic setup - if: ${{ (steps.skip.outputs.continue == '1') }} run: | pip --version pip install -q fire # Github Actions: Run step on specific OS: https://stackoverflow.com/a/57948488/4521646 - name: Setup macOS - if: ${{ (runner.os == 'macOS') && (steps.skip.outputs.continue == '1') }} + if: ${{ (runner.os == 'macOS') }} run: | brew install openmpi libuv # Horovod on macOS requires OpenMPI, Gloo not currently supported - name: Setup Windows - if: ${{ (runner.os == 'windows') && (steps.skip.outputs.continue == '1') }} + if: ${{ (runner.os == 'windows') }} run: | python .actions/assistant.py requirements_prune_pkgs horovod - name: Set min. dependencies - if: ${{ (matrix.requires == 'oldest') && (steps.skip.outputs.continue == '1') }} + if: ${{ (matrix.requires == 'oldest') }} run: | python .actions/assistant.py replace_oldest_ver # Note: This uses an internal pip API and may not always work # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow - name: Get pip cache dir - if: ${{ (steps.skip.outputs.continue == '1') }} id: pip-cache run: echo "::set-output name=dir::$(pip cache dir)" - name: pip cache - if: ${{ (steps.skip.outputs.continue == '1') }} uses: actions/cache@v3 with: path: ${{ steps.pip-cache.outputs.dir }} @@ -106,11 +87,9 @@ jobs: ${{ runner.os }}-pip-td${{ env.TIME_PERIOD }}-py${{ matrix.python-version }}-${{ matrix.release }}-${{ matrix.requires }}- - name: Pull legacy checkpoints - if: ${{ (steps.skip.outputs.continue == '1') }} run: bash .actions/pull_legacy_checkpoints.sh - name: Install dependencies - if: ${{ (steps.skip.outputs.continue == '1') }} env: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 @@ -122,12 +101,10 @@ jobs: shell: bash - name: DocTests - if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: ./src run: pytest pytorch_lightning --cov=pytorch_lightning - name: Install extra dependencies - if: ${{ (steps.skip.outputs.continue == '1') }} run: | # adjust versions according installed Torch version python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt @@ -136,7 +113,7 @@ jobs: shell: bash - name: Reinstall Horovod if necessary - if: ${{ (runner.os != 'windows') && (steps.skip.outputs.continue == '1') }} + if: ${{ (runner.os != 'windows') }} env: HOROVOD_BUILD_ARCH_FLAGS: "-mfma" HOROVOD_WITHOUT_MXNET: 1 @@ -153,43 +130,38 @@ jobs: shell: bash - name: Cache datasets - if: ${{ (steps.skip.outputs.continue == '1') }} uses: actions/cache@v3 with: path: Datasets key: pl-dataset - name: Sanity check - if: ${{ (steps.skip.outputs.continue == '1') }} run: python requirements/pytorch/check-avail-extras.py - name: Testing PyTorch - if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: tests/tests_pytorch # NOTE: do not include coverage report here, see: https://github.com/nedbat/coveragepy/issues/1003 run: coverage run --source pytorch_lightning -m pytest -v --durations=50 --junitxml=results-${{ runner.os 
}}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml - name: Upload pytest results - if: ${{ (failure()) && (steps.skip.outputs.continue == '1') }} + if: failure() uses: actions/upload-artifact@v3 with: name: unittest-results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }} path: tests/tests_pytorch/results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml - name: Prepare Examples - if: ${{ (steps.skip.outputs.continue == '1') }} run: | # adjust versions according installed Torch version python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt pip install -r requirements/pytorch/examples.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade - name: Run Examples - if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: ./examples run: python -m pytest test_pl_examples.py -v --durations=10 - name: Statistics - if: ${{ (success()) && (steps.skip.outputs.continue == '1') }} + if: success() working-directory: tests/tests_pytorch run: | coverage report @@ -197,7 +169,7 @@ jobs: - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 - if: ${{ (always()) && (steps.skip.outputs.continue == '1') }} + if: always() # see: https://github.com/actions/toolkit/issues/399 continue-on-error: true with: diff --git a/.github/workflows/ci-pytorch-test-slow.yml b/.github/workflows/ci-pytorch-test-slow.yml index 36007d3311451..8e97ea90b2bc4 100644 --- a/.github/workflows/ci-pytorch-test-slow.yml +++ b/.github/workflows/ci-pytorch-test-slow.yml @@ -7,6 +7,12 @@ on: # Trigger the workflow on push or pull request, but only for the master bra pull_request: branches: [master, "release/*"] types: [opened, reopened, ready_for_review, synchronize] + paths: + - "requirements/pytorch/**" + - "src/pytorch_lightning/**" + - "tests/tests_pytorch/**" + - "setup.cfg" # includes pytest config + - ".github/workflows/ci-pytorch-test-slow.yml" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} @@ -28,43 +34,19 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Get changed files - id: changed-files - uses: tj-actions/changed-files@v24 - - - name: Decide if the test should be skipped - id: skip - shell: bash -l {0} - run: | - FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*|.github/workflows/ci-pytorch_test-slow.yml' - echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt - MATCHES=$(cat changed_files.txt | grep -E $FILTER) - echo $MATCHES - if [ -z "$MATCHES" ]; then - echo "Skip" - echo "::set-output name=continue::0" - else - echo "Continue" - echo "::set-output name=continue::1" - fi - - uses: actions/setup-python@v4 - if: ${{ (steps.skip.outputs.continue == '1') }} with: python-version: ${{ matrix.python-version }} - name: Reset caching - if: ${{ (steps.skip.outputs.continue == '1') }} run: python -c "import time; days = time.time() / 60 / 60 / 24; print(f'TIME_PERIOD=d{int(days / 2) * 2}')" >> $GITHUB_ENV - name: Get pip cache - if: ${{ (steps.skip.outputs.continue == '1') }} id: pip-cache run: | python -c "from pip._internal.locations import USER_CACHE_DIR; print('::set-output name=dir::' + USER_CACHE_DIR)" - name: Cache pip - if: ${{ (steps.skip.outputs.continue == '1') }} uses: actions/cache@v3 with: path: ${{ steps.pip-cache.outputs.dir }} @@ -73,7 +55,6 @@ jobs: ${{ runner.os }}-pip-td${{ env.TIME_PERIOD }}-py${{ 
matrix.python-version }}- - name: Install dependencies - if: ${{ (steps.skip.outputs.continue == '1') }} env: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 @@ -85,21 +66,20 @@ jobs: shell: bash - name: Testing PyTorch - if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: tests/tests_pytorch run: coverage run --source pytorch_lightning -m pytest -v --junitxml=results-${{ runner.os }}-py${{ matrix.python-version }}.xml env: PL_RUN_SLOW_TESTS: 1 - name: Upload pytest test results - if: ${{ (failure()) && (steps.skip.outputs.continue == '1') }} + if: failure() uses: actions/upload-artifact@v3 with: name: unittest-results-${{ runner.os }}-py${{ matrix.python-version }} path: tests/tests_pytorch/results-${{ runner.os }}-py${{ matrix.python-version }}.xml - name: Statistics - if: ${{ (success()) && (steps.skip.outputs.continue == '1') }} + if: success() working-directory: tests/tests_pytorch run: | coverage report @@ -107,7 +87,7 @@ jobs: - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 - if: ${{ (success()) && (steps.skip.outputs.continue == '1') }} + if: success() # see: https://github.com/actions/toolkit/issues/399 continue-on-error: true with: From 48c23e571637438726662104325c05ba768288be Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Sat, 13 Aug 2022 13:22:06 +0530 Subject: [PATCH 170/230] Use fsdp module to initialize precision scalar for fsdp native (#14092) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí Co-authored-by: Laverne Henderson --- docs/source-pytorch/api_references.rst | 1 + docs/source-pytorch/extensions/plugins.rst | 1 + src/pytorch_lightning/CHANGELOG.md | 6 ++ src/pytorch_lightning/plugins/__init__.py | 2 + .../plugins/precision/__init__.py | 43 ++++++++---- .../precision/fsdp_native_native_amp.py | 65 +++++++++++++++++++ .../precision/fully_sharded_native_amp.py | 26 +------- .../strategies/fully_sharded_native.py | 4 +- .../connectors/accelerator_connector.py | 5 +- .../test_ddp_fully_sharded_native.py | 5 +- 10 files changed, 114 insertions(+), 44 deletions(-) create mode 100644 src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py diff --git a/docs/source-pytorch/api_references.rst b/docs/source-pytorch/api_references.rst index f7e2c4bd58b43..9203f60ef3c02 100644 --- a/docs/source-pytorch/api_references.rst +++ b/docs/source-pytorch/api_references.rst @@ -187,6 +187,7 @@ precision DeepSpeedPrecisionPlugin DoublePrecisionPlugin FullyShardedNativeMixedPrecisionPlugin + FullyShardedNativeNativeMixedPrecisionPlugin HPUPrecisionPlugin IPUPrecisionPlugin MixedPrecisionPlugin diff --git a/docs/source-pytorch/extensions/plugins.rst b/docs/source-pytorch/extensions/plugins.rst index a0dbefd141464..27aff0c11fdcb 100644 --- a/docs/source-pytorch/extensions/plugins.rst +++ b/docs/source-pytorch/extensions/plugins.rst @@ -56,6 +56,7 @@ The full list of built-in precision plugins is listed below. DeepSpeedPrecisionPlugin DoublePrecisionPlugin FullyShardedNativeMixedPrecisionPlugin + FullyShardedNativeNativeMixedPrecisionPlugin HPUPrecisionPlugin IPUPrecisionPlugin MixedPrecisionPlugin diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index d7c4350373e9a..6aa6a9c7d8037 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -8,6 +8,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Added +- Added `FullyShardedNativeNativeMixedPrecisionPlugin` to handle precision for `DDPFullyShardedNativeStrategy` ([#14092](https://github.com/Lightning-AI/lightning/pull/14092)) + + - Added prefix to log message in `seed_everything` with rank info ([#13290](https://github.com/Lightning-AI/lightning/issues/13290)) @@ -95,6 +98,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed saving hyperparameters in a composition where the parent class is not a `LightningModule` or `LightningDataModule` ([#14151](https://github.com/Lightning-AI/lightning/pull/14151)) +- Avoided requiring the FairScale package to use precision with the fsdp native strategy ([#14092](https://github.com/Lightning-AI/lightning/pull/14092)) + + ## [1.7.1] - 2022-08-09 ### Fixed diff --git a/src/pytorch_lightning/plugins/__init__.py b/src/pytorch_lightning/plugins/__init__.py index 4d4cc63d89973..5967b8debf3ad 100644 --- a/src/pytorch_lightning/plugins/__init__.py +++ b/src/pytorch_lightning/plugins/__init__.py @@ -10,6 +10,7 @@ from pytorch_lightning.plugins.precision.apex_amp import ApexMixedPrecisionPlugin from pytorch_lightning.plugins.precision.deepspeed import DeepSpeedPrecisionPlugin from pytorch_lightning.plugins.precision.double import DoublePrecisionPlugin +from pytorch_lightning.plugins.precision.fsdp_native_native_amp import FullyShardedNativeNativeMixedPrecisionPlugin from pytorch_lightning.plugins.precision.fully_sharded_native_amp import FullyShardedNativeMixedPrecisionPlugin from pytorch_lightning.plugins.precision.hpu import HPUPrecisionPlugin from pytorch_lightning.plugins.precision.ipu import IPUPrecisionPlugin @@ -38,6 +39,7 @@ "PrecisionPlugin", "ShardedNativeMixedPrecisionPlugin", "FullyShardedNativeMixedPrecisionPlugin", + "FullyShardedNativeNativeMixedPrecisionPlugin", "TPUPrecisionPlugin", "TPUBf16PrecisionPlugin", "LayerSync", diff --git a/src/pytorch_lightning/plugins/precision/__init__.py b/src/pytorch_lightning/plugins/precision/__init__.py index 4bc29c1be1864..5206aed62c497 100644 --- a/src/pytorch_lightning/plugins/precision/__init__.py +++ b/src/pytorch_lightning/plugins/precision/__init__.py @@ -11,17 +11,32 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from pytorch_lightning.plugins.precision.apex_amp import ApexMixedPrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.precision.deepspeed import DeepSpeedPrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.precision.double import DoublePrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.precision.fully_sharded_native_amp import ( # noqa: F401 - FullyShardedNativeMixedPrecisionPlugin, -) -from pytorch_lightning.plugins.precision.hpu import HPUPrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.precision.ipu import IPUPrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.precision.mixed import MixedPrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.precision.tpu import TPUPrecisionPlugin # noqa: F401 -from pytorch_lightning.plugins.precision.tpu_bf16 import TPUBf16PrecisionPlugin # noqa: F401 +from pytorch_lightning.plugins.precision.apex_amp import ApexMixedPrecisionPlugin +from pytorch_lightning.plugins.precision.deepspeed import DeepSpeedPrecisionPlugin +from pytorch_lightning.plugins.precision.double import DoublePrecisionPlugin +from pytorch_lightning.plugins.precision.fsdp_native_native_amp import FullyShardedNativeNativeMixedPrecisionPlugin +from pytorch_lightning.plugins.precision.fully_sharded_native_amp import FullyShardedNativeMixedPrecisionPlugin +from pytorch_lightning.plugins.precision.hpu import HPUPrecisionPlugin +from pytorch_lightning.plugins.precision.ipu import IPUPrecisionPlugin +from pytorch_lightning.plugins.precision.mixed import MixedPrecisionPlugin +from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin +from pytorch_lightning.plugins.precision.precision_plugin import PrecisionPlugin +from pytorch_lightning.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin +from pytorch_lightning.plugins.precision.tpu import TPUPrecisionPlugin +from pytorch_lightning.plugins.precision.tpu_bf16 import TPUBf16PrecisionPlugin + +__all__ = [ + "ApexMixedPrecisionPlugin", + "DeepSpeedPrecisionPlugin", + "DoublePrecisionPlugin", + "FullyShardedNativeNativeMixedPrecisionPlugin", + "FullyShardedNativeMixedPrecisionPlugin", + "HPUPrecisionPlugin", + "IPUPrecisionPlugin", + "MixedPrecisionPlugin", + "NativeMixedPrecisionPlugin", + "PrecisionPlugin", + "ShardedNativeMixedPrecisionPlugin", + "TPUPrecisionPlugin", + "TPUBf16PrecisionPlugin", +] diff --git a/src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py b/src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py new file mode 100644 index 0000000000000..2201db94586a2 --- /dev/null +++ b/src/pytorch_lightning/plugins/precision/fsdp_native_native_amp.py @@ -0,0 +1,65 @@ +# Copyright The PyTorch Lightning team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any, Optional, Union + +import torch + +from pytorch_lightning.plugins.precision.native_amp import NativeMixedPrecisionPlugin +from pytorch_lightning.utilities.enums import PrecisionType +from pytorch_lightning.utilities.exceptions import MisconfigurationException +from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12 + +if _TORCH_GREATER_EQUAL_1_12: + from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision + from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler +else: + MixedPrecision = None # type: ignore[misc,assignment] + + +class FullyShardedNativeNativeMixedPrecisionPlugin(NativeMixedPrecisionPlugin): + """Native AMP for Fully Sharded Native Training.""" + + def __init__( + self, precision: Union[str, int], device: str, scaler: Optional[torch.cuda.amp.GradScaler] = None + ) -> None: + if not _TORCH_GREATER_EQUAL_1_12: + raise MisconfigurationException( + "`FullyShardedNativeNativeMixedPrecisionPlugin` is supported from PyTorch v1.12.0 onwards." + ) + super().__init__(precision, device, scaler=ShardedGradScaler() if scaler is None and precision == 16 else None) + + def clip_grad_by_norm(self, *_: Any, **__: Any) -> None: + # see https://pytorch.org/docs/stable/fsdp.html#torch.distributed.fsdp.FullyShardedDataParallel.clip_grad_norm_ + # section `Gradient Clipping`, using `torch.nn.utils.clip_grad_norm_` is incorrect + # for FSDP module. To overcome this, needs to call sharded_module.clip_grad_norm(clip_val) + # however we rely on LightningModule's configure_sharded_model to wrap FSDP, it would be hard to + # trace back the root FSDP. Now we only support clip by value. + raise MisconfigurationException( + f"`gradient_clip_algorithm='norm'` is currently not supported for `{self.__class__.__name__}`" + ) + + @property + def mixed_precision_config(self) -> Optional[MixedPrecision]: + assert MixedPrecision is not None + if self.precision == PrecisionType.HALF: + dtype = torch.float16 + elif self.precision == PrecisionType.BFLOAT: + dtype = torch.bfloat16 + else: + raise MisconfigurationException(f"Was unable to infer precision type, received {self.precision!r}.") + return MixedPrecision( + param_dtype=dtype, + reduce_dtype=dtype, + buffer_dtype=dtype, + ) diff --git a/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py b/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py index 60e53b880c84d..870e658bfc9c3 100644 --- a/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py +++ b/src/pytorch_lightning/plugins/precision/fully_sharded_native_amp.py @@ -11,19 +11,10 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Any, Optional - -import torch +from typing import Any from pytorch_lightning.plugins.precision.sharded_native_amp import ShardedNativeMixedPrecisionPlugin -from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException -from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12 - -if _TORCH_GREATER_EQUAL_1_12: - from torch.distributed.fsdp.fully_sharded_data_parallel import MixedPrecision -else: - MixedPrecision = None # type: ignore[misc,assignment] class FullyShardedNativeMixedPrecisionPlugin(ShardedNativeMixedPrecisionPlugin): @@ -38,18 +29,3 @@ def clip_grad_by_norm(self, *_: Any, **__: Any) -> None: raise MisconfigurationException( f"`gradient_clip_algorithm='norm'` is currently not supported for `{self.__class__.__name__}`" ) - - @property - def mixed_precision_config(self) -> Optional[MixedPrecision]: - assert MixedPrecision is not None - if self.precision == PrecisionType.HALF: - dtype = torch.float16 - elif self.precision == PrecisionType.BFLOAT: - dtype = torch.bfloat16 - else: - raise MisconfigurationException(f"Was unable to infer precision type, received {self.precision!r}.") - return MixedPrecision( - param_dtype=dtype, - reduce_dtype=dtype, - buffer_dtype=dtype, - ) diff --git a/src/pytorch_lightning/strategies/fully_sharded_native.py b/src/pytorch_lightning/strategies/fully_sharded_native.py index cf34aa9738f88..4dbf36e4c2861 100644 --- a/src/pytorch_lightning/strategies/fully_sharded_native.py +++ b/src/pytorch_lightning/strategies/fully_sharded_native.py @@ -23,7 +23,7 @@ from pytorch_lightning.plugins.environments.cluster_environment import ClusterEnvironment from pytorch_lightning.plugins.io.checkpoint_plugin import CheckpointIO from pytorch_lightning.plugins.precision import PrecisionPlugin -from pytorch_lightning.plugins.precision.fully_sharded_native_amp import FullyShardedNativeMixedPrecisionPlugin +from pytorch_lightning.plugins.precision.fsdp_native_native_amp import FullyShardedNativeNativeMixedPrecisionPlugin from pytorch_lightning.strategies.launchers.subprocess_script import _SubprocessScriptLauncher from pytorch_lightning.strategies.parallel import ParallelStrategy from pytorch_lightning.strategies.strategy import TBroadcast @@ -159,7 +159,7 @@ def mixed_precision_config(self) -> Optional[MixedPrecision]: if self.mixed_precision: return self.mixed_precision plugin = self.precision_plugin - if isinstance(plugin, FullyShardedNativeMixedPrecisionPlugin): + if isinstance(plugin, FullyShardedNativeNativeMixedPrecisionPlugin): return plugin.mixed_precision_config @property diff --git a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py index ccfdaa3185686..54636f6d617ac 100644 --- a/src/pytorch_lightning/trainer/connectors/accelerator_connector.py +++ b/src/pytorch_lightning/trainer/connectors/accelerator_connector.py @@ -53,6 +53,7 @@ TorchElasticEnvironment, ) from pytorch_lightning.plugins.layer_sync import LayerSync, NativeSyncBatchNorm +from pytorch_lightning.plugins.precision.fsdp_native_native_amp import FullyShardedNativeNativeMixedPrecisionPlugin from pytorch_lightning.strategies import ( DDPFullyShardedNativeStrategy, DDPFullyShardedStrategy, @@ -725,7 +726,9 @@ def _check_and_init_precision(self) -> PrecisionPlugin: if isinstance(self.strategy, (DDPShardedStrategy, DDPSpawnShardedStrategy)): return ShardedNativeMixedPrecisionPlugin(self._precision_flag, device) - if 
isinstance(self.strategy, (DDPFullyShardedStrategy, DDPFullyShardedNativeStrategy)): + if isinstance(self.strategy, DDPFullyShardedNativeStrategy): + return FullyShardedNativeNativeMixedPrecisionPlugin(self._precision_flag, device) + if isinstance(self.strategy, DDPFullyShardedStrategy): return FullyShardedNativeMixedPrecisionPlugin(self._precision_flag, device) return NativeMixedPrecisionPlugin(self._precision_flag, device) diff --git a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py index 74f9534c47ce3..ede201da1f68f 100644 --- a/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py +++ b/tests/tests_pytorch/strategies/test_ddp_fully_sharded_native.py @@ -7,7 +7,7 @@ from pytorch_lightning import Trainer from pytorch_lightning.callbacks import ModelCheckpoint from pytorch_lightning.demos.boring_classes import BoringModel -from pytorch_lightning.plugins.precision.fully_sharded_native_amp import FullyShardedNativeMixedPrecisionPlugin +from pytorch_lightning.plugins.precision.fsdp_native_native_amp import FullyShardedNativeNativeMixedPrecisionPlugin from pytorch_lightning.strategies import DDPFullyShardedNativeStrategy from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_12 @@ -35,7 +35,7 @@ def test_invalid_on_cpu(tmpdir): @RunIf(min_torch="1.12", min_cuda_gpus=1) @pytest.mark.parametrize("precision, expected", [(16, torch.float16), ("bf16", torch.bfloat16)]) def test_precision_plugin_config(precision, expected): - plugin = FullyShardedNativeMixedPrecisionPlugin(precision=precision, device="cuda") + plugin = FullyShardedNativeNativeMixedPrecisionPlugin(precision=precision, device="cuda") config = plugin.mixed_precision_config assert config.param_dtype == expected assert config.buffer_dtype == expected @@ -96,6 +96,7 @@ def on_predict_batch_end(self, outputs: Optional[Any], batch: Any, batch_idx: in def _assert_layer_fsdp_instance(self) -> None: assert isinstance(self.layer, FullyShardedDataParallel) + assert isinstance(self.trainer.strategy.precision_plugin, FullyShardedNativeNativeMixedPrecisionPlugin) assert isinstance(self.layer.module[0], FullyShardedDataParallel) assert isinstance(self.layer.module[2], FullyShardedDataParallel) # root should not be resharding From 562d22f0c8e3bb1ef28005ca280611e5f4f694e4 Mon Sep 17 00:00:00 2001 From: Mansy Date: Mon, 15 Aug 2022 15:31:16 +0200 Subject: [PATCH 171/230] Fix install latest version of app/component (#14181) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix install latest version of app/component * Add changelog * a better testcase * Update src/lightning_app/CHANGELOG.md Co-authored-by: mansy Co-authored-by: Adrian Wälchli --- src/lightning_app/CHANGELOG.md | 2 +- src/lightning_app/cli/cmd_install.py | 8 +++---- tests/tests_app/cli/test_cmd_install.py | 32 +++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 5 deletions(-) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 2aa5c7cdd837c..5a88748832aa3 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -45,7 +45,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Fixed -- +- Resolved a bug where the install command was not installing the latest version of an app/component by default ([#14181](https://github.com/Lightning-AI/lightning/pull/14181)) ## [0.5.5] - 2022-08-9 diff --git a/src/lightning_app/cli/cmd_install.py b/src/lightning_app/cli/cmd_install.py index f15567bd8470c..8f0e45145e59a 100644 --- a/src/lightning_app/cli/cmd_install.py +++ b/src/lightning_app/cli/cmd_install.py @@ -1,4 +1,3 @@ -import json import logging import os import re @@ -7,6 +6,7 @@ import sys import requests +from packaging.version import Version from lightning_app.core.constants import LIGHTNING_APPS_PUBLIC_REGISTRY, LIGHTNING_COMPONENT_PUBLIC_REGISTRY @@ -299,8 +299,8 @@ def _validate_name(name, resource_type, example): def _resolve_resource(registry_url, name, version_arg, resource_type): gallery_entries = [] try: - url = requests.get(registry_url) - data = json.loads(url.text) + response = requests.get(registry_url) + data = response.json() if resource_type == "app": gallery_entries = [a for a in data["apps"] if a["canDownloadSourceCode"]] @@ -328,7 +328,7 @@ def _resolve_resource(registry_url, name, version_arg, resource_type): entry = None if version_arg == "latest": - entry = entries[-1] + entry = max(entries, key=lambda app: Version(app["version"])) else: for e in entries: if e["version"] == version_arg: diff --git a/tests/tests_app/cli/test_cmd_install.py b/tests/tests_app/cli/test_cmd_install.py index 2d277ddb7790c..0139bbc9c5501 100644 --- a/tests/tests_app/cli/test_cmd_install.py +++ b/tests/tests_app/cli/test_cmd_install.py @@ -212,6 +212,38 @@ def test_version_arg_app(tmpdir): assert result.exit_code == 0 +@mock.patch("lightning_app.cli.cmd_install.subprocess", mock.MagicMock()) +@mock.patch("lightning_app.cli.cmd_install.os.chdir", mock.MagicMock()) +@mock.patch("lightning_app.cli.cmd_install._show_install_app_prompt") +def test_install_resolve_latest_version(mock_show_install_app_prompt, tmpdir): + + app_name = "lightning/invideo" + runner = CliRunner() + with mock.patch("lightning_app.cli.cmd_install.requests.get") as get_api_mock: + get_api_mock.return_value.json.return_value = { + "apps": [ + { + "canDownloadSourceCode": True, + "version": "0.0.2", + "name": "lightning/invideo", + }, + { + "canDownloadSourceCode": True, + "version": "0.0.4", + "name": "lightning/invideo", + }, + { + "canDownloadSourceCode": True, + "version": "0.0.5", + "name": "another_app", + }, + ] + } + runner.invoke(lightning_cli.install_app, [app_name, "--yes"]) # no version specified so latest is installed + assert mock_show_install_app_prompt.called + assert mock_show_install_app_prompt.call_args[0][0]["version"] == "0.0.4" + + def test_proper_url_parsing(): name = "lightning/invideo" From deaadc157b87b653f1c3aa7926a5ff092ab81863 Mon Sep 17 00:00:00 2001 From: Sherin Thomas Date: Mon, 15 Aug 2022 19:05:10 +0530 Subject: [PATCH 172/230] (app) Documentation fix for Work resources (#14182) --- docs/source-app/core_api/lightning_work/compute_content.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/source-app/core_api/lightning_work/compute_content.rst b/docs/source-app/core_api/lightning_work/compute_content.rst index 68853c949e12c..ac608574d9203 100644 --- a/docs/source-app/core_api/lightning_work/compute_content.rst +++ b/docs/source-app/core_api/lightning_work/compute_content.rst @@ -34,9 +34,9 @@ Here is the full list of supported machine names: - GPUs - Memory * - default - - 2 + - 1 - 0 - - 3 GB + - 4 GB * - cpu-small - 2 - 0 From 
2622989b108368813fdd3850b002fd1ad69988c2 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Mon, 15 Aug 2022 20:06:29 +0200 Subject: [PATCH 173/230] add more issues types (#14174) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add more issues types * Update .github/ISSUE_TEMPLATE/config.yml Co-authored-by: Mansy * typo Co-authored-by: Adrian Wälchli Co-authored-by: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Co-authored-by: Mansy Co-authored-by: Adrian Wälchli Co-authored-by: Laverne Henderson Co-authored-by: Akihiro Nitta --- .github/ISSUE_TEMPLATE/config.yml | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 0fe790310f247..f71844e9664fe 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,8 +1,14 @@ blank_issues_enabled: false contact_links: - - name: Ask a Question + - name: ❓ Ask a Question url: https://github.com/Lightning-AI/lightning/discussions/new - about: Ask and answer Lightning related questions - - name: 💬 Slack + about: Ask and answer Lightning related questions. + - name: 💬 Chat with us url: https://www.pytorchlightning.ai/community - about: Chat with our community + about: Live chat with experts, engineers, and users in our Slack community. + - name: 📖 Read the documentation + url: https://lightning.ai/lightning-docs/ + about: Please consult the documentation before opening any issues! + - name: 🙋 Contact us about professional services + url: https://lightning.ai + about: Contact the Lightning.ai sales team for paid support. From acd4805f1a284e513272d150de6f98f27a0489b3 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 16 Aug 2022 15:28:30 +0200 Subject: [PATCH 174/230] CI: clean building docs (#14216) * CI: clean building docs * group * . --- .github/checkgroup.yml | 2 ++ .github/workflows/docs-checks.yml | 7 ++++--- requirements/app/docs.txt | 17 ++++------------- requirements/docs.txt | 13 +++++++++++++ requirements/pytorch/docs.txt | 15 ++------------- 5 files changed, 25 insertions(+), 29 deletions(-) create mode 100644 requirements/docs.txt diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 0cb80d6e34bd8..a29deb705295e 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -64,6 +64,7 @@ subprojects: paths: - "docs/source-pytorch/**" - ".github/workflows/docs-*.yml" + - "requirements/docs.txt" - "requirements/pytorch/**" checks: - "doctest (pytorch)" @@ -133,6 +134,7 @@ subprojects: paths: - "docs/source-app/**" - ".github/workflows/docs-*.yml" + - "requirements/docs.txt" - "requirements/app/**" checks: - "doctest (app)" diff --git a/.github/workflows/docs-checks.yml b/.github/workflows/docs-checks.yml index 977118b644ef3..5b5a9aec778be 100644 --- a/.github/workflows/docs-checks.yml +++ b/.github/workflows/docs-checks.yml @@ -42,13 +42,13 @@ jobs: - name: Install dependencies env: FREEZE_REQUIREMENTS: 1 + PACKAGE_NAME: ${{ matrix.pkg }} run: | sudo apt-get update sudo apt-get install -y cmake pandoc pip --version - pip install -q fire # python -m pip install --upgrade --user pip - pip install -e . --quiet -r requirements/${{ matrix.pkg }}/base.txt -r requirements/${{ matrix.pkg }}/docs.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html + pip install -e . 
--quiet -r requirements/${{ matrix.pkg }}/docs.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html pip install -r requirements/${{ matrix.pkg }}/devel.txt pip list shell: bash @@ -91,11 +91,12 @@ jobs: - name: Install dependencies env: FREEZE_REQUIREMENTS: 1 + PACKAGE_NAME: ${{ matrix.pkg }} run: | sudo apt-get update sudo apt-get install -y cmake pandoc pip --version - pip install -e . --quiet -r requirements/${{ matrix.pkg }}/base.txt -r requirements/${{ matrix.pkg }}/docs.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html + pip install -e . --quiet -r requirements/${{ matrix.pkg }}/docs.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html # install Texlive, see https://linuxconfig.org/how-to-install-latex-on-ubuntu-20-04-focal-fossa-linux sudo apt-get update && sudo apt-get install -y texlive-latex-extra dvipng texlive-pictures pip list diff --git a/requirements/app/docs.txt b/requirements/app/docs.txt index bf22aef2c2d92..c189d6034ab28 100644 --- a/requirements/app/docs.txt +++ b/requirements/app/docs.txt @@ -1,17 +1,8 @@ -sphinx>=4.0,<5.0 -myst-parser>=0.15,<0.17 -nbsphinx>=0.8.5, <=0.8.9 +-r ../docs.txt + ipython[notebook] ipython_genutils -pandoc>=1.0, <=2.2 -docutils>=0.16, <0.19 -sphinxcontrib-fulltoc>=1.0, <=1.2.0 -sphinxcontrib-mockautodoc +pytorch-lightning -https://storage.googleapis.com/grid-packages/lightning-ai-sphinx-theme/build-31.3.zip -sphinx-autodoc-typehints>=1.0,<1.15 # v1.15 failing on master (#11405) -sphinx-paramlinks>=0.5.1, <=0.5.4 -sphinx-togglebutton>=0.2, <=0.3.2 -sphinx-copybutton>=0.3, <=0.5.0 sphinx-autobuild -jinja2>=3.0.0,<3.1.0 +https://storage.googleapis.com/grid-packages/lightning-ai-sphinx-theme/build-31.3.zip diff --git a/requirements/docs.txt b/requirements/docs.txt new file mode 100644 index 0000000000000..1b00471602c60 --- /dev/null +++ b/requirements/docs.txt @@ -0,0 +1,13 @@ +sphinx>=4.0, <5.0 +myst-parser>=0.15, <0.17 +nbsphinx>=0.8.5, <=0.8.9 +pandoc>=1.0, <=2.2 +docutils>=0.16, <0.19 +sphinxcontrib-fulltoc>=1.0, <=1.2.0 +sphinxcontrib-mockautodoc +sphinx-autodoc-typehints>=1.11, <1.15 # strict; v1.15 failing on master (#11405) +sphinx-paramlinks>=0.5.1, <=0.5.4 +sphinx-togglebutton>=0.2, <=0.3.2 +sphinx-copybutton>=0.3, <=0.5.0 +sphinx-multiproject +jinja2>=3.0.0,<3.1.0 diff --git a/requirements/pytorch/docs.txt b/requirements/pytorch/docs.txt index 50e7c2049f6f6..474620b1e74b8 100644 --- a/requirements/pytorch/docs.txt +++ b/requirements/pytorch/docs.txt @@ -1,17 +1,6 @@ -sphinx>=4.0,<5.0 -myst-parser>=0.15,<0.17 -nbsphinx>=0.8.5, <=0.8.9 +-r ../docs.txt + ipython[notebook] -pandoc>=1.0, <=2.2 -docutils>=0.16, <0.19 -sphinxcontrib-fulltoc>=1.0, <=1.2.0 -sphinxcontrib-mockautodoc pt-lightning-sphinx-theme @ https://github.com/Lightning-AI/lightning_sphinx_theme/archive/master.zip -sphinx-autodoc-typehints>=1.11,<1.15 # strict; v1.15 failing on master (#11405) -sphinx-paramlinks>=0.5.1, <=0.5.4 -sphinx-togglebutton>=0.2, <=0.3.2 -sphinx-copybutton>=0.3, <=0.5.0 -typing-extensions # already in `requirements.txt` but the docs CI job does not install it -jinja2>=3.0.0,<3.1.0 -r ../../_notebooks/.actions/requirements.txt From fcf9b42df9549ae5c4f6c46c07679649bd58c90d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 17 Aug 2022 16:15:23 +0200 Subject: [PATCH 175/230] Revert "Remove skipping logic in favor of path filtering (#14170)" (#14244) --- .azure/gpu-tests.yml | 60 +++++++++++---------- .github/file-filters.yml | 9 ++++ 
.github/workflows/ci-app-cloud-e2e-test.yml | 28 ++++++++-- .github/workflows/ci-app-examples.yml | 7 --- .github/workflows/ci-app-tests.yml | 4 +- .github/workflows/ci-pytorch-test-conda.yml | 35 +++++++++--- .github/workflows/ci-pytorch-test-full.yml | 56 ++++++++++++++----- .github/workflows/ci-pytorch-test-slow.yml | 40 ++++++++++---- 8 files changed, 164 insertions(+), 75 deletions(-) create mode 100644 .github/file-filters.yml diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index 8444468c0c58a..683212cd55d4b 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -12,31 +12,15 @@ trigger: - "master" - "release/*" - "refs/tags/*" - paths: - include: - - ".azure/**" - - "examples/run_ddp_examples.sh" - - "examples/convert_from_pt_to_pl/**" - - "examples/run_pl_examples.sh" - - "examples/pl_basics/backbone_image_classifier.py" - - "examples/pl_basics/autoencoder.py" - - "examples/pl_loops/mnist_lite.py" - - "examples/pl_fault_tolerant/automatic.py" - - "examples/test_pl_examples.py" - - "examples/pl_integrations/dali_image_classifier.py" - - "requirements/pytorch/**" - - "src/pytorch_lightning/**" - - "tests/tests_pytorch/**" - - "setup.cfg" - - "pyproject.toml" - - ".github/workflows/ci-pytorch*.yml" - - ".github/workflows/docs-*.yml" - pr: - "master" - "release/*" +variables: + - name: continue + value: '1' + jobs: - job: testing strategy: @@ -57,6 +41,22 @@ jobs: clean: all steps: + + - bash: | + CHANGED_FILES=$(git diff --name-status origin/master -- . | awk '{print $2}') + FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' + echo $CHANGED_FILES > changed_files.txt + MATCHES=$(cat changed_files.txt | grep -E $FILTER) + echo $MATCHES + if [ -z "$MATCHES" ]; then + echo "Skip" + echo "##vso[task.setvariable variable=continue]0" + else + echo "Continue" + echo "##vso[task.setvariable variable=continue]1" + fi + displayName: Skipper + - bash: | lspci | egrep 'VGA|3D' whereis nvidia @@ -66,6 +66,7 @@ jobs: pip --version pip list displayName: 'Image info & NVIDIA' + condition: eq(variables['continue'], '1') - bash: | set -e @@ -81,6 +82,7 @@ jobs: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 displayName: 'Install dependencies' + condition: eq(variables['continue'], '1') - bash: | set -e @@ -89,13 +91,16 @@ jobs: python requirements/pytorch/check-avail-strategies.py python requirements/pytorch/check-avail-extras.py displayName: 'Env details' + condition: eq(variables['continue'], '1') - bash: bash .actions/pull_legacy_checkpoints.sh displayName: 'Get legacy checkpoints' + condition: eq(variables['continue'], '1') - bash: python -m coverage run --source pytorch_lightning -m pytest workingDirectory: src/pytorch_lightning displayName: 'Testing: PyTorch doctests' + condition: eq(variables['continue'], '1') - bash: python -m coverage run --source pytorch_lightning -m pytest --ignore benchmarks -v --junitxml=$(Build.StagingDirectory)/test-results.xml --durations=50 env: @@ -103,6 +108,7 @@ jobs: workingDirectory: tests/tests_pytorch displayName: 'Testing: PyTorch standard' timeoutInMinutes: "35" + condition: eq(variables['continue'], '1') - bash: bash run_standalone_tests.sh workingDirectory: tests/tests_pytorch @@ -111,14 +117,7 @@ jobs: PL_RUN_CUDA_TESTS: "1" displayName: 'Testing: PyTorch standalone tests' timeoutInMinutes: "35" - - - bash: bash run_standalone_tasks.sh - workingDirectory: tests/tests_pytorch - env: - PL_USE_MOCKED_MNIST: "1" - PL_RUN_CUDA_TESTS: "1" - displayName: 'Testing: PyTorch standalone tasks' - timeoutInMinutes: "10" 
+ condition: eq(variables['continue'], '1') - bash: | python -m coverage report @@ -128,13 +127,14 @@ jobs: ls -l workingDirectory: tests/tests_pytorch displayName: 'Statistics' + condition: eq(variables['continue'], '1') - task: PublishTestResults@2 displayName: 'Publish test results' inputs: testResultsFiles: '$(Build.StagingDirectory)/test-results.xml' testRunTitle: '$(Agent.OS) - $(Build.DefinitionName) - Python $(python.version)' - condition: succeededOrFailed() + condition: and(succeededOrFailed(), eq(variables['continue'], '1')) - script: | set -e @@ -146,9 +146,11 @@ jobs: env: PL_USE_MOCKED_MNIST: "1" displayName: 'Testing: PyTorch examples' + condition: eq(variables['continue'], '1') - bash: python -m pytest benchmarks -v --maxfail=2 --durations=0 workingDirectory: tests/tests_pytorch env: PL_RUN_CUDA_TESTS: "1" displayName: 'Testing: PyTorch benchmarks' + condition: eq(variables['continue'], '1') diff --git a/.github/file-filters.yml b/.github/file-filters.yml new file mode 100644 index 0000000000000..e621cd83881e4 --- /dev/null +++ b/.github/file-filters.yml @@ -0,0 +1,9 @@ +# This file contains filters to be used in the CI to detect file changes and run the required CI jobs. + +app_examples: + - "src/lightning_app/**" + - "tests/tests_app_examples/**" + - "requirements/app/**" + - "examples/app_*" + - "setup.py" + - "src/pytorch_lightning/__version__.py" diff --git a/.github/workflows/ci-app-cloud-e2e-test.yml b/.github/workflows/ci-app-cloud-e2e-test.yml index 707c506e89e5a..b2281389e7358 100644 --- a/.github/workflows/ci-app-cloud-e2e-test.yml +++ b/.github/workflows/ci-app-cloud-e2e-test.yml @@ -7,19 +7,37 @@ on: # Trigger the workflow on push or pull request, but only for the master bran branches: [master, "release/*"] pull_request: branches: [master, "release/*"] - paths: - - ".github/workflows/ci-app-cloud-e2e-test.yml" - - "requirements/app/**" - - "src/lightning_app/**" - - "examples/app_*" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} cancel-in-progress: ${{ github.ref != 'refs/heads/master' }} jobs: + # This is job should once only once per PR to detect file changes so run required jobs. + # see .github/file-filters.yml to define file filters and run the jobs based on the output of each filter. 
+ # More info: https://github.com/marketplace/actions/paths-changes-filter + + changes: + runs-on: ubuntu-latest + # Set job outputs to the values from filter step + outputs: + app_examples: ${{ steps.filter.outputs.app_examples }} + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.8 + uses: actions/setup-python@v2 + with: + python-version: "3.8" + + - uses: dorny/paths-filter@v2 + id: filter + with: + filters: .github/file-filters.yml + cloud-test: name: Cloud Test + needs: changes + if: ${{ needs.changes.outputs.app_examples == 'true' }} runs-on: ubuntu-20.04 strategy: fail-fast: false diff --git a/.github/workflows/ci-app-examples.yml b/.github/workflows/ci-app-examples.yml index 8114f59b01aaa..01570f59c2c77 100644 --- a/.github/workflows/ci-app-examples.yml +++ b/.github/workflows/ci-app-examples.yml @@ -6,13 +6,6 @@ on: # Trigger the workflow on push or pull request, but only for the master bran branches: [master, "release/*"] pull_request: branches: [master, "release/*"] - paths: - - ".github/workflows/ci-app-examples.yml" - - "requirements/app/**" - - "src/lightning_app/**" - - "tests/tests_app_examples/**" - # the examples are used in the app CI - - "examples/app_*" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} diff --git a/.github/workflows/ci-app-tests.yml b/.github/workflows/ci-app-tests.yml index fb2cdbda69079..fe3cc36dc16d3 100644 --- a/.github/workflows/ci-app-tests.yml +++ b/.github/workflows/ci-app-tests.yml @@ -6,10 +6,10 @@ on: # Trigger the workflow on push or pull request, but only for the master bran branches: [master, "release/*"] pull_request: paths: - - ".github/workflows/ci-app-tests.yml" - - "requirements/app/**" - "src/lightning_app/**" - "tests/tests_app/**" + - "requirements/app/**" + - "setup.py" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} diff --git a/.github/workflows/ci-pytorch-test-conda.yml b/.github/workflows/ci-pytorch-test-conda.yml index d314a742bbdcb..3498f087ef0aa 100644 --- a/.github/workflows/ci-pytorch-test-conda.yml +++ b/.github/workflows/ci-pytorch-test-conda.yml @@ -6,12 +6,6 @@ on: # Trigger the workflow on push or pull request, but only for the master bra branches: [master, "release/*"] pull_request: branches: [master, "release/*"] - paths: - - "requirements/pytorch/**" - - "src/pytorch_lightning/**" - - "tests/tests_pytorch/**" - - "setup.cfg" # includes pytest config - - ".github/workflows/ci-pytorch-test-conda.yml" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} @@ -41,7 +35,28 @@ jobs: - uses: actions/checkout@v2 + - name: Get changed files + id: changed-files + uses: tj-actions/changed-files@v23.1 + + - name: Decide if the test should be skipped + id: skip + shell: bash -l {0} + run: | + FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' + echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt + MATCHES=$(cat changed_files.txt | grep -E $FILTER) + echo $MATCHES + if [ -z "$MATCHES" ]; then + echo "Skip" + echo "::set-output name=continue::0" + else + echo "Continue" + echo "::set-output name=continue::1" + fi + - name: Update base dependencies + if: ${{ (steps.skip.outputs.continue == '1') }} env: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 @@ -55,10 +70,12 @@ jobs: run: pip install "Pillow<9.0" # It messes with torchvision - name: DocTests + if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: ./src run: pytest 
pytorch_lightning --cov=pytorch_lightning - name: Update all dependencies + if: ${{ (steps.skip.outputs.continue == '1') }} env: HOROVOD_BUILD_ARCH_FLAGS: "-mfma" HOROVOD_WITHOUT_MXNET: 1 @@ -78,9 +95,11 @@ jobs: python requirements/pytorch/check-avail-extras.py - name: Pull legacy checkpoints + if: ${{ (steps.skip.outputs.continue == '1') }} run: bash .actions/pull_legacy_checkpoints.sh - name: Testing PyTorch + if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: tests/tests_pytorch run: coverage run --source pytorch_lightning -m pytest -v --timeout 150 --durations=50 --junitxml=results-${{ runner.os }}-torch${{ matrix.pytorch-version }}.xml @@ -92,7 +111,7 @@ jobs: if: failure() - name: Statistics - if: success() + if: ${{ success() && (steps.skip.outputs.continue == '1') }} working-directory: tests/tests_pytorch run: | coverage report @@ -100,7 +119,7 @@ jobs: - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 - if: success() + if: ${{ success() && (steps.skip.outputs.continue == '1') }} # see: https://github.com/actions/toolkit/issues/399 continue-on-error: true with: diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 386bb012b8cc6..173e2a44a61f4 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -7,12 +7,6 @@ on: # Trigger the workflow on push or pull request, but only for the master bra pull_request: branches: [master, "release/*"] types: [opened, reopened, ready_for_review, synchronize] - paths: - - "requirements/pytorch/**" - - "src/pytorch_lightning/**" - - "tests/tests_pytorch/**" - - "setup.cfg" # includes pytest config - - ".github/workflows/ci-pytorch-test-full.yml" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} @@ -43,42 +37,67 @@ jobs: steps: - uses: actions/checkout@v2 + - name: Get changed files + id: changed-files + uses: tj-actions/changed-files@v23.1 + + - name: Decide if the test should be skipped + id: skip + shell: bash -l {0} + run: | + FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' + echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt + MATCHES=$(cat changed_files.txt | grep -E $FILTER) + echo $MATCHES + if [ -z "$MATCHES" ]; then + echo "Skip" + echo "::set-output name=continue::0" + else + echo "Continue" + echo "::set-output name=continue::1" + fi + - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + if: ${{ (steps.skip.outputs.continue == '1') }} + uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Reset caching + if: ${{ (steps.skip.outputs.continue == '1') }} run: python -c "import time; days = time.time() / 60 / 60 / 24; print(f'TIME_PERIOD=d{int(days / 2) * 2}')" >> $GITHUB_ENV - name: basic setup + if: ${{ (steps.skip.outputs.continue == '1') }} run: | pip --version pip install -q fire # Github Actions: Run step on specific OS: https://stackoverflow.com/a/57948488/4521646 - name: Setup macOS - if: ${{ (runner.os == 'macOS') }} + if: ${{ (runner.os == 'macOS') && (steps.skip.outputs.continue == '1') }} run: | brew install openmpi libuv # Horovod on macOS requires OpenMPI, Gloo not currently supported - name: Setup Windows - if: ${{ (runner.os == 'windows') }} + if: ${{ (runner.os == 'windows') && (steps.skip.outputs.continue == '1') }} run: | python .actions/assistant.py requirements_prune_pkgs horovod - name: Set min. 
dependencies - if: ${{ (matrix.requires == 'oldest') }} + if: ${{ (matrix.requires == 'oldest') && (steps.skip.outputs.continue == '1') }} run: | python .actions/assistant.py replace_oldest_ver # Note: This uses an internal pip API and may not always work # https://github.com/actions/cache/blob/master/examples.md#multiple-oss-in-a-workflow - name: Get pip cache dir + if: ${{ (steps.skip.outputs.continue == '1') }} id: pip-cache run: echo "::set-output name=dir::$(pip cache dir)" - name: pip cache + if: ${{ (steps.skip.outputs.continue == '1') }} uses: actions/cache@v3 with: path: ${{ steps.pip-cache.outputs.dir }} @@ -87,9 +106,11 @@ jobs: ${{ runner.os }}-pip-td${{ env.TIME_PERIOD }}-py${{ matrix.python-version }}-${{ matrix.release }}-${{ matrix.requires }}- - name: Pull legacy checkpoints + if: ${{ (steps.skip.outputs.continue == '1') }} run: bash .actions/pull_legacy_checkpoints.sh - name: Install dependencies + if: ${{ (steps.skip.outputs.continue == '1') }} env: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 @@ -101,10 +122,12 @@ jobs: shell: bash - name: DocTests + if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: ./src run: pytest pytorch_lightning --cov=pytorch_lightning - name: Install extra dependencies + if: ${{ (steps.skip.outputs.continue == '1') }} run: | # adjust versions according installed Torch version python ./requirements/pytorch/adjust-versions.py requirements/pytorch/extra.txt @@ -113,7 +136,7 @@ jobs: shell: bash - name: Reinstall Horovod if necessary - if: ${{ (runner.os != 'windows') }} + if: ${{ (runner.os != 'windows') && (steps.skip.outputs.continue == '1') }} env: HOROVOD_BUILD_ARCH_FLAGS: "-mfma" HOROVOD_WITHOUT_MXNET: 1 @@ -130,38 +153,43 @@ jobs: shell: bash - name: Cache datasets + if: ${{ (steps.skip.outputs.continue == '1') }} uses: actions/cache@v3 with: path: Datasets key: pl-dataset - name: Sanity check + if: ${{ (steps.skip.outputs.continue == '1') }} run: python requirements/pytorch/check-avail-extras.py - name: Testing PyTorch + if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: tests/tests_pytorch # NOTE: do not include coverage report here, see: https://github.com/nedbat/coveragepy/issues/1003 run: coverage run --source pytorch_lightning -m pytest -v --durations=50 --junitxml=results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml - name: Upload pytest results - if: failure() + if: ${{ (failure()) && (steps.skip.outputs.continue == '1') }} uses: actions/upload-artifact@v3 with: name: unittest-results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }} path: tests/tests_pytorch/results-${{ runner.os }}-py${{ matrix.python-version }}-${{ matrix.requires }}-${{ matrix.release }}.xml - name: Prepare Examples + if: ${{ (steps.skip.outputs.continue == '1') }} run: | # adjust versions according installed Torch version python ./requirements/pytorch/adjust-versions.py requirements/pytorch/examples.txt pip install -r requirements/pytorch/examples.txt --find-links https://download.pytorch.org/whl/cpu/torch_stable.html --upgrade - name: Run Examples + if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: ./examples run: python -m pytest test_pl_examples.py -v --durations=10 - name: Statistics - if: success() + if: ${{ (success()) && (steps.skip.outputs.continue == '1') }} working-directory: tests/tests_pytorch run: | coverage report @@ -169,7 +197,7 @@ jobs: - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 - 
if: always() + if: ${{ (always()) && (steps.skip.outputs.continue == '1') }} # see: https://github.com/actions/toolkit/issues/399 continue-on-error: true with: diff --git a/.github/workflows/ci-pytorch-test-slow.yml b/.github/workflows/ci-pytorch-test-slow.yml index 8e97ea90b2bc4..0bb9916ee302a 100644 --- a/.github/workflows/ci-pytorch-test-slow.yml +++ b/.github/workflows/ci-pytorch-test-slow.yml @@ -7,12 +7,6 @@ on: # Trigger the workflow on push or pull request, but only for the master bra pull_request: branches: [master, "release/*"] types: [opened, reopened, ready_for_review, synchronize] - paths: - - "requirements/pytorch/**" - - "src/pytorch_lightning/**" - - "tests/tests_pytorch/**" - - "setup.cfg" # includes pytest config - - ".github/workflows/ci-pytorch-test-slow.yml" concurrency: group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }} @@ -34,19 +28,43 @@ jobs: steps: - uses: actions/checkout@v2 - - uses: actions/setup-python@v4 + - name: Get changed files + id: changed-files + uses: tj-actions/changed-files@v23.1 + + - name: Decide if the test should be skipped + id: skip + shell: bash -l {0} + run: | + FILTER='src/pytorch_lightning|requirements/pytorch|tests/tests_pytorch|examples/pl_*' + echo "${{ steps.changed-files.outputs.all_changed_files }}" | tr " " "\n" > changed_files.txt + MATCHES=$(cat changed_files.txt | grep -E $FILTER) + echo $MATCHES + if [ -z "$MATCHES" ]; then + echo "Skip" + echo "::set-output name=continue::0" + else + echo "Continue" + echo "::set-output name=continue::1" + fi + + - uses: actions/setup-python@v2 + if: ${{ (steps.skip.outputs.continue == '1') }} with: python-version: ${{ matrix.python-version }} - name: Reset caching + if: ${{ (steps.skip.outputs.continue == '1') }} run: python -c "import time; days = time.time() / 60 / 60 / 24; print(f'TIME_PERIOD=d{int(days / 2) * 2}')" >> $GITHUB_ENV - name: Get pip cache + if: ${{ (steps.skip.outputs.continue == '1') }} id: pip-cache run: | python -c "from pip._internal.locations import USER_CACHE_DIR; print('::set-output name=dir::' + USER_CACHE_DIR)" - name: Cache pip + if: ${{ (steps.skip.outputs.continue == '1') }} uses: actions/cache@v3 with: path: ${{ steps.pip-cache.outputs.dir }} @@ -55,6 +73,7 @@ jobs: ${{ runner.os }}-pip-td${{ env.TIME_PERIOD }}-py${{ matrix.python-version }}- - name: Install dependencies + if: ${{ (steps.skip.outputs.continue == '1') }} env: PACKAGE_NAME: pytorch FREEZE_REQUIREMENTS: 1 @@ -66,20 +85,21 @@ jobs: shell: bash - name: Testing PyTorch + if: ${{ (steps.skip.outputs.continue == '1') }} working-directory: tests/tests_pytorch run: coverage run --source pytorch_lightning -m pytest -v --junitxml=results-${{ runner.os }}-py${{ matrix.python-version }}.xml env: PL_RUN_SLOW_TESTS: 1 - name: Upload pytest test results - if: failure() + if: ${{ (failure()) && (steps.skip.outputs.continue == '1') }} uses: actions/upload-artifact@v3 with: name: unittest-results-${{ runner.os }}-py${{ matrix.python-version }} path: tests/tests_pytorch/results-${{ runner.os }}-py${{ matrix.python-version }}.xml - name: Statistics - if: success() + if: ${{ (success()) && (steps.skip.outputs.continue == '1') }} working-directory: tests/tests_pytorch run: | coverage report @@ -87,7 +107,7 @@ jobs: - name: Upload coverage to Codecov uses: codecov/codecov-action@v3 - if: success() + if: ${{ (success()) && (steps.skip.outputs.continue == '1') }} # see: https://github.com/actions/toolkit/issues/399 continue-on-error: true with: From 61132837f221d0e3247eda6bad714444f3b867ee Mon Sep 17 
00:00:00 2001 From: Laverne Henderson Date: Wed, 17 Aug 2022 07:15:41 -0700 Subject: [PATCH 176/230] Docs BYOC content (#13976) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * BYOC content Content for the upcoming BYOC feature * First DRAFT of BYOC content * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update docs/source-app/index.rst Co-authored-by: thomas chaton * Update docs/source-app/workflows/byoc/index.rst Co-authored-by: thomas chaton * Update docs/source-app/workflows/byoc/index.rst Co-authored-by: thomas chaton * Updates based on feedback * Updates based on feedback * Update external ID with note * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update docs/source-app/workflows/byoc/index.rst Co-authored-by: Raphael Randschau * Update docs/source-app/workflows/byoc/index.rst Co-authored-by: Raphael Randschau * Update docs/source-app/workflows/byoc/index.rst Co-authored-by: Raphael Randschau * Update docs/source-app/workflows/byoc/index.rst Co-authored-by: Raphael Randschau * Update docs/source-app/workflows/byoc/index.rst Co-authored-by: Raphael Randschau * Updates for terraform mod Updates for terraform mod and arg pram split * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update docs/source-app/workflows/byoc/index.rst Co-authored-by: Adrian Wälchli * Update docs/source-app/workflows/byoc/index.rst Co-authored-by: Adrian Wälchli * Update index.rst * Update docs/source-app/workflows/byoc/index.rst Co-authored-by: Adrian Wälchli * Update docs/source-app/workflows/byoc/index.rst Co-authored-by: Adrian Wälchli * Update docs/source-app/workflows/byoc/index.rst Co-authored-by: Adrian Wälchli * Update content with table Changed bullets into table based on feedback * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: thomas chaton Co-authored-by: Raphael Randschau Co-authored-by: Adrian Wälchli --- docs/source-app/index.rst | 1 + docs/source-app/workflows/byoc/index.rst | 115 +++++++++++++++++++++++ 2 files changed, 116 insertions(+) create mode 100644 docs/source-app/workflows/byoc/index.rst diff --git a/docs/source-app/index.rst b/docs/source-app/index.rst index 239288004c2a0..10a65db660d7f 100644 --- a/docs/source-app/index.rst +++ b/docs/source-app/index.rst @@ -227,6 +227,7 @@ Keep Learning Publish a Lightning component Run a server within a Lightning App Run an App on the cloud + Run Apps on your cloud account (BYOC) Run work in parallel Share an app Share files between components diff --git a/docs/source-app/workflows/byoc/index.rst b/docs/source-app/workflows/byoc/index.rst new file mode 100644 index 0000000000000..ba15112bac689 --- /dev/null +++ b/docs/source-app/workflows/byoc/index.rst @@ -0,0 +1,115 @@ + +################################# +Run Apps on your own cloud (BYOC) +################################# + +**Audience:** Users looking to run Lightning Apps on their own private cloud. + +---- + +******************* +A bit of background +******************* + +BYOC - Bring Your Own Cloud, is an alternate deployment model to Lightning Cloud (fully managed SaaS). +BYOC separates the control and data plane. 
The data plane, which includes
+Lightning clusters, services, and Lightning Apps, resides inside the user’s VPC.
+The control plane resides on Lightning Cloud.
+
+Setup begins with configuring a cloud provider (today AWS, but more are coming soon) with your personal credentials for
+delegated access and an identity provider for secure access to the data plane.
+
+Next, as part of the environment creation process, you can configure networking,
+security, and select among cluster configuration options based on your own use cases.
+
+After you submit a cluster creation request, the Lightning Control Plane creates the required cloud infrastructure on your account. This
+sets up a new Lightning Cluster along with a Lightning Kubernetes Operator.
+
+
+*******************************
+Create a Lightning BYOC cluster
+*******************************
+
+You must have your cloud configured before you try to create a BYOC cluster.
+
+And to make your life a little easier, we've made a `Terraform module to help with that `_.
+
+Create a Lightning BYOC cluster using the following command:
+
+.. code:: bash
+
+   lightning create cluster
+
+Here's an example:
+
+.. code:: bash
+
+   lightning create cluster my-byoc-cluster --provider aws --role-arn arn:aws:iam::1234567890:role/lai-byoc --external-id dummy --region us-west-2 --instance-types t3.xlarge --enable-performance
+
+.. note:: Cluster creation is going to take an hour or more after you run this command.
+
+**Arguments**
+
+* cluster_name: The name of the cluster to be created
+
+.. note:: Cluster names can only contain lowercase letters, numbers, and periodic hyphens ( - ).
+
+**Parameters:**
+
++------------------------+----------------------------------------------------------------------------------------------------+
+|Parameter | Description |
++========================+====================================================================================================+
+| provider | The cloud provider where your cluster is located. |
+| | |
+| | AWS is supported today, but support for other cloud providers is coming soon. |
++------------------------+----------------------------------------------------------------------------------------------------+
+| role-arn | AWS IAM Role ARN used to provision resources |
++------------------------+----------------------------------------------------------------------------------------------------+
+| external-id | AWS IAM Role external ID |
+| | |
+| | To read more on what the AWS external ID is and why it's useful, go |
+| | `here `_|
++------------------------+----------------------------------------------------------------------------------------------------+
+| region | AWS region containing compute resources |
++------------------------+----------------------------------------------------------------------------------------------------+
+| instance-types | Instance types that you want to support, for compute jobs within the cluster. |
+| | |
+| | For now, this is the AWS instance types supported by the cluster. |
++------------------------+----------------------------------------------------------------------------------------------------+
+| enable-performance | Specifies if the cluster uses cost savings mode. |
+| | |
+| | In cost saving mode the number of compute nodes is reduced to one, reducing the cost for clusters |
+| | with low utilization. 
| ++------------------------+----------------------------------------------------------------------------------------------------+ +| edit-before-creation | Enables interactive editing of requests before submitting it to Lightning AI. | ++------------------------+----------------------------------------------------------------------------------------------------+ +| wait | Waits for the cluster to be in a RUNNING state. Only use this for debugging. | ++------------------------+----------------------------------------------------------------------------------------------------+ + +---- + +******************************************* +View a list of your Lightning BYOC clusters +******************************************* + +.. code:: bash + + lightning list clusters + +---- + +******************************* +Delete a Lightning BYOC cluster +******************************* + +Deletes a Lightning BYOC cluster. Lightning AI removes cluster artifacts and any resources running on the cluster. + +.. warning:: Using the --force parameter when deleting a cluster does not clean up any resources managed by Lightning AI. Check your cloud provider to verify that existing cloud resources are deleted. + +Deletion permanently removes not only the record of all runs on a cluster, but all associated artifacts, metrics, logs, etc. + +.. warning:: This process may take a few minutes to complete, but once started it CANNOT be rolled back. Deletion permanently removes not only the BYOC cluster from being managed by Lightning AI, but tears down every BYOC resource Lightning AI managed (for that cluster id) in the host cloud. All object stores, container registries, logs, compute nodes, volumes, etc. are deleted and cannot be recovered. + +.. code:: bash + + lightning delete cluster From 79fc6afa894d8b597a18db1bfb3443d11ba80c22 Mon Sep 17 00:00:00 2001 From: Adam Bobowski <100693297+adam-lightning@users.noreply.github.com> Date: Wed, 17 Aug 2022 16:15:52 +0200 Subject: [PATCH 177/230] [App] Moved app.py to root dir for `lightning init app ` template (#13853) * Moved app.py to main app directory * updated docs * updated changelog * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: thomas chaton Co-authored-by: Jirka Borovec --- .../workflows/build_lightning_app/from_scratch_content.rst | 4 ++-- src/lightning_app/CHANGELOG.md | 3 +++ .../cli/app-template/{placeholdername => }/app.py | 0 src/lightning_app/cli/cmd_init.py | 4 ++-- 4 files changed, 7 insertions(+), 4 deletions(-) rename src/lightning_app/cli/app-template/{placeholdername => }/app.py (100%) diff --git a/docs/source-app/workflows/build_lightning_app/from_scratch_content.rst b/docs/source-app/workflows/build_lightning_app/from_scratch_content.rst index 91e7fea93e28c..d90d8662dd430 100644 --- a/docs/source-app/workflows/build_lightning_app/from_scratch_content.rst +++ b/docs/source-app/workflows/build_lightning_app/from_scratch_content.rst @@ -87,10 +87,10 @@ You'll see a print-out like this: /Users/Your/Current/dir/your-app-name run your app with: - lightning run app your-app-name/your_app_name/app.py + lightning run app your-app-name/app.py run it on the cloud to share with your collaborators: - lightning run app your-app-name/your_app_name/app.py --cloud + lightning run app your-app-name/app.py --cloud ---- diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 
5a88748832aa3..810cdc51cce5e 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -48,6 +48,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Resolved a bug where the install command was not installing the latest version of an app/component by default ([#14181](https://github.com/Lightning-AI/lightning/pull/14181)) +- Unification of app template: moved `app.py` to root dir for `lightning init app ` template ([#13853](https://github.com/Lightning-AI/lightning/pull/13853)) + + ## [0.5.5] - 2022-08-9 ### Deprecated diff --git a/src/lightning_app/cli/app-template/placeholdername/app.py b/src/lightning_app/cli/app-template/app.py similarity index 100% rename from src/lightning_app/cli/app-template/placeholdername/app.py rename to src/lightning_app/cli/app-template/app.py diff --git a/src/lightning_app/cli/cmd_init.py b/src/lightning_app/cli/cmd_init.py index 4e239da87c736..565cbd0dec03f 100644 --- a/src/lightning_app/cli/cmd_init.py +++ b/src/lightning_app/cli/cmd_init.py @@ -19,10 +19,10 @@ def app(app_name): {new_resource_name} run your app with: - lightning run app {app_name}/{name_for_files}/app.py + lightning run app {app_name}/app.py run it on the cloud to share with your collaborators: - lightning run app {app_name}/{name_for_files}/app.py --cloud + lightning run app {app_name}/app.py --cloud """ logger.info(m) From 60933c007c2e9f032c9c1b3a33a0d2cd9e7d8c1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kacper=20=C5=81ukawski?= Date: Wed, 17 Aug 2022 16:16:25 +0200 Subject: [PATCH 178/230] Add 'app' parameter into the command example (#14055) Co-authored-by: Jirka Borovec --- src/lightning_app/cli/cmd_init.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning_app/cli/cmd_init.py b/src/lightning_app/cli/cmd_init.py index 565cbd0dec03f..a7127cd6eb205 100644 --- a/src/lightning_app/cli/cmd_init.py +++ b/src/lightning_app/cli/cmd_init.py @@ -138,7 +138,7 @@ def run(self): app = la.LightningApp(LitApp()) ⚡ Checkout the demo app with your {component_name} component: ⚡ - lightning run {component_name}/app.py + lightning run app {component_name}/app.py ⚡ Tip: Publish your component to the Lightning Gallery to enable users to install it like so: lightning install component YourLightningUserName/{component_name} From 7c329dd3aca2f0cdc8a2ed32f609e0c5e3ac1ba1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 17 Aug 2022 16:17:11 +0200 Subject: [PATCH 179/230] Warn when http URLs are configured (#14233) * add a warning * add test * add test * add changelog * remove todo * clarify http won't work in cloud * Apply suggestions from code review Co-authored-by: Sherin Thomas Co-authored-by: Jirka Borovec Co-authored-by: Sherin Thomas --- src/lightning_app/CHANGELOG.md | 8 +++++++- src/lightning_app/utilities/cloud.py | 6 ++++++ src/lightning_app/utilities/layout.py | 14 +++++++++++--- .../lightning_app/test_configure_layout.py | 18 ++++++++++++++++++ tests/tests_app/utilities/test_cloud.py | 14 ++++++++++++++ 5 files changed, 56 insertions(+), 4 deletions(-) create mode 100644 tests/tests_app/utilities/test_cloud.py diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 810cdc51cce5e..de2416b4208a9 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -21,11 +21,17 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Adds `LightningTrainingComponent`. 
`LightningTrainingComponent` orchestrates multi-node training in the cloud ([#13830](https://github.com/Lightning-AI/lightning/pull/13830)) -- Add support for printing application logs using CLI `lightning show logs [components]` ([#13634](https://github.com/Lightning-AI/lightning/pull/13634)) +- Add support for printing application logs using CLI `lightning show logs [components]` ([#13634](https://github.com/Lightning-AI/lightning/pull/13634)) + - Add support for `Lightning API` through the `configure_api` hook on the Lightning Flow and the `Post`, `Get`, `Delete`, `Put` HttpMethods ([#13945](https://github.com/Lightning-AI/lightning/pull/13945)) + + +- Added a warning when `configure_layout` returns URLs configured with http instead of https ([#14233](https://github.com/Lightning-AI/lightning/pull/14233)) + + ### Changed - Default values and parameter names for Lightning AI BYOC cluster management ([#14132](https://github.com/Lightning-AI/lightning/pull/14132)) diff --git a/src/lightning_app/utilities/cloud.py b/src/lightning_app/utilities/cloud.py index b320979a62028..6e1d55b70a794 100644 --- a/src/lightning_app/utilities/cloud.py +++ b/src/lightning_app/utilities/cloud.py @@ -1,3 +1,4 @@ +import os import warnings from lightning_cloud.openapi import V1Membership @@ -34,3 +35,8 @@ def _get_project(client: LightningClient, project_id: str = LIGHTNING_CLOUD_PROJ def _sigterm_flow_handler(*_, app: "lightning_app.LightningApp"): app.stage = AppStage.STOPPING + + +def is_running_in_cloud() -> bool: + """Returns True if the Lightning App is running in the cloud.""" + return "LIGHTNING_APP_STATE_URL" in os.environ diff --git a/src/lightning_app/utilities/layout.py b/src/lightning_app/utilities/layout.py index ed7c11020b066..bffc56e9192db 100644 --- a/src/lightning_app/utilities/layout.py +++ b/src/lightning_app/utilities/layout.py @@ -1,8 +1,10 @@ import inspect +import warnings from typing import Dict, List, Union import lightning_app from lightning_app.frontend.frontend import Frontend +from lightning_app.utilities.cloud import is_running_in_cloud def _add_comment_to_literal_code(method, contains, comment): @@ -79,11 +81,17 @@ def _collect_content_layout(layout: List[Dict], flow: "lightning_app.LightningFl f" For the value, choose either a reference to a child flow or a URla." ) if isinstance(entry["content"], str): # assume this is a URL - # The URL isn't fully defined yet. Looks something like ``self.work.url + /something``. - if entry["content"].startswith("/"): + url = entry["content"] + if url.startswith("/"): + # The URL isn't fully defined yet. Looks something like ``self.work.url + /something``. entry["target"] = "" else: - entry["target"] = entry["content"] + entry["target"] = url + if url.startswith("http://") and is_running_in_cloud(): + warnings.warn( + f"You configured an http link {url[:32]}... but it won't be accessible in the cloud." + f" Consider replacing 'http' with 'https' in the link above." 
+ ) elif isinstance(entry["content"], lightning_app.LightningFlow): entry["content"] = entry["content"].name elif isinstance(entry["content"], lightning_app.LightningWork): diff --git a/tests/tests_app/core/lightning_app/test_configure_layout.py b/tests/tests_app/core/lightning_app/test_configure_layout.py index 9323a3503e839..6595d28286095 100644 --- a/tests/tests_app/core/lightning_app/test_configure_layout.py +++ b/tests/tests_app/core/lightning_app/test_configure_layout.py @@ -218,3 +218,21 @@ def test_dynamic_content_layout_update(): app = LightningApp(flow) MultiProcessRuntime(app).dispatch() assert flow.configure_layout_called == 5 + + +@mock.patch("lightning_app.utilities.layout.is_running_in_cloud", return_value=True) +def test_http_url_warning(*_): + class Root(EmptyFlow): + def configure_layout(self): + return [ + dict(name="warning expected", content="http://github.com/very/long/link/to/display"), + dict(name="no warning expected", content="https://github.com"), + ] + + root = Root() + + with pytest.warns( + UserWarning, + match=escape("You configured an http link http://github.com/very/long/link... but it won't be accessible"), + ): + LightningApp(root) diff --git a/tests/tests_app/utilities/test_cloud.py b/tests/tests_app/utilities/test_cloud.py new file mode 100644 index 0000000000000..db5a3efdf13bd --- /dev/null +++ b/tests/tests_app/utilities/test_cloud.py @@ -0,0 +1,14 @@ +import os +from unittest import mock + +from lightning_app.utilities.cloud import is_running_in_cloud + + +@mock.patch.dict(os.environ, clear=True) +def test_is_running_locally(): + assert not is_running_in_cloud() + + +@mock.patch.dict(os.environ, {"LIGHTNING_APP_STATE_URL": "127.0.0.1"}) +def test_is_running_cloud(): + assert is_running_in_cloud() From 909e7e77880deb825dbe92c00012a1bba33ba80c Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Wed, 17 Aug 2022 17:39:27 +0200 Subject: [PATCH 180/230] CI: docker focus on PL only (#14246) * CI: docker focus on PL only * group --- .github/checkgroup.yml | 7 +++++++ .../{cicd-pytorch-dockers.yml => ci-pytorch-dockers.yml} | 3 ++- 2 files changed, 9 insertions(+), 1 deletion(-) rename .github/workflows/{cicd-pytorch-dockers.yml => ci-pytorch-dockers.yml} (99%) diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index a29deb705295e..c2654eddd7ca1 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -73,6 +73,13 @@ subprojects: - id: "pytorch_lightning: Docker" paths: - "dockers/**" + - "!dockers/README.md" + - "requirements.txt" + - "requirements/*.txt" + - "requirements/pytorch/*" + - "environment.yml" + - ".github/workflows/*docker*.yml" + - "setup.py" checks: - "build-conda (3.8, 1.10)" - "build-conda (3.8, 1.9)" diff --git a/.github/workflows/cicd-pytorch-dockers.yml b/.github/workflows/ci-pytorch-dockers.yml similarity index 99% rename from .github/workflows/cicd-pytorch-dockers.yml rename to .github/workflows/ci-pytorch-dockers.yml index 84051cafd82d8..a05dbbb5bc8ef 100644 --- a/.github/workflows/cicd-pytorch-dockers.yml +++ b/.github/workflows/ci-pytorch-dockers.yml @@ -8,8 +8,9 @@ on: paths: - "dockers/**" - "!dockers/README.md" - - "requirements/**" - "requirements.txt" + - "requirements/*.txt" + - "requirements/pytorch/*" - "environment.yml" - ".github/workflows/*docker*.yml" - "setup.py" From 44cdbcab04e0453cfea46cf3b16fda600ceda50e Mon Sep 17 00:00:00 2001 From: otaj <6065855+otaj@users.noreply.github.com> Date: Wed, 17 Aug 2022 15:42:54 +0000 Subject: [PATCH 181/230] Allowed setting attributes on `DataLoader` and 
`BatchSampler` when instantiated inside `*_dataloader` hooks (#14212) --- src/pytorch_lightning/CHANGELOG.md | 3 + src/pytorch_lightning/lite/lite.py | 8 +- src/pytorch_lightning/strategies/ipu.py | 6 +- .../trainer/connectors/data_connector.py | 8 +- src/pytorch_lightning/utilities/data.py | 136 +++++++++++++----- tests/tests_pytorch/lite/test_lite.py | 2 +- tests/tests_pytorch/utilities/test_data.py | 136 +++++++++++++++--- 7 files changed, 229 insertions(+), 70 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 6aa6a9c7d8037..5342faf06f77e 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -101,6 +101,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Avoided requiring the FairScale package to use precision with the fsdp native strategy ([#14092](https://github.com/Lightning-AI/lightning/pull/14092)) +- Fixed not preserving set attributes on `DataLoader` and `BatchSampler` when instantiated inside `*_dataloader` hooks ([#14212](https://github.com/Lightning-AI/lightning/pull/14212)) + + ## [1.7.1] - 2022-08-09 ### Fixed diff --git a/src/pytorch_lightning/lite/lite.py b/src/pytorch_lightning/lite/lite.py index 981eed30635f6..ca45a4011fcdd 100644 --- a/src/pytorch_lightning/lite/lite.py +++ b/src/pytorch_lightning/lite/lite.py @@ -35,7 +35,7 @@ from pytorch_lightning.utilities.apply_func import apply_to_collection, convert_to_tensors from pytorch_lightning.utilities.data import ( _auto_add_worker_init_fn, - _replace_init_method, + _replace_dunder_methods, _update_dataloader, has_iterable_dataset, ) @@ -403,9 +403,9 @@ def _run_impl(self, run_method: Callable, *args: Any, **kwargs: Any) -> Any: def _run_with_strategy_setup(self, run_method: Callable, *args: Any, **kwargs: Any) -> Any: self._strategy.setup_environment() - with self._strategy.model_sharded_context(), _replace_init_method(DataLoader, "dataset"), _replace_init_method( - BatchSampler - ): + with self._strategy.model_sharded_context(), _replace_dunder_methods( + DataLoader, "dataset" + ), _replace_dunder_methods(BatchSampler): return run_method(*args, **kwargs) def _move_model_to_device(self, model: nn.Module, optimizers: List[Optimizer]) -> nn.Module: diff --git a/src/pytorch_lightning/strategies/ipu.py b/src/pytorch_lightning/strategies/ipu.py index f56c095dc12c1..b254c5df16ca5 100644 --- a/src/pytorch_lightning/strategies/ipu.py +++ b/src/pytorch_lightning/strategies/ipu.py @@ -31,7 +31,7 @@ from pytorch_lightning.utilities import _IPU_AVAILABLE, _POPTORCH_AVAILABLE, rank_zero_warn from pytorch_lightning.utilities.apply_func import apply_to_collection from pytorch_lightning.utilities.cloud_io import get_filesystem -from pytorch_lightning.utilities.data import _get_dataloader_init_args_and_kwargs +from pytorch_lightning.utilities.data import _get_dataloader_init_args_and_kwargs, _reinstantiate_wrapped_cls from pytorch_lightning.utilities.enums import PrecisionType from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.model_helpers import is_overridden @@ -248,7 +248,9 @@ def _convert_to_poptorch_loader( dataloader, sampler, mode, self.replication_factor > 1 # type: ignore[arg-type] ) opts = self.training_opts if mode == RunningStage.TRAINING else self.inference_opts - dataloader = poptorch.DataLoader(opts, *dl_args, **dl_kwargs) + dataloader = _reinstantiate_wrapped_cls( + dataloader, opts, *dl_args, explicit_cls=poptorch.DataLoader, **dl_kwargs 
+ ) return dataloader def _handle_gradient_accumulation_steps(self) -> None: diff --git a/src/pytorch_lightning/trainer/connectors/data_connector.py b/src/pytorch_lightning/trainer/connectors/data_connector.py index 6e592b9f6d310..e20eac2ffae57 100644 --- a/src/pytorch_lightning/trainer/connectors/data_connector.py +++ b/src/pytorch_lightning/trainer/connectors/data_connector.py @@ -31,7 +31,7 @@ from pytorch_lightning.utilities.data import ( _auto_add_worker_init_fn, _is_dataloader_shuffled, - _replace_init_method, + _replace_dunder_methods, _update_dataloader, has_iterable_dataset, has_len_all_ranks, @@ -428,9 +428,11 @@ def _request_dataloader(self, stage: RunningStage) -> Union[DataLoader, List[Dat """ source = getattr(self, f"_{stage.dataloader_prefix}_dataloader_source") - with _replace_init_method(DataLoader, "dataset"), _replace_init_method(BatchSampler): + with _replace_dunder_methods(DataLoader, "dataset"), _replace_dunder_methods(BatchSampler): # under this context manager, the arguments passed to `DataLoader.__init__` will be captured and saved as - # attributes on the instance in case the dataloader needs to be re-instantiated later by Lightning + # attributes on the instance in case the dataloader needs to be re-instantiated later by Lightning. + # Also, it records all attribute setting and deletion using patched `__setattr__` and `__delattr__` + # methods so that the re-instantiated object is as close to the original as possible. dataloader = source.dataloader() if isinstance(dataloader, tuple): dataloader = list(dataloader) diff --git a/src/pytorch_lightning/utilities/data.py b/src/pytorch_lightning/utilities/data.py index b625a046f6122..b0c9307cec8e1 100644 --- a/src/pytorch_lightning/utilities/data.py +++ b/src/pytorch_lightning/utilities/data.py @@ -37,7 +37,7 @@ from pytorch_lightning.trainer.states import RunningStage from pytorch_lightning.utilities.apply_func import _is_dataclass_instance from pytorch_lightning.utilities.auto_restart import CaptureIterableDataset, CaptureMapDataset, FastForwardSampler -from pytorch_lightning.utilities.enums import _FaultTolerantMode +from pytorch_lightning.utilities.enums import _FaultTolerantMode, LightningEnum from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.meta import _get_all_subclasses from pytorch_lightning.utilities.rank_zero import rank_zero_warn @@ -49,6 +49,18 @@ warning_cache = WarningCache() +class _WrapAttrTag(LightningEnum): + SET = "set" + DEL = "del" + + def __call__(self, *args): + if self == self.SET: + fn = setattr + else: + fn = delattr + return fn(*args) + + def _extract_batch_size(batch: BType) -> Generator[int, None, None]: if isinstance(batch, Tensor): if batch.ndim == 0: @@ -189,27 +201,7 @@ def _update_dataloader( dataloader: DataLoader, sampler: Union[Sampler, Iterable], mode: Optional[RunningStage] = None ) -> DataLoader: dl_args, dl_kwargs = _get_dataloader_init_args_and_kwargs(dataloader, sampler, mode) - dl_cls = type(dataloader) - try: - dataloader = dl_cls(*dl_args, **dl_kwargs) - except TypeError as e: - # improve exception message due to an incorrect implementation of the `DataLoader` where multiple subclass - # `__init__` arguments map to one `DataLoader.__init__` argument - import re - - match = re.match(r".*__init__\(\) got multiple values .* '(\w+)'", str(e)) - if not match: - # an unexpected `TypeError`, continue failure - raise - argument = match.groups()[0] - message = ( - f"The {dl_cls.__name__} `DataLoader` implementation has an 
error where more than one `__init__` argument" - f" can be passed to its parent's `{argument}=...` `__init__` argument. This is likely caused by allowing" - f" passing both a custom argument that will map to the `{argument}` argument as well as `**kwargs`." - f" `kwargs` should be filtered to make sure they don't contain the `{argument}` key." - " This argument was automatically passed to your DataLoader by PyTorch Lightning." - ) - raise MisconfigurationException(message) from e + dataloader = _reinstantiate_wrapped_cls(dataloader, *dl_args, **dl_kwargs) return dataloader @@ -375,7 +367,7 @@ def _dataloader_init_kwargs_resolve_sampler( "this, expose an argument `sampler` in the `__init__` method of your custom class." ) - batch_sampler = batch_sampler_cls(*args, **kwargs) + batch_sampler = _reinstantiate_wrapped_cls(batch_sampler, *args, **kwargs) else: try: batch_sampler = batch_sampler_cls( @@ -450,6 +442,37 @@ def _auto_add_worker_init_fn(dataloader: DataLoader, rank: int) -> None: dataloader.worker_init_fn = partial(pl_worker_init_function, rank=rank) +def _reinstantiate_wrapped_cls(orig_object: Any, *args: Any, explicit_cls: Optional[Type] = None, **kwargs: Any) -> Any: + constructor = type(orig_object) if explicit_cls is None else explicit_cls + + try: + result = constructor(*args, **kwargs) + except TypeError as e: + # improve exception message due to an incorrect implementation of the `DataLoader` where multiple subclass + # `__init__` arguments map to one `DataLoader.__init__` argument + import re + + match = re.match(r".*__init__\(\) got multiple values .* '(\w+)'", str(e)) + if not match: + # an unexpected `TypeError`, continue failure + raise + argument = match.groups()[0] + message = ( + f"The {constructor.__name__} implementation has an error where more than one `__init__` argument" + f" can be passed to its parent's `{argument}=...` `__init__` argument. This is likely caused by allowing" + f" passing both a custom argument that will map to the `{argument}` argument as well as `**kwargs`." + f" `kwargs` should be filtered to make sure they don't contain the `{argument}` key." + " This argument was automatically passed to your object by PyTorch Lightning." 
+ ) + raise MisconfigurationException(message) from e + + attrs_record = getattr(orig_object, "__pl_attrs_record", list()) + for args, fn in attrs_record: + fn(result, *args) + + return result + + def _wrap_init_method(init: Callable, store_explicit_arg: Optional[str] = None) -> Callable: """Wraps the ``__init__`` method of classes (currently :class:`~torch.utils.data.DataLoader` and :class:`~torch.utils.data.BatchSampler`) in order to enable re-instantiation of custom subclasses.""" @@ -458,6 +481,8 @@ def _wrap_init_method(init: Callable, store_explicit_arg: Optional[str] = None) def wrapper(obj: Any, *args: Any, **kwargs: Any) -> None: # We need to inspect `init`, as inspecting `obj.__init__` # can lead to inspecting the wrong function with multiple inheritance + old_inside_init = getattr(obj, "__pl_inside_init", False) + object.__setattr__(obj, "__pl_inside_init", True) params = inspect.signature(init).parameters parameters_defaults = OrderedDict( @@ -475,45 +500,82 @@ def wrapper(obj: Any, *args: Any, **kwargs: Any) -> None: } if not hasattr(obj, "__pl_saved_args"): - obj.__pl_saved_args = args - obj.__pl_saved_kwargs = kwargs - obj.__pl_saved_arg_names = param_names - obj.__pl_saved_default_kwargs = default_kwargs + object.__setattr__(obj, "__pl_saved_args", args) + object.__setattr__(obj, "__pl_saved_kwargs", kwargs) + object.__setattr__(obj, "__pl_saved_arg_names", param_names) + object.__setattr__(obj, "__pl_saved_default_kwargs", default_kwargs) # We want to use the latest possible value for explicit argument (i.e. ideally what gets passed to base class) # so that we can be sure, that it will not get changed anymore. # That is why we are setting this in every `__init__` if store_explicit_arg is not None: if store_explicit_arg in param_names: - setattr(obj, f"__{store_explicit_arg}", args[param_names.index(store_explicit_arg)]) + object.__setattr__(obj, f"__{store_explicit_arg}", args[param_names.index(store_explicit_arg)]) elif store_explicit_arg in kwargs: - setattr(obj, f"__{store_explicit_arg}", kwargs[store_explicit_arg]) + object.__setattr__(obj, f"__{store_explicit_arg}", kwargs[store_explicit_arg]) init(obj, *args, **kwargs) + object.__setattr__(obj, "__pl_inside_init", old_inside_init) + + return wrapper + + +def _wrap_attr_method(method: Callable, tag: _WrapAttrTag) -> Callable: + """Wraps the ``__setattr__`` or ``__delattr__`` method of classes (currently :class:`~torch.utils.data.DataLoader` and + :class:`~torch.utils.data.BatchSampler`) in order to enable re-instantiation of custom subclasses.""" + + @functools.wraps(method) + def wrapper(obj: Any, *args: Any): + # First, let's find out if we're the first in inheritance chain calling the patched method. 
+ name, *_ = args + prev_call_name, prev_call_method = getattr(obj, "__pl_current_call", (None, "method")) + first_call = not (prev_call_name == name and prev_call_method == tag) + + # Then mark the current called method + object.__setattr__(obj, "__pl_current_call", (name, tag)) + + # call original method + method(obj, *args) + if first_call and not getattr(obj, "__pl_inside_init", True): + # and save the value it was called with to the internal list, + # if we're outside of __init__ and the original call did not fail and we're the first call + attrs_record = getattr(obj, "__pl_attrs_record", list()) + attrs_record.append((args, tag)) + object.__setattr__(obj, "__pl_attrs_record", attrs_record) + object.__setattr__(obj, "__pl_current_call", (prev_call_name, prev_call_method)) return wrapper @contextmanager -def _replace_init_method(base_cls: Type, store_explicit_arg: Optional[str] = None) -> Generator[None, None, None]: +def _replace_dunder_methods(base_cls: Type, store_explicit_arg: Optional[str] = None) -> Generator[None, None, None]: """This context manager is used to add support for re-instantiation of custom (subclasses) of `base_cls`. - It patches the ``__init__`` method. + It patches the ``__init__``, ``__setattr__`` and ``__delattr__`` methods. """ classes = _get_all_subclasses(base_cls) | {base_cls} for cls in classes: # Check that __init__ belongs to the class # https://stackoverflow.com/a/5253424 if "__init__" in cls.__dict__: - cls._old_init = cls.__init__ + cls.__old__init__ = cls.__init__ cls.__init__ = _wrap_init_method(cls.__init__, store_explicit_arg) + + # we want at least one setattr/delattr in the chain to be patched and it can happen, that none of the subclasses + # implement `__setattr__`/`__delattr__`. Therefore, we are always patching the `base_cls` + for patch_fn_name, tag in (("__setattr__", _WrapAttrTag.SET), ("__delattr__", _WrapAttrTag.DEL)): + if patch_fn_name in cls.__dict__ or cls is base_cls: + saved_name = f"__old{patch_fn_name}" + setattr(cls, saved_name, getattr(cls, patch_fn_name)) + setattr(cls, patch_fn_name, _wrap_attr_method(getattr(cls, patch_fn_name), tag)) yield for cls in classes: - # Check that _old_init belongs to the class - # https://stackoverflow.com/a/5253424 - if "_old_init" in cls.__dict__: - cls.__init__ = cls._old_init - del cls._old_init + for patched_name in ("__setattr__", "__delattr__", "__init__"): + # Check that __old__{init,setattr,delattr} belongs to the class + # https://stackoverflow.com/a/5253424 + if f"__old{patched_name}" in cls.__dict__: + setattr(cls, patched_name, getattr(cls, f"__old{patched_name}")) + delattr(cls, f"__old{patched_name}") def _wrap_with_capture_dataset(dataset: Dataset) -> Dataset: diff --git a/tests/tests_pytorch/lite/test_lite.py b/tests/tests_pytorch/lite/test_lite.py index 86a0a5a82195a..d45046f249d54 100644 --- a/tests/tests_pytorch/lite/test_lite.py +++ b/tests/tests_pytorch/lite/test_lite.py @@ -177,7 +177,7 @@ def test_setup_dataloaders_return_type(): assert lite_dataloader1.dataset is dataset1 -@mock.patch("pytorch_lightning.lite.lite._replace_init_method") +@mock.patch("pytorch_lightning.lite.lite._replace_dunder_methods") def test_setup_dataloaders_captures_dataloader_arguments(ctx_manager): """Test that Lite intercepts the DataLoader constructor arguments with a context manager in its run method.""" diff --git a/tests/tests_pytorch/utilities/test_data.py b/tests/tests_pytorch/utilities/test_data.py index cc70417988616..9e3d04ae65560 100644 --- a/tests/tests_pytorch/utilities/test_data.py 
+++ b/tests/tests_pytorch/utilities/test_data.py @@ -13,9 +13,10 @@ from pytorch_lightning.utilities.data import ( _dataloader_init_kwargs_resolve_sampler, _get_dataloader_init_args_and_kwargs, - _replace_init_method, + _replace_dunder_methods, _replace_value_in_saved_args, _update_dataloader, + _WrapAttrTag, extract_batch_size, get_len, has_iterable_dataset, @@ -144,10 +145,10 @@ def __init__(self, foo, *args, **kwargs): super().__init__(foo, *args, **kwargs) dataloader = BadStandaloneGoodHookImpl([1, 2, 3]) - with pytest.raises(MisconfigurationException, match="`DataLoader` implementation has an error.*`dataset`"): + with pytest.raises(MisconfigurationException, match="implementation has an error.*`dataset`"): _update_dataloader(dataloader, dataloader.sampler) - with _replace_init_method(DataLoader, "dataset"): + with _replace_dunder_methods(DataLoader, "dataset"): dataloader = BadStandaloneGoodHookImpl([1, 2, 3]) new_dataloader = _update_dataloader(dataloader, dataloader.sampler) assert isinstance(new_dataloader, BadStandaloneGoodHookImpl) @@ -159,7 +160,7 @@ def __init__(self, randomize, *args, **kwargs): super().__init__(*args, shuffle=randomize, **kwargs) dataloader = BadImpl(False, []) - with pytest.raises(MisconfigurationException, match="`DataLoader` implementation has an error.*`shuffle`"): + with pytest.raises(MisconfigurationException, match="implementation has an error.*`shuffle`"): _update_dataloader(dataloader, dataloader.sampler) class GoodImpl(DataLoader): @@ -173,28 +174,33 @@ def __init__(self, randomize, *args, **kwargs): assert isinstance(new_dataloader, GoodImpl) -def test_replace_init_method_multiple_loaders_without_init(): +def test_replace_dunder_methods_multiple_loaders_without_init(): """In case of a class, that inherits from a class that we are patching, but doesn't define its own `__init__` - method (the one we are wrapping), it can happen, that `hasattr(cls, "_old_init")` is True because of parent + method (the one we are wrapping), it can happen, that `hasattr(cls, "__old__init__")` is True because of parent class, but it is impossible to delete, because that method is owned by parent class. Furthermore, the error occured only sometimes because it depends on the order in which we are iterating over a set of classes we are patching. This test simulates the behavior by generating sufficient number of dummy classes, which do not define `__init__` - and are children of `DataLoader`. We are testing that a) context manager `_replace_init_method` exits cleanly, and - b) the mechanism checking for presence of `_old_init` works as expected. + and are children of `DataLoader`. We are testing that a) context manager `_replace_dunder_method` exits cleanly, and + b) the mechanism checking for presence of `__old__init__` works as expected. 
""" classes = [DataLoader] for i in range(100): classes.append(type(f"DataLoader_{i}", (random.choice(classes),), {})) - with _replace_init_method(DataLoader, "dataset"): + before = {cls: cls.__init__ for cls in classes} + + with _replace_dunder_methods(DataLoader, "dataset"): for cls in classes[1:]: # First one is `DataLoader` - assert "_old_init" not in cls.__dict__ - assert hasattr(cls, "_old_init") + assert "__old__init__" not in cls.__dict__ + assert hasattr(cls, "__old__init__") + + assert "__old__init__" in DataLoader.__dict__ + assert hasattr(DataLoader, "__old__init__") - assert "_old_init" in DataLoader.__dict__ - assert hasattr(DataLoader, "_old_init") + for cls in classes: + assert before[cls] == cls.__init__ class DataLoaderSubclass1(DataLoader): @@ -322,8 +328,8 @@ def __init__(self, dataset, **kwargs): pytest.param(ChangingDataLoader, (range(5),), dict(), ("dataset",), list(range(10)), dict(), id="test9"), ], ) -def test_replace_init_method_dataloader(cls, args, kwargs, arg_names, dataset, checked_values): - with _replace_init_method(DataLoader, "dataset"): +def test_replace_dunder_methods_dataloader(cls, args, kwargs, arg_names, dataset, checked_values): + with _replace_dunder_methods(DataLoader, "dataset"): dataloader = cls(*args, **kwargs) assert dataloader.__pl_saved_args == args @@ -360,12 +366,12 @@ def test_replace_init_method_dataloader(cls, args, kwargs, arg_names, dataset, c assert dataloader_value == value -def test_replace_init_method_extra_kwargs(): +def test_replace_dunder_methods_extra_kwargs(): class LoaderSubclass(DataLoader): def __init__(self, dataset, *args, batch_size=10, **kwargs): super().__init__(dataset, *args, batch_size=batch_size, **kwargs) - with _replace_init_method(DataLoader, "dataset"): + with _replace_dunder_methods(DataLoader, "dataset"): dataloader = LoaderSubclass(range(10)) assert dataloader.__pl_saved_args == (range(10),) @@ -375,6 +381,90 @@ def __init__(self, dataset, *args, batch_size=10, **kwargs): assert dataloader.__dataset == range(10) +def test_replace_dunder_methods_attrs(): + """This test checks, that all the calls from setting and deleting attributes within `_replace_dunder_methods` + are correctly preserved even after reinstantiation. 
+ + It also includes a custom `__setattr__` + """ + + class Loader(DataLoader): + def __setattr__(self, attr, val): + if attr == "custom_arg": + val = val + 2 + super().__setattr__(attr, val) + + with _replace_dunder_methods(DataLoader, "dataset"): + dataloader = Loader(range(10)) + dataloader.custom_arg = 5 + dataloader.my_arg = 10 + dataloader.another_arg = 100 + del dataloader.dataset + try: + del dataloader.abc_arg + except AttributeError: + pass + + assert dataloader.__pl_saved_args == (range(10),) + assert dataloader.__pl_saved_kwargs == {} + assert dataloader.__pl_saved_arg_names == ("dataset",) + assert dataloader.__dataset == range(10) + assert dataloader.custom_arg == 7 + assert dataloader.my_arg == 10 + assert dataloader.another_arg == 100 + assert not hasattr(dataloader, "dataset") + assert dataloader.__pl_attrs_record == [ + (("custom_arg", 5), _WrapAttrTag.SET), + (("my_arg", 10), _WrapAttrTag.SET), + (("another_arg", 100), _WrapAttrTag.SET), + (("dataset",), _WrapAttrTag.DEL), + ] + + dataloader = _update_dataloader(dataloader, dataloader.sampler) + assert dataloader.custom_arg == 7 + assert dataloader.my_arg == 10 + assert dataloader.another_arg == 100 + assert not hasattr(dataloader, "dataset") + + +def test_replace_dunder_methods_restore_methods(): + """This tests checks whether are all dunder methods restored to their original versions.""" + + class Init(DataLoader): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + class SetAttr(DataLoader): + def __setattr__(self, *args): + return super().__setattr__(*args) + + class DelAttr(DataLoader): + def __delattr__(self, *args): + return super().__delattr__(*args) + + class InitAndSetAttr(Init, SetAttr): + pass + + class InitAndDelAttr(Init, DelAttr): + pass + + class SetAttrAndDelAttr(SetAttr, DelAttr): + pass + + class AllDunder(Init, SetAttr, DelAttr): + pass + + before = dict() + for cls in (Init, SetAttr, DelAttr, InitAndSetAttr, InitAndDelAttr, SetAttrAndDelAttr, AllDunder): + before[cls] = {"init": cls.__init__, "setattr": cls.__setattr__, "delattr": cls.__delattr__} + + with _replace_dunder_methods(DataLoader, "dataset"): + pass + + for cls in (Init, SetAttr, DelAttr, InitAndSetAttr, InitAndDelAttr, SetAttrAndDelAttr, AllDunder): + assert before[cls] == {"init": cls.__init__, "setattr": cls.__setattr__, "delattr": cls.__delattr__} + + @pytest.mark.parametrize("predicting", [True, False]) def test_custom_batch_sampler(predicting): """This test asserts, that custom `BatchSampler`, with all the arguments, that are required in order to @@ -391,8 +481,8 @@ def __init__(self, sampler, extra_arg, drop_last=True): super().__init__(sampler, 10, drop_last) sampler = RandomSampler(range(10)) - with _replace_init_method(BatchSampler): - # instantiate within `_replace_init_method` context manager, simulating `*_dataloader` hooks + with _replace_dunder_methods(BatchSampler): + # instantiate within `_replace_dunder_method` context manager, simulating `*_dataloader` hooks batch_sampler = MyBatchSampler(sampler, "random_str") dataloader = DataLoader(range(10), batch_sampler=batch_sampler) @@ -437,8 +527,8 @@ def __init__(self, sampler, extra_arg): super().__init__(sampler, 10, False) sampler = RandomSampler(range(10)) - with _replace_init_method(BatchSampler): - # instantiate within `_replace_init_method` context manager, simulating `*_dataloader` hooks + with _replace_dunder_methods(BatchSampler): + # instantiate within `_replace_dunder_method` context manager, simulating `*_dataloader` hooks batch_sampler = 
MyBatchSampler(sampler, "random_str") dataloader = DataLoader(range(10), batch_sampler=batch_sampler) @@ -464,8 +554,8 @@ def __init__(self, extra_arg): self.extra_arg = extra_arg super().__init__(RandomSampler(range(10)), 10, False) - with _replace_init_method(BatchSampler): - # instantiate within `_replace_init_method` context manager, simulating `*_dataloader` hooks + with _replace_dunder_methods(BatchSampler): + # instantiate within `_replace_dunder_method` context manager, simulating `*_dataloader` hooks batch_sampler = MyBatchSampler("random_str") dataloader = DataLoader(range(10), batch_sampler=batch_sampler) From 2e59c49592e1e81107be0f112c25a9bd324fcd08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 17 Aug 2022 18:31:20 +0200 Subject: [PATCH 182/230] Update defaults for WandbLogger's run name and project name (#14145) --- src/pytorch_lightning/CHANGELOG.md | 12 +++++++++++- src/pytorch_lightning/loggers/wandb.py | 13 +++++++------ tests/tests_pytorch/loggers/test_all.py | 2 +- tests/tests_pytorch/loggers/test_wandb.py | 20 ++++++++++++++------ 4 files changed, 33 insertions(+), 14 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 5342faf06f77e..906286102b9e6 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -25,7 +25,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - The `Trainer.{fit,validate,test,predict,tune}` methods now raise a useful error message if the input is not a `LightningModule` ([#13892](https://github.com/Lightning-AI/lightning/pull/13892)) -- Raised a `MisconfigurationException` if batch transfer hooks are overriden with `IPUAccelerator` ([13961](https://github.com/Lightning-AI/lightning/pull/13961)) +- Raised a `MisconfigurationException` if batch transfer hooks are overriden with `IPUAccelerator` ([#13961](https://github.com/Lightning-AI/lightning/pull/13961)) - Updated compatibility for LightningLite to run with the latest DeepSpeed 0.7.0 ([13967](https://github.com/Lightning-AI/lightning/pull/13967)) @@ -34,6 +34,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Replaced the unwrapping logic in strategies with direct access to unwrapped `LightningModule` ([#13738](https://github.com/Lightning-AI/lightning/pull/13738)) +- The `WandbLogger.name` property no longer returns the name of the experiment, and instead returns the project's name ([#14145](https://github.com/Lightning-AI/lightning/pull/14145)) + + +- The default project name in `WandbLogger` is now "lightning_logs" ([#14145](https://github.com/Lightning-AI/lightning/pull/14145)) + + ### Deprecated - Deprecated `LightningDeepSpeedModule` ([#14000](https://github.com/Lightning-AI/lightning/pull/14000)) @@ -101,9 +107,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Avoided requiring the FairScale package to use precision with the fsdp native strategy ([#14092](https://github.com/Lightning-AI/lightning/pull/14092)) +- Fixed an issue in which the default name for a run in `WandbLogger` would be set to the project name instead of a randomly generated string ([#14145](https://github.com/Lightning-AI/lightning/pull/14145)) + + - Fixed not preserving set attributes on `DataLoader` and `BatchSampler` when instantiated inside `*_dataloader` hooks ([#14212](https://github.com/Lightning-AI/lightning/pull/14212)) + ## [1.7.1] - 2022-08-09 ### Fixed diff --git a/src/pytorch_lightning/loggers/wandb.py b/src/pytorch_lightning/loggers/wandb.py index 530fb58fabe5e..baf4bc9092774 100644 --- a/src/pytorch_lightning/loggers/wandb.py +++ b/src/pytorch_lightning/loggers/wandb.py @@ -260,7 +260,7 @@ def __init__( id: Optional[str] = None, anonymous: Optional[bool] = None, version: Optional[str] = None, - project: Optional[str] = None, + project: str = "lightning_logs", log_model: Union[str, bool] = False, experiment: Union[Run, RunDisabled, None] = None, prefix: str = "", @@ -297,7 +297,7 @@ def __init__( self._checkpoint_callback: Optional["ReferenceType[Checkpoint]"] = None # set wandb init arguments self._wandb_init: Dict[str, Any] = dict( - name=name or project, + name=name, project=project, id=version or id, dir=save_dir, @@ -306,6 +306,7 @@ def __init__( ) self._wandb_init.update(**kwargs) # extract parameters + self._project = self._wandb_init.get("project") self._save_dir = self._wandb_init.get("dir") self._name = self._wandb_init.get("name") self._id = self._wandb_init.get("id") @@ -450,13 +451,13 @@ def save_dir(self) -> Optional[str]: @property def name(self) -> Optional[str]: - """Gets the name of the experiment. + """The project name of this experiment. Returns: - The name of the experiment if the experiment exists else the name given to the constructor. + The name of the project the current experiment belongs to. This name is not the same as `wandb.Run`'s + name. To access wandb's internal experiment name, use ``logger.experiment.name`` instead. 
""" - # don't create an experiment if we don't have one - return self._experiment.name if self._experiment else self._name + return self._project @property def version(self) -> Optional[str]: diff --git a/tests/tests_pytorch/loggers/test_all.py b/tests/tests_pytorch/loggers/test_all.py index d613296abccf5..612d7bf035c2f 100644 --- a/tests/tests_pytorch/loggers/test_all.py +++ b/tests/tests_pytorch/loggers/test_all.py @@ -300,7 +300,7 @@ def on_train_batch_start(self, trainer, pl_module, batch, batch_idx): @pytest.mark.parametrize("logger_class", ALL_LOGGER_CLASSES_WO_NEPTUNE_WANDB) -@RunIf(skip_windows=True, skip_hanging_spawn=True) +@RunIf(skip_windows=True) def test_logger_created_on_rank_zero_only(tmpdir, monkeypatch, logger_class): """Test that loggers get replaced by dummy loggers on global rank > 0.""" _patch_comet_atexit(monkeypatch) diff --git a/tests/tests_pytorch/loggers/test_wandb.py b/tests/tests_pytorch/loggers/test_wandb.py index fbc1d5e189637..648e1a8f38ec8 100644 --- a/tests/tests_pytorch/loggers/test_wandb.py +++ b/tests/tests_pytorch/loggers/test_wandb.py @@ -25,6 +25,16 @@ from tests_pytorch.helpers.utils import no_warning_call +@mock.patch("pytorch_lightning.loggers.wandb.Run", new=mock.Mock) +@mock.patch("pytorch_lightning.loggers.wandb.wandb") +def test_wandb_project_name(*_): + logger = WandbLogger() + assert logger.name == "lightning_logs" + + logger = WandbLogger(project="project") + assert logger.name == "project" + + @mock.patch("pytorch_lightning.loggers.wandb.Run", new=mock.Mock) @mock.patch("pytorch_lightning.loggers.wandb.wandb") def test_wandb_logger_init(wandb, monkeypatch): @@ -48,7 +58,7 @@ def test_wandb_logger_init(wandb, monkeypatch): wandb.init.reset_mock() WandbLogger(project="test_project").experiment wandb.init.assert_called_once_with( - name="test_project", dir=None, id=None, project="test_project", resume="allow", anonymous=None + name=None, dir=None, id=None, project="test_project", resume="allow", anonymous=None ) # test wandb.init and setting logger experiment externally @@ -91,7 +101,6 @@ def test_wandb_logger_init(wandb, monkeypatch): logger.watch("model", "log", 10, False) wandb.init().watch.assert_called_once_with("model", log="log", log_freq=10, log_graph=False) - assert logger.name == wandb.init().name assert logger.version == wandb.init().id @@ -140,10 +149,9 @@ def test_wandb_logger_dirs_creation(wandb, monkeypatch, tmpdir): """Test that the logger creates the folders and files in the right place.""" monkeypatch.setattr(pytorch_lightning.loggers.wandb, "_WANDB_GREATER_EQUAL_0_12_10", True) wandb.run = None - logger = WandbLogger(save_dir=str(tmpdir), offline=True) + logger = WandbLogger(project="project", save_dir=str(tmpdir), offline=True) # the logger get initialized assert logger.version == wandb.init().id - assert logger.name == wandb.init().name # mock return values of experiment wandb.run = None @@ -154,7 +162,7 @@ def test_wandb_logger_dirs_creation(wandb, monkeypatch, tmpdir): _ = logger.experiment assert logger.version == "1" - assert logger.name == "run_name" + assert logger.name == "project" assert str(tmpdir) == logger.save_dir assert not os.listdir(tmpdir) @@ -164,7 +172,7 @@ def test_wandb_logger_dirs_creation(wandb, monkeypatch, tmpdir): assert trainer.log_dir == logger.save_dir trainer.fit(model) - assert trainer.checkpoint_callback.dirpath == str(tmpdir / "run_name" / version / "checkpoints") + assert trainer.checkpoint_callback.dirpath == str(tmpdir / "project" / version / "checkpoints") assert 
set(os.listdir(trainer.checkpoint_callback.dirpath)) == {"epoch=0-step=3.ckpt"} assert trainer.log_dir == logger.save_dir From 1745d192f9c3f5c19e5d5eea7a7330f9bc296b57 Mon Sep 17 00:00:00 2001 From: Laverne Henderson Date: Wed, 17 Aug 2022 11:35:23 -0700 Subject: [PATCH 183/230] Remove incorrect "template" information (#13911) Co-authored-by: Jirka Borovec --- .../from_scratch_content.rst | 61 ------------------- .../from_scratch_component_content.rst | 47 -------------- 2 files changed, 108 deletions(-) diff --git a/docs/source-app/workflows/build_lightning_app/from_scratch_content.rst b/docs/source-app/workflows/build_lightning_app/from_scratch_content.rst index d90d8662dd430..7641b4f4e7c30 100644 --- a/docs/source-app/workflows/build_lightning_app/from_scratch_content.rst +++ b/docs/source-app/workflows/build_lightning_app/from_scratch_content.rst @@ -58,64 +58,3 @@ Run the Lightning App on the cloud: .. code:: bash lightning run app app.py --cloud - ----- - -************************************* -Build a Lightning App from a template -************************************* -If you didn't find an Lightning App similar to the one you need (in the `Lightning App gallery `_), another option is to start from a template. -The Lightning CLI can generate a template with built-in testing that can be easily published to the -Lightning App Gallery. - -Generate a Lightning App with our template generator: - -.. code:: bash - - lightning init app your-app-name - -You'll see a print-out like this: - -.. code:: bash - - ➜ lightning init app your-app-name - - /Users/Your/Current/dir/your-app-name - INFO: laying out app template at /Users/Your/Current/dir/your-app-name - INFO: - Lightning app template created! - /Users/Your/Current/dir/your-app-name - - run your app with: - lightning run app your-app-name/app.py - - run it on the cloud to share with your collaborators: - lightning run app your-app-name/app.py --cloud - ----- - -Modify the Lightning App template -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The command above generates a Lightning App file like this: - -.. code:: python - - from your_app_name import ComponentA, ComponentB - - import lightning as L - - - class LitApp(L.LightningFlow): - def __init__(self) -> None: - super().__init__() - self.component_a = ComponentA() - self.component_b = ComponentB() - - def run(self): - self.component_a.run() - self.component_b.run() - - - app = L.LightningApp(LitApp()) - -Now you can add your own components as you wish! diff --git a/docs/source-app/workflows/build_lightning_component/from_scratch_component_content.rst b/docs/source-app/workflows/build_lightning_component/from_scratch_component_content.rst index f3168e566823d..a42be9b739fe6 100644 --- a/docs/source-app/workflows/build_lightning_component/from_scratch_component_content.rst +++ b/docs/source-app/workflows/build_lightning_component/from_scratch_component_content.rst @@ -151,50 +151,3 @@ run the app .. code:: bash lightning run app app.py - ----- - -******************************************* -Build a Lightning component from a template -******************************************* -If you'd prefer a component template with built-in testing that can be easily published to the -Lightning component gallery, generate it with our template generator: - -.. code:: bash - - lightning init component your-component-name - -You'll see a print-out like this: - -.. 
code:: bash - - ➜ lightning init component your-component-name - INFO: laying out component template at /Users/williamfalcon/Developer/opensource/_/lightning/scratch/hello-world - INFO: - ⚡ Lightning component template created! ⚡ - /Users/williamfalcon/Developer/opensource/_/lightning/scratch/hello-world - - ... - ----- - -Modify the component template -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The command above generates a component file like this: - -.. code:: python - - import lightning as L - - - class TemplateComponent(L.LightningWork): - def __init__(self) -> None: - super().__init__() - self.value = 0 - - def run(self): - self.value += 1 - print("welcome to your work component") - print("this is running inside a work") - -Now you can modify the component as you wish! From 7d1731096609adb97c3a686d1cb5000bbaea4832 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 17 Aug 2022 21:08:41 +0200 Subject: [PATCH 184/230] Terminate process when main process raises error in ServableModuleValidator (#14217) Co-authored-by: otaj <6065855+otaj@users.noreply.github.com> --- .github/workflows/ci-pytorch-test-conda.yml | 2 +- src/pytorch_lightning/serve/servable_module_validator.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci-pytorch-test-conda.yml b/.github/workflows/ci-pytorch-test-conda.yml index 3498f087ef0aa..a0ec35973b579 100644 --- a/.github/workflows/ci-pytorch-test-conda.yml +++ b/.github/workflows/ci-pytorch-test-conda.yml @@ -27,7 +27,7 @@ jobs: - {python-version: "3.8", pytorch-version: "1.10"} - {python-version: "3.9", pytorch-version: "1.11"} - {python-version: "3.9", pytorch-version: "1.12"} - timeout-minutes: 30 + timeout-minutes: 40 steps: - name: Workaround for https://github.com/actions/checkout/issues/760 diff --git a/src/pytorch_lightning/serve/servable_module_validator.py b/src/pytorch_lightning/serve/servable_module_validator.py index ddee2a729a1f9..305e520e422b3 100644 --- a/src/pytorch_lightning/serve/servable_module_validator.py +++ b/src/pytorch_lightning/serve/servable_module_validator.py @@ -47,7 +47,7 @@ def __init__( server: Literal["fastapi", "ml_server", "torchserve", "sagemaker"] = "fastapi", host: str = "127.0.0.1", port: int = 8080, - timeout: int = 10, + timeout: int = 20, exit_on_failure: bool = True, ): super().__init__() @@ -109,7 +109,8 @@ def on_train_start(self, trainer: "pl.Trainer", servable_module: "pl.LightningMo except requests.exceptions.ConnectionError: pass if time.time() - t0 > self.timeout: - raise Exception(f"The Server didn't start in {self.timeout}") + process.kill() + raise Exception(f"The server didn't start within {self.timeout} seconds.") time.sleep(0.1) payload = servable_module.configure_payload() From d9eb8567f0f0f27c112efa06d1da4e9b24d6beb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 18 Aug 2022 09:52:16 +0200 Subject: [PATCH 185/230] Update changelog after 1.7.2. release (#14251) --- src/pytorch_lightning/CHANGELOG.md | 47 ++++++++---------------------- 1 file changed, 12 insertions(+), 35 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 906286102b9e6..11b08afe0f963 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -8,15 +8,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Added -- Added `FullyShardedNativeNativeMixedPrecisionPlugin` to handle precision for `DDPFullyShardedNativeStrategy` ([#14092](https://github.com/Lightning-AI/lightning/pull/14092)) - - Added prefix to log message in `seed_everything` with rank info ([#13290](https://github.com/Lightning-AI/lightning/issues/13290)) -- Added profiling to these hooks: `on_before_batch_transfer`, `transfer_batch_to_device`, `on_after_batch_transfer`, `configure_gradient_clipping`, `clip_gradients` ([#14069](https://github.com/Lightning-AI/lightning/pull/14069)) - - - @@ -28,17 +23,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Raised a `MisconfigurationException` if batch transfer hooks are overriden with `IPUAccelerator` ([#13961](https://github.com/Lightning-AI/lightning/pull/13961)) -- Updated compatibility for LightningLite to run with the latest DeepSpeed 0.7.0 ([13967](https://github.com/Lightning-AI/lightning/pull/13967)) - - - Replaced the unwrapping logic in strategies with direct access to unwrapped `LightningModule` ([#13738](https://github.com/Lightning-AI/lightning/pull/13738)) -- The `WandbLogger.name` property no longer returns the name of the experiment, and instead returns the project's name ([#14145](https://github.com/Lightning-AI/lightning/pull/14145)) - - -- The default project name in `WandbLogger` is now "lightning_logs" ([#14145](https://github.com/Lightning-AI/lightning/pull/14145)) - ### Deprecated @@ -75,45 +62,35 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed the experimental `pytorch_lightning.utiltiies.meta` functions in favor of built-in https://github.com/pytorch/torchdistx support ([#13868](https://github.com/Lightning-AI/lightning/pull/13868)) -### Fixed - -- Fixed a bug that caused spurious `AttributeError` when multiple `DataLoader` classes are imported ([#14117](https://github.com/Lightning-AI/lightning/pull/14117)) +## [1.7.2] - 2022-08-17 +### Added -- Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061)) +- Added `FullyShardedNativeNativeMixedPrecisionPlugin` to handle precision for `DDPFullyShardedNativeStrategy` ([#14092](https://github.com/Lightning-AI/lightning/pull/14092)) +- Added profiling to these hooks: `on_before_batch_transfer`, `transfer_batch_to_device`, `on_after_batch_transfer`, `configure_gradient_clipping`, `clip_gradients` ([#14069](https://github.com/Lightning-AI/lightning/pull/14069)) +### Changed -- Fixed resuming from a checkpoint when using Stochastic Weight Averaging (SWA) ([#9938](https://github.com/Lightning-AI/lightning/pull/9938)) +- The `WandbLogger.name` property no longer returns the name of the experiment, and instead returns the project's name ([#14145](https://github.com/Lightning-AI/lightning/pull/14145)) +- The default project name in `WandbLogger` is now "lightning_logs" ([#14145](https://github.com/Lightning-AI/lightning/pull/14145)) +- Updated compatibility for LightningLite to run with the latest DeepSpeed 0.7.0 ([13967](https://github.com/Lightning-AI/lightning/pull/13967)) +### Fixed +- Fixed a bug that caused spurious `AttributeError` when multiple `DataLoader` classes are imported ([#14117](https://github.com/Lightning-AI/lightning/pull/14117)) +- Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061)) +- Fixed resuming from a checkpoint when using Stochastic Weight 
Averaging (SWA) ([#9938](https://github.com/Lightning-AI/lightning/pull/9938)) - Fixed the device placement when `LightningModule.cuda()` gets called without specifying a device index and the current cuda device was not 0 ([#14128](https://github.com/Lightning-AI/lightning/pull/14128)) - - - Avoided false positive warning about using `sync_dist` when using torchmetrics ([#14143](https://github.com/Lightning-AI/lightning/pull/14143)) - - - Avoid `metadata.entry_points` deprecation warning on Python 3.10 ([#14052](https://github.com/Lightning-AI/lightning/pull/14052)) - - - Fixed epoch-end logging results not being reset after the end of the epoch ([#14061](https://github.com/Lightning-AI/lightning/pull/14061)) - - - Avoid raising the sampler warning if num_replicas=1 ([#14097](https://github.com/Lightning-AI/lightning/pull/14097)) - - - Fixed saving hyperparameters in a composition where the parent class is not a `LightningModule` or `LightningDataModule` ([#14151](https://github.com/Lightning-AI/lightning/pull/14151)) - - - Avoided requiring the FairScale package to use precision with the fsdp native strategy ([#14092](https://github.com/Lightning-AI/lightning/pull/14092)) - - - Fixed an issue in which the default name for a run in `WandbLogger` would be set to the project name instead of a randomly generated string ([#14145](https://github.com/Lightning-AI/lightning/pull/14145)) - - - Fixed not preserving set attributes on `DataLoader` and `BatchSampler` when instantiated inside `*_dataloader` hooks ([#14212](https://github.com/Lightning-AI/lightning/pull/14212)) - ## [1.7.1] - 2022-08-09 ### Fixed From 52051c582395c3c6eca3fffa579b6b3b1550f700 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Thu, 18 Aug 2022 10:11:49 +0200 Subject: [PATCH 186/230] release LAI docs as stable (#14250) --- .github/workflows/docs-deploy.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/docs-deploy.yml b/.github/workflows/docs-deploy.yml index dd589baf2fa46..f1df928fef569 100644 --- a/.github/workflows/docs-deploy.yml +++ b/.github/workflows/docs-deploy.yml @@ -1,7 +1,7 @@ name: "Deploy Docs" on: push: - branches: [master] + branches: ["release/app"] jobs: # https://github.com/marketplace/actions/deploy-to-github-pages From 401eb2c535f3b34b39313fbe58407fb8cbc56a24 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 18 Aug 2022 08:45:59 +0000 Subject: [PATCH 187/230] Update tensorboard requirement from <2.10.0,>=2.9.1 to >=2.9.1,<2.11.0 in /requirements (#14200) Update tensorboard requirement in /requirements Updates the requirements on [tensorboard](https://github.com/tensorflow/tensorboard) to permit the latest version. - [Release notes](https://github.com/tensorflow/tensorboard/releases) - [Changelog](https://github.com/tensorflow/tensorboard/blob/master/RELEASE.md) - [Commits](https://github.com/tensorflow/tensorboard/compare/2.9.1...2.10.0) --- updated-dependencies: - dependency-name: tensorboard dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements/pytorch/base.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index 49e2243319206..ee892701dc041 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -6,7 +6,7 @@ torch>=1.9.*, <=1.12.0 tqdm>=4.57.0, <4.65.0 PyYAML>=5.4, <=6.0 fsspec[http]>=2021.05.0, !=2021.06.0, <2022.6.0 -tensorboard>=2.9.1, <2.10.0 +tensorboard>=2.9.1, <2.11.0 torchmetrics>=0.7.0, <0.9.3 # needed for using fixed compare_version pyDeprecate>=0.3.1, <=0.3.2 packaging>=17.0, <=21.3 From 047f0aa1e47161ac5c1a8828ecbd8923936d8989 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 18 Aug 2022 11:09:23 +0200 Subject: [PATCH 188/230] Fix type check for non-standard schedulers in horovod (#14215) --- src/pytorch_lightning/CHANGELOG.md | 6 ++++++ src/pytorch_lightning/strategies/horovod.py | 5 ++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 11b08afe0f963..e38f39b8c39b7 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -62,6 +62,12 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Removed the experimental `pytorch_lightning.utiltiies.meta` functions in favor of built-in https://github.com/pytorch/torchdistx support ([#13868](https://github.com/Lightning-AI/lightning/pull/13868)) +### Fixed + +- Fixed an assertion error when using a `ReduceOnPlateau` scheduler with the Horovod strategy ([#14215](https://github.com/Lightning-AI/lightning/pull/14215)) + + + ## [1.7.2] - 2022-08-17 ### Added diff --git a/src/pytorch_lightning/strategies/horovod.py b/src/pytorch_lightning/strategies/horovod.py index a0e928535e407..6329d1e4091e0 100644 --- a/src/pytorch_lightning/strategies/horovod.py +++ b/src/pytorch_lightning/strategies/horovod.py @@ -31,7 +31,6 @@ from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.imports import _HOROVOD_AVAILABLE from pytorch_lightning.utilities.rank_zero import rank_zero_only -from pytorch_lightning.utilities.types import _LRScheduler if _HOROVOD_AVAILABLE: import horovod.torch as hvd @@ -114,8 +113,8 @@ def _unpack_lightning_optimizer(opt: Optimizer) -> Optimizer: lr_scheduler_configs = self.lr_scheduler_configs for config in lr_scheduler_configs: scheduler = config.scheduler - assert isinstance(scheduler, _LRScheduler) - scheduler.base_lrs = [lr * self.world_size for lr in scheduler.base_lrs] + if hasattr(scheduler, "base_lrs"): + scheduler.base_lrs = [lr * self.world_size for lr in scheduler.base_lrs] # type: ignore[union-attr] assert self.lightning_module is not None # Horovod: broadcast parameters & optimizer state to ensure consistent initialization From caa0d594b1a4e26237add1780cc11d65833e5090 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 18 Aug 2022 10:57:13 +0000 Subject: [PATCH 189/230] Update torch requirement from <=1.12.0,>=1.9.* to >=1.9.0.a,<1.13.0 in /requirements (#14088) * Update torch requirement in /requirements Updates the requirements on [torch](https://github.com/pytorch/pytorch) to permit the latest version. 
- [Release notes](https://github.com/pytorch/pytorch/releases) - [Changelog](https://github.com/pytorch/pytorch/blob/master/RELEASE.md) - [Commits](https://github.com/pytorch/pytorch/compare/v1.9.0-rc1...v1.12.1) --- updated-dependencies: - dependency-name: torch dependency-type: direct:production ... Signed-off-by: dependabot[bot] * Update base.txt * Update adjust_versions.py Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Jirka Borovec Co-authored-by: Akihiro Nitta --- requirements/pytorch/adjust-versions.py | 3 ++- requirements/pytorch/base.txt | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/requirements/pytorch/adjust-versions.py b/requirements/pytorch/adjust-versions.py index 13e02798f5835..b5305014acc36 100644 --- a/requirements/pytorch/adjust-versions.py +++ b/requirements/pytorch/adjust-versions.py @@ -5,7 +5,8 @@ # IMPORTANT: this list needs to be sorted in reverse VERSIONS = [ - dict(torch="1.12.0", torchvision="0.13.0", torchtext="0.13.0"), # stable + dict(torch="1.12.1", torchvision="0.13.1", torchtext="0.13.1"), # stable + dict(torch="1.12.0", torchvision="0.13.0", torchtext="0.13.0"), dict(torch="1.11.0", torchvision="0.12.0", torchtext="0.12.0"), dict(torch="1.10.2", torchvision="0.11.3", torchtext="0.11.2"), dict(torch="1.10.1", torchvision="0.11.2", torchtext="0.11.1"), diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index ee892701dc041..0c72678229208 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -2,7 +2,7 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment numpy>=1.17.2, <1.23.1 -torch>=1.9.*, <=1.12.0 +torch>=1.9.*, <1.13.0 tqdm>=4.57.0, <4.65.0 PyYAML>=5.4, <=6.0 fsspec[http]>=2021.05.0, !=2021.06.0, <2022.6.0 From 8ab65ffade7976918b9295c630272f730aa6a71f Mon Sep 17 00:00:00 2001 From: Laverne Henderson Date: Thu, 18 Aug 2022 04:55:49 -0700 Subject: [PATCH 190/230] Fixes note formatting and more (#14264) Fixes formatting for a note and makes headings for arguments and parameters stand out --- docs/source-app/workflows/byoc/index.rst | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/docs/source-app/workflows/byoc/index.rst b/docs/source-app/workflows/byoc/index.rst index ba15112bac689..2cabf046939ba 100644 --- a/docs/source-app/workflows/byoc/index.rst +++ b/docs/source-app/workflows/byoc/index.rst @@ -46,15 +46,21 @@ Here's an example: lightning create cluster my-byoc-cluster --provider aws --role-arn arn:aws:iam::1234567890:role/lai-byoc --external-id dummy --region us-west-2 --instance-types t3.xlarge --enable-performance -..note:: Cluster creation is going to take an hour or more after you run this command. +.. note:: Cluster creation is going to take an hour or more after you run this command. -**Arguments** +---- + +Arguments +^^^^^^^^^ * cluster_name: The name of the cluster to be created .. note:: Cluster names can only contain lowercase letters, numbers, and periodic hyphens ( - ). -**Parameters:** +---- + +Parameters +^^^^^^^^^^ +------------------------+----------------------------------------------------------------------------------------------------+ |Parameter | Descritption | @@ -104,7 +110,7 @@ Delete a Lightning BYOC cluster Deletes a Lightning BYOC cluster. Lightning AI removes cluster artifacts and any resources running on the cluster. -.. 
warning:: Using the --force parameter when deleting a cluster does not clean up any resources managed by Lightning AI. Check your cloud provider to verify that existing cloud resources are deleted. +.. warning:: Using the ``--force`` parameter when deleting a cluster does not clean up any resources managed by Lightning AI. Check your cloud provider to verify that existing cloud resources are deleted. Deletion permanently removes not only the record of all runs on a cluster, but all associated artifacts, metrics, logs, etc. From e949362a6ba9ef5429b00286d891ffb4af1e8830 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Thu, 18 Aug 2022 17:42:29 +0530 Subject: [PATCH 191/230] Enable `on_before_batch_transfer` for `DPStrategy` and `IPUAccelerator` (#14023) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- .../accelerators/gpu_intermediate.rst | 4 +- .../source-pytorch/accelerators/ipu_basic.rst | 5 +- src/pytorch_lightning/CHANGELOG.md | 3 + src/pytorch_lightning/core/hooks.py | 7 -- src/pytorch_lightning/core/module.py | 39 ++++++----- src/pytorch_lightning/loggers/tensorboard.py | 1 + .../loops/dataloader/evaluation_loop.py | 12 ++-- .../loops/epoch/prediction_epoch_loop.py | 1 + src/pytorch_lightning/loops/fit_loop.py | 13 ++-- .../trainer/configuration_validator.py | 2 +- src/pytorch_lightning/utilities/fetching.py | 4 +- .../utilities/model_summary/model_summary.py | 1 + tests/tests_pytorch/core/test_datamodules.py | 69 ------------------- tests/tests_pytorch/models/test_hooks.py | 15 +--- .../trainer/test_config_validator.py | 2 +- 15 files changed, 54 insertions(+), 124 deletions(-) diff --git a/docs/source-pytorch/accelerators/gpu_intermediate.rst b/docs/source-pytorch/accelerators/gpu_intermediate.rst index 4ea765d94675f..a6a641168658e 100644 --- a/docs/source-pytorch/accelerators/gpu_intermediate.rst +++ b/docs/source-pytorch/accelerators/gpu_intermediate.rst @@ -47,8 +47,8 @@ after which the root node will aggregate the results. :doc:`Manual Optimization <../model/manual_optimization>` with DP. Use DDP which is more stable and at least 3x faster. .. warning:: DP only supports scattering and gathering primitive collections of tensors like lists, dicts, etc. - Therefore the hooks :meth:`~pytorch_lightning.core.hooks.ModelHooks.on_before_batch_transfer`, - :meth:`~pytorch_lightning.core.hooks.ModelHooks.transfer_batch_to_device` and :meth:`~pytorch_lightning.core.hooks.ModelHooks.on_after_batch_transfer` + Therefore :meth:`~pytorch_lightning.core.hooks.ModelHooks.transfer_batch_to_device` and + :meth:`~pytorch_lightning.core.hooks.ModelHooks.on_after_batch_transfer` do not apply in this mode and if you have overridden any of them, an exception will be raised. .. 
testcode:: diff --git a/docs/source-pytorch/accelerators/ipu_basic.rst b/docs/source-pytorch/accelerators/ipu_basic.rst index 5302945fc6cc4..06cd056029bcc 100644 --- a/docs/source-pytorch/accelerators/ipu_basic.rst +++ b/docs/source-pytorch/accelerators/ipu_basic.rst @@ -67,6 +67,5 @@ Please see the `MNIST example List[Logger]: """Reference to the list of loggers in the Trainer.""" return self.trainer.loggers if self._trainer else [] + def _call_batch_hook(self, hook_name, *args) -> Any: + if self._trainer: + datahook_selector = self._trainer._data_connector._datahook_selector + obj = datahook_selector.get_instance(hook_name) + trainer_method = ( + self._trainer._call_lightning_module_hook + if isinstance(obj, self.__class__) + else self._trainer._call_lightning_datamodule_hook + ) + return trainer_method(hook_name, *args) + else: + hook = getattr(self, hook_name) + return hook(*args) + + def _on_before_batch_transfer(self, batch: Any, dataloader_idx: int = 0) -> Any: + return self._call_batch_hook("on_before_batch_transfer", batch, dataloader_idx) + def _apply_batch_transfer_handler( self, batch: Any, device: Optional[torch.device] = None, dataloader_idx: int = 0 ) -> Any: device = device or self.device - - def call_hook(hook_name, *args): - if self._trainer: - datahook_selector = self._trainer._data_connector._datahook_selector - obj = datahook_selector.get_instance(hook_name) - trainer_method = ( - self._trainer._call_lightning_module_hook - if isinstance(obj, self.__class__) - else self._trainer._call_lightning_datamodule_hook - ) - return trainer_method(hook_name, *args) - else: - hook = getattr(self, hook_name) - return hook(*args) - - batch = call_hook("on_before_batch_transfer", batch, dataloader_idx) - batch = call_hook("transfer_batch_to_device", batch, device, dataloader_idx) - batch = call_hook("on_after_batch_transfer", batch, dataloader_idx) + batch = self._call_batch_hook("transfer_batch_to_device", batch, device, dataloader_idx) + batch = self._call_batch_hook("on_after_batch_transfer", batch, dataloader_idx) return batch def print(self, *args, **kwargs) -> None: @@ -1822,6 +1823,7 @@ def to_onnx(self, file_path: Union[str, Path], input_sample: Optional[Any] = Non ) input_sample = self.example_input_array + input_sample = self._on_before_batch_transfer(input_sample) input_sample = self._apply_batch_transfer_handler(input_sample) if not _TORCH_GREATER_EQUAL_1_10 and "example_outputs" not in kwargs: @@ -1902,6 +1904,7 @@ def to_torchscript( example_inputs = self.example_input_array # automatically send example inputs to the right device and use trace + example_inputs = self._on_before_batch_transfer(example_inputs) example_inputs = self._apply_batch_transfer_handler(example_inputs) torchscript_module = torch.jit.trace(func=self.eval(), example_inputs=example_inputs, **kwargs) else: diff --git a/src/pytorch_lightning/loggers/tensorboard.py b/src/pytorch_lightning/loggers/tensorboard.py index dacecf129523b..25e1fa8be5193 100644 --- a/src/pytorch_lightning/loggers/tensorboard.py +++ b/src/pytorch_lightning/loggers/tensorboard.py @@ -242,6 +242,7 @@ def log_graph(self, model: "pl.LightningModule", input_array: Optional[Tensor] = input_array = model.example_input_array if input_array is not None: + input_array = model._on_before_batch_transfer(input_array) input_array = model._apply_batch_transfer_handler(input_array) model._running_torchscript = True self.experiment.add_graph(model, input_array) diff --git a/src/pytorch_lightning/loops/dataloader/evaluation_loop.py 
b/src/pytorch_lightning/loops/dataloader/evaluation_loop.py index 3a9c9ec0ac391..c8ab1e9cc921b 100644 --- a/src/pytorch_lightning/loops/dataloader/evaluation_loop.py +++ b/src/pytorch_lightning/loops/dataloader/evaluation_loop.py @@ -15,7 +15,6 @@ import shutil import sys from collections import ChainMap, OrderedDict -from functools import partial from typing import Any, Iterable, List, Optional, Sequence, Tuple, Type, Union from deprecate.utils import void @@ -142,11 +141,14 @@ def advance(self, *args: Any, **kwargs: Any) -> None: dataloader_idx = self.current_dataloader_idx dataloader = self.current_dataloader + + def batch_to_device(batch: Any) -> Any: + batch = self.trainer.lightning_module._on_before_batch_transfer(batch, dataloader_idx=dataloader_idx) + batch = self.trainer._call_strategy_hook("batch_to_device", batch, dataloader_idx=dataloader_idx) + return batch + assert self._data_fetcher is not None - self._data_fetcher.setup( - dataloader, - batch_to_device=partial(self.trainer._call_strategy_hook, "batch_to_device", dataloader_idx=dataloader_idx), - ) + self._data_fetcher.setup(dataloader, batch_to_device=batch_to_device) dl_max_batches = self._max_batches[dataloader_idx] kwargs = OrderedDict() diff --git a/src/pytorch_lightning/loops/epoch/prediction_epoch_loop.py b/src/pytorch_lightning/loops/epoch/prediction_epoch_loop.py index ba16c56feee4c..2c481522f9eaa 100644 --- a/src/pytorch_lightning/loops/epoch/prediction_epoch_loop.py +++ b/src/pytorch_lightning/loops/epoch/prediction_epoch_loop.py @@ -94,6 +94,7 @@ def advance( # type: ignore[override] if batch is None: raise StopIteration + batch = self.trainer.lightning_module._on_before_batch_transfer(batch, dataloader_idx=dataloader_idx) batch = self.trainer._call_strategy_hook("batch_to_device", batch, dataloader_idx=dataloader_idx) self.batch_progress.increment_ready() diff --git a/src/pytorch_lightning/loops/fit_loop.py b/src/pytorch_lightning/loops/fit_loop.py index 9e0d53c66b4e3..be45a6e3b094e 100644 --- a/src/pytorch_lightning/loops/fit_loop.py +++ b/src/pytorch_lightning/loops/fit_loop.py @@ -13,8 +13,7 @@ # limitations under the License. 
import logging import os -from functools import partial -from typing import Optional, Type +from typing import Any, Optional, Type import pytorch_lightning as pl from pytorch_lightning.accelerators import CUDAAccelerator @@ -262,10 +261,14 @@ def advance(self) -> None: # type: ignore[override] log.detail(f"{self.__class__.__name__}: advancing loop") assert self.trainer.train_dataloader is not None dataloader = self.trainer.train_dataloader + + def batch_to_device(batch: Any) -> Any: + batch = self.trainer.lightning_module._on_before_batch_transfer(batch, dataloader_idx=0) + batch = self.trainer._call_strategy_hook("batch_to_device", batch, dataloader_idx=0) + return batch + assert self._data_fetcher is not None - self._data_fetcher.setup( - dataloader, batch_to_device=partial(self.trainer._call_strategy_hook, "batch_to_device", dataloader_idx=0) - ) + self._data_fetcher.setup(dataloader, batch_to_device=batch_to_device) with self.trainer.profiler.profile("run_training_epoch"): self._outputs = self.epoch_loop.run(self._data_fetcher) diff --git a/src/pytorch_lightning/trainer/configuration_validator.py b/src/pytorch_lightning/trainer/configuration_validator.py index 6cf3e6d52ed95..74c477b245708 100644 --- a/src/pytorch_lightning/trainer/configuration_validator.py +++ b/src/pytorch_lightning/trainer/configuration_validator.py @@ -151,7 +151,7 @@ def __verify_eval_loop_configuration(trainer: "pl.Trainer", model: "pl.Lightning def __verify_batch_transfer_support(trainer: "pl.Trainer", model: "pl.LightningModule") -> None: """Raise Misconfiguration exception since these hooks are not supported in DP mode.""" - batch_transfer_hooks = ("on_before_batch_transfer", "transfer_batch_to_device", "on_after_batch_transfer") + batch_transfer_hooks = ("transfer_batch_to_device", "on_after_batch_transfer") datahook_selector = trainer._data_connector._datahook_selector for hook in batch_transfer_hooks: # TODO: Remove this blocker once batch transfer to device is integrated in Lightning for DP mode. 
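As a rough user-side sketch of what the change above enables (the class name, the normalization, and the tensor layout below are illustrative assumptions, not taken from the patch): because `on_before_batch_transfer` is now invoked on its own, before the strategy's `batch_to_device` call, the hook can be overridden even when running with the DP strategy or the IPU accelerator, and the batch it receives has typically not yet been moved to the device:

    import pytorch_lightning as pl


    class NormalizingModule(pl.LightningModule):
        def on_before_batch_transfer(self, batch, dataloader_idx):
            # Runs before the strategy moves the batch to the device, so `x` and
            # `y` are usually still CPU tensors here; cheap per-batch
            # preprocessing such as normalization fits well at this point.
            x, y = batch
            x = (x - x.mean()) / (x.std() + 1e-6)
            return x, y
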
diff --git a/src/pytorch_lightning/utilities/fetching.py b/src/pytorch_lightning/utilities/fetching.py index a4518e147da02..4869b72134c25 100644 --- a/src/pytorch_lightning/utilities/fetching.py +++ b/src/pytorch_lightning/utilities/fetching.py @@ -219,7 +219,9 @@ def __init__(self, prefetch_batches: int = 1, store_on_device: bool = True) -> N self._has_len = False def setup( # type: ignore[override] - self, dataloader: Iterable, batch_to_device: Optional[Callable[[Any], Any]] = None + self, + dataloader: Iterable, + batch_to_device: Optional[Callable[[Any], Any]] = None, ) -> None: super().setup(dataloader) self._has_len = has_len(dataloader) diff --git a/src/pytorch_lightning/utilities/model_summary/model_summary.py b/src/pytorch_lightning/utilities/model_summary/model_summary.py index d28aabfddc4f8..cb2cc0d64ba71 100644 --- a/src/pytorch_lightning/utilities/model_summary/model_summary.py +++ b/src/pytorch_lightning/utilities/model_summary/model_summary.py @@ -261,6 +261,7 @@ def _forward_example_input(self) -> None: trainer = self._model._trainer input_ = model.example_input_array + input_ = model._on_before_batch_transfer(input_) input_ = model._apply_batch_transfer_handler(input_) mode = model.training diff --git a/tests/tests_pytorch/core/test_datamodules.py b/tests/tests_pytorch/core/test_datamodules.py index 41a5eb874af2e..3521149fb2427 100644 --- a/tests/tests_pytorch/core/test_datamodules.py +++ b/tests/tests_pytorch/core/test_datamodules.py @@ -265,75 +265,6 @@ def test_full_loop(tmpdir): assert result[0]["test_acc"] > 0.6 -@pytest.mark.parametrize( - "accelerator,device", - [ - pytest.param("gpu", "cuda:0", marks=RunIf(min_cuda_gpus=1)), - pytest.param("mps", "mps:0", marks=RunIf(mps=True)), - ], -) -@mock.patch( - "pytorch_lightning.strategies.Strategy.lightning_module", - new_callable=PropertyMock, -) -def test_dm_apply_batch_transfer_handler(get_module_mock, accelerator, device): - expected_device = torch.device(device) - - class CustomBatch: - def __init__(self, data): - self.samples = data[0] - self.targets = data[1] - - class CurrentTestDM(LightningDataModule): - rank = 0 - transfer_batch_to_device_hook_rank = None - on_before_batch_transfer_hook_rank = None - on_after_batch_transfer_hook_rank = None - - def on_before_batch_transfer(self, batch, dataloader_idx): - assert dataloader_idx == 0 - self.on_before_batch_transfer_hook_rank = self.rank - self.rank += 1 - batch.samples += 1 - return batch - - def on_after_batch_transfer(self, batch, dataloader_idx): - assert dataloader_idx == 0 - assert batch.samples.device == batch.targets.device == expected_device - self.on_after_batch_transfer_hook_rank = self.rank - self.rank += 1 - batch.targets *= 2 - return batch - - def transfer_batch_to_device(self, batch, device, dataloader_idx): - assert dataloader_idx == 0 - self.transfer_batch_to_device_hook_rank = self.rank - self.rank += 1 - batch.samples = batch.samples.to(device) - batch.targets = batch.targets.to(device) - return batch - - dm = CurrentTestDM() - model = BoringModel() - - batch = CustomBatch((torch.zeros(5, 32), torch.ones(5, 1, dtype=torch.long))) - - trainer = Trainer(accelerator=accelerator, devices=1) - model.trainer = trainer - # running .fit() would require us to implement custom data loaders, we mock the model reference instead - get_module_mock.return_value = model - - trainer._data_connector.attach_datamodule(model, datamodule=dm) - batch_gpu = trainer.strategy.batch_to_device(batch, expected_device) - - assert dm.on_before_batch_transfer_hook_rank == 0 - 
assert dm.transfer_batch_to_device_hook_rank == 1 - assert dm.on_after_batch_transfer_hook_rank == 2 - assert batch_gpu.samples.device == batch_gpu.targets.device == expected_device - assert torch.allclose(batch_gpu.samples.cpu(), torch.ones(5, 32)) - assert torch.allclose(batch_gpu.targets.cpu(), torch.ones(5, 1, dtype=torch.long) * 2) - - def test_dm_reload_dataloaders_every_n_epochs(tmpdir): """Test datamodule, where trainer argument reload_dataloaders_every_n_epochs is set to a non negative integer.""" diff --git a/tests/tests_pytorch/models/test_hooks.py b/tests/tests_pytorch/models/test_hooks.py index a2235c592d5fb..158371a3097c5 100644 --- a/tests/tests_pytorch/models/test_hooks.py +++ b/tests/tests_pytorch/models/test_hooks.py @@ -133,16 +133,8 @@ def __init__(self, data): class CurrentTestModel(BoringModel): rank = 0 transfer_batch_to_device_hook_rank = None - on_before_batch_transfer_hook_rank = None on_after_batch_transfer_hook_rank = None - def on_before_batch_transfer(self, batch, dataloader_idx): - assert dataloader_idx == 0 - self.on_before_batch_transfer_hook_rank = self.rank - self.rank += 1 - batch.samples += 1 - return batch - def on_after_batch_transfer(self, batch, dataloader_idx): assert dataloader_idx == 0 assert batch.samples.device == batch.targets.device == expected_device @@ -168,11 +160,10 @@ def transfer_batch_to_device(self, batch, device, dataloader_idx): model_getter_mock.return_value = model batch_gpu = trainer.strategy.batch_to_device(batch, expected_device) - assert model.on_before_batch_transfer_hook_rank == 0 - assert model.transfer_batch_to_device_hook_rank == 1 - assert model.on_after_batch_transfer_hook_rank == 2 + assert model.transfer_batch_to_device_hook_rank == 0 + assert model.on_after_batch_transfer_hook_rank == 1 assert batch_gpu.samples.device == batch_gpu.targets.device == expected_device - assert torch.allclose(batch_gpu.samples.cpu(), torch.ones(5, 32)) + assert torch.allclose(batch_gpu.samples.cpu(), torch.zeros(5, 32)) assert torch.allclose(batch_gpu.targets.cpu(), torch.ones(5, 1, dtype=torch.long) * 2) diff --git a/tests/tests_pytorch/trainer/test_config_validator.py b/tests/tests_pytorch/trainer/test_config_validator.py index bb973fe10ca1c..7fba63ba7ae24 100644 --- a/tests/tests_pytorch/trainer/test_config_validator.py +++ b/tests/tests_pytorch/trainer/test_config_validator.py @@ -197,7 +197,7 @@ def setup(self, pl_module, trainer): @pytest.mark.parametrize("trainer_kwargs", [{"accelerator": "ipu"}, {"accelerator": "gpu", "strategy": "dp"}]) -@pytest.mark.parametrize("hook", ["on_before_batch_transfer", "transfer_batch_to_device", "on_after_batch_transfer"]) +@pytest.mark.parametrize("hook", ["transfer_batch_to_device", "on_after_batch_transfer"]) def test_raise_exception_with_batch_transfer_hooks(monkeypatch, hook, trainer_kwargs, tmpdir): """Test that an exception is raised when overriding batch_transfer_hooks.""" if trainer_kwargs.get("accelerator") == "gpu": From bd55b56f56e9ecf10839ea11ef30ac158bc547e1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 18 Aug 2022 14:14:09 +0200 Subject: [PATCH 192/230] Update scikit-learn requirement from <=1.1.1,>0.22.1 to >0.22.1,<1.1.3 in /requirements (#14276) Update scikit-learn requirement in /requirements Updates the requirements on [scikit-learn](https://github.com/scikit-learn/scikit-learn) to permit the latest version. 
- [Release notes](https://github.com/scikit-learn/scikit-learn/releases) - [Commits](https://github.com/scikit-learn/scikit-learn/compare/0.22.2...1.1.2) --- updated-dependencies: - dependency-name: scikit-learn dependency-type: direct:production ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements/pytorch/test.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/pytorch/test.txt b/requirements/pytorch/test.txt index f8bd5793a0af6..87f619e64e8c5 100644 --- a/requirements/pytorch/test.txt +++ b/requirements/pytorch/test.txt @@ -9,7 +9,7 @@ mypy==0.971 # needed in tests cloudpickle>=1.3, <=2.1.0 -scikit-learn>0.22.1, <=1.1.1 +scikit-learn>0.22.1, <1.1.3 onnxruntime<1.13.0 psutil<=5.9.1 # for `DeviceStatsMonitor` pandas>1.0, <=1.4.3 # needed in benchmarks From 7879628a3a0ff0b7ef6b48f91bc1d2d4375389e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 18 Aug 2022 14:55:08 +0200 Subject: [PATCH 193/230] Fix access to logger attribute when multiple loggers are used (#14234) * Fix access to logger attribute when multiple loggers are used * add changelog --- src/pytorch_lightning/CHANGELOG.md | 2 ++ src/pytorch_lightning/core/module.py | 3 ++- tests/tests_pytorch/deprecated_api/test_remove_1-8.py | 6 ++++++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 714fb82fcc87a..d4d717a94dd4f 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -70,6 +70,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed an assertion error when using a `ReduceOnPlateau` scheduler with the Horovod strategy ([#14215](https://github.com/Lightning-AI/lightning/pull/14215)) +- Fixed an `AttributeError` when accessing `LightningModule.logger` and the Trainer has multiple loggers ([#14234](https://github.com/Lightning-AI/lightning/pull/14234)) + ## [1.7.2] - 2022-08-17 diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py index e3d5cf4e97c11..e02552547d3b8 100644 --- a/src/pytorch_lightning/core/module.py +++ b/src/pytorch_lightning/core/module.py @@ -18,6 +18,7 @@ import numbers import os import tempfile +import warnings import weakref from contextlib import contextmanager from pathlib import Path @@ -38,7 +39,7 @@ from pytorch_lightning.core.saving import ModelIO from pytorch_lightning.loggers import Logger, LoggerCollection from pytorch_lightning.trainer.connectors.logger_connector.fx_validator import _FxValidator -from pytorch_lightning.utilities import _IS_WINDOWS, _TORCH_GREATER_EQUAL_1_10, GradClipAlgorithmType, warnings +from pytorch_lightning.utilities import _IS_WINDOWS, _TORCH_GREATER_EQUAL_1_10, GradClipAlgorithmType from pytorch_lightning.utilities.apply_func import apply_to_collection, convert_to_tensors from pytorch_lightning.utilities.cloud_io import get_filesystem from pytorch_lightning.utilities.distributed import distributed_available, sync_ddp diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py index 91be34c55078f..a69071fd67610 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-8.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-8.py @@ -692,6 +692,12 @@ def test_v1_8_0_logger_collection(tmpdir): with pytest.deprecated_call(match="`LoggerCollection` is 
deprecated in v1.6"): _ = LoggerCollection([logger1, logger2]) + model = BoringModel() + trainer = Trainer(logger=[logger1, logger2]) + model.trainer = trainer + with pytest.deprecated_call(match="logger` will return the first logger"): + _ = model.logger + def test_v1_8_0_precision_plugin_checkpoint_hooks(tmpdir): class PrecisionPluginSaveHook(PrecisionPlugin): From 326f7565b06d3b479b07b84ece5a18d92375e7d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Thu, 18 Aug 2022 16:06:39 +0200 Subject: [PATCH 194/230] Forward extra keyword arguments in `LightningDataModule.from_datasets` (#14185) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: otaj Co-authored-by: Carlos Mocholí --- src/pytorch_lightning/CHANGELOG.md | 3 +- src/pytorch_lightning/core/datamodule.py | 30 ++++++++---- tests/tests_pytorch/core/test_datamodules.py | 48 ++++++++++++++++++++ 3 files changed, 72 insertions(+), 9 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index d4d717a94dd4f..8646b1e1848bb 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -12,7 +12,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Added prefix to log message in `seed_everything` with rank info ([#13290](https://github.com/Lightning-AI/lightning/issues/13290)) -- +- Added support for passing extra init-parameters to the `LightningDataModule.from_datasets` ([#14185](https://github.com/Lightning-AI/lightning/issues/14185)) + ### Changed diff --git a/src/pytorch_lightning/core/datamodule.py b/src/pytorch_lightning/core/datamodule.py index 60a010ff7c3b9..4edde3fe6a3ae 100644 --- a/src/pytorch_lightning/core/datamodule.py +++ b/src/pytorch_lightning/core/datamodule.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """LightningDataModule for loading DataLoaders with ease.""" +import inspect from argparse import ArgumentParser, Namespace from typing import Any, Dict, IO, List, Mapping, Optional, Sequence, Tuple, Union @@ -109,19 +110,22 @@ def from_datasets( predict_dataset: Optional[Union[Dataset, Sequence[Dataset]]] = None, batch_size: int = 1, num_workers: int = 0, + **datamodule_kwargs: Any, ): r""" Create an instance from torch.utils.data.Dataset. Args: - train_dataset: (optional) Dataset to be used for train_dataloader() - val_dataset: (optional) Dataset or list of Dataset to be used for val_dataloader() - test_dataset: (optional) Dataset or list of Dataset to be used for test_dataloader() - predict_dataset: (optional) Dataset or list of Dataset to be used for predict_dataloader() - batch_size: Batch size to use for each dataloader. Default is 1. + train_dataset: Optional dataset to be used for train_dataloader() + val_dataset: Optional dataset or list of Dataset to be used for val_dataloader() + test_dataset: Optional dataset or list of Dataset to be used for test_dataloader() + predict_dataset: Optional dataset or list of Dataset to be used for predict_dataloader() + batch_size: Batch size to use for each dataloader. Default is 1. This parameter gets forwarded to the + ``__init__`` if the datamodule has such a name defined in its signature. num_workers: Number of subprocesses to use for data loading. 0 means that the - data will be loaded in the main process. Number of CPUs available. - + data will be loaded in the main process. Number of CPUs available. 
This parameter gets forwarded to the + ``__init__`` if the datamodule has such a name defined in its signature. + **datamodule_kwargs: Additional parameters that get passed down to the datamodule's ``__init__``. """ def dataloader(ds: Dataset, shuffle: bool = False) -> DataLoader: @@ -150,7 +154,17 @@ def predict_dataloader(): return [dataloader(ds) for ds in predict_dataset] return dataloader(predict_dataset) - datamodule = cls() + candidate_kwargs = dict(batch_size=batch_size, num_workers=num_workers) + accepted_params = inspect.signature(cls.__init__).parameters + accepts_kwargs = any(param.kind == param.VAR_KEYWORD for param in accepted_params.values()) + if accepts_kwargs: + special_kwargs = candidate_kwargs + else: + accepted_params = set(accepted_params) + accepted_params.discard("self") + special_kwargs = {k: v for k, v in candidate_kwargs.items() if k in accepted_params} + + datamodule = cls(**datamodule_kwargs, **special_kwargs) if train_dataset is not None: datamodule.train_dataloader = train_dataloader if val_dataset is not None: diff --git a/tests/tests_pytorch/core/test_datamodules.py b/tests/tests_pytorch/core/test_datamodules.py index 3521149fb2427..23419c102eb2c 100644 --- a/tests/tests_pytorch/core/test_datamodules.py +++ b/tests/tests_pytorch/core/test_datamodules.py @@ -366,6 +366,54 @@ def test_dm_init_from_datasets_dataloaders(iterable): ) +def test_dm_init_from_datasets_with_init_params(): + """Test that extra kwargs can be passed down to the init via the ``LightningDataModule.from_datasets`` method. + + The two special arguments batch_size and num_workers get passed down depending on whether the __init__ accepts them. + """ + # No additional parameters + LightningDataModule.from_datasets(DummyDS(), batch_size=4, num_workers=2) + + class KnownExtraParametersDataModule(LightningDataModule): + def __init__(self, batch_size=1, num_workers=0): + super().__init__() + self.batch_size = batch_size + self.num_workers = num_workers + + # batch_size and num_workers get special treatment - they are part of the `from_datasets` signature + dm = KnownExtraParametersDataModule.from_datasets(DummyDS(), batch_size=4, num_workers=2) + assert dm.batch_size == 4 + assert dm.num_workers == 2 + + class UnknownExtraParametersDataModule(LightningDataModule): + def __init__(self, other, batch_size=1): + super().__init__() + self.other = other + self.batch_size = batch_size + + # additional parameter `other` gets forwarded, alongside the special `batch_size` parameter + dm = UnknownExtraParametersDataModule.from_datasets(DummyDS(), batch_size=4, num_workers=2, other=5) + assert dm.batch_size == 4 + assert dm.other == 5 + + # positional arguments raise an error as they would when instantiating the datamodule normally + with pytest.raises(TypeError, match="missing 1 required positional argument: 'other'"): + UnknownExtraParametersDataModule.from_datasets(DummyDS(), batch_size=4, num_workers=2) + + class KwargsParametersDataModule(LightningDataModule): + def __init__(self, num_workers, **kwargs): + super().__init__() + self.num_workers = num_workers + for key, value in kwargs.items(): + setattr(self, key, value) + + # everything gets forwarded, because there is `**kwargs` present + dm = KwargsParametersDataModule.from_datasets(DummyDS(), batch_size=10, num_workers=100, another=None) + assert dm.batch_size == 10 + assert dm.num_workers == 100 + assert dm.another is None + + # all args class DataModuleWithHparams_0(LightningDataModule): def __init__(self, arg0, arg1, kwarg0=None): From 
6285a83d381503ce40e328844bae4130f7bf0a46 Mon Sep 17 00:00:00 2001 From: Akihiro Nitta Date: Fri, 19 Aug 2022 00:07:15 +0900 Subject: [PATCH 195/230] CI: Let dependabot check GHA updates weekly (#14274) --- .github/dependabot.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index ab67c9026b55b..69ec41a16537e 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -27,7 +27,7 @@ updates: directory: "/" # Check for updates once a week schedule: - interval: "monthly" + interval: "weekly" # Labels on pull requests for version updates only labels: - "ci" From e64a4c3836e19af432a9ebe60be483e2435f946b Mon Sep 17 00:00:00 2001 From: Hyunjoo Lee Date: Fri, 19 Aug 2022 02:16:22 +0900 Subject: [PATCH 196/230] Fix mypy typing errors attributed to `pytorch_lightning/demos/mnist_datamodule.py` (#13929) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli Co-authored-by: Rohit Gupta --- src/pytorch_lightning/demos/mnist_datamodule.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/pytorch_lightning/demos/mnist_datamodule.py b/src/pytorch_lightning/demos/mnist_datamodule.py index bfa428899c95a..0f5eea99f3e96 100644 --- a/src/pytorch_lightning/demos/mnist_datamodule.py +++ b/src/pytorch_lightning/demos/mnist_datamodule.py @@ -13,11 +13,10 @@ # limitations under the License. import logging import os -import platform import random import time import urllib -from typing import Any, Callable, Optional, Tuple +from typing import Any, Callable, Optional, Tuple, Union from urllib.error import HTTPError from warnings import warn @@ -26,7 +25,7 @@ from torch.utils.data import DataLoader, Dataset, random_split from pytorch_lightning import LightningDataModule -from pytorch_lightning.utilities.imports import _TORCHVISION_AVAILABLE +from pytorch_lightning.utilities.imports import _IS_WINDOWS, _TORCHVISION_AVAILABLE if _TORCHVISION_AVAILABLE: from torchvision import transforms as transform_lib @@ -113,13 +112,14 @@ def _try_load(path_data: str, trials: int = 30, delta: float = 1.0) -> Tuple[Ten time.sleep(delta * random.random()) else: break + assert res is not None if exception is not None: # raise the caught exception raise exception return res @staticmethod - def normalize_tensor(tensor: Tensor, mean: float = 0.0, std: float = 1.0) -> Tensor: + def normalize_tensor(tensor: Tensor, mean: Union[int, float] = 0.0, std: Union[int, float] = 1.0) -> Tensor: mean = torch.as_tensor(mean, dtype=tensor.dtype, device=tensor.device) std = torch.as_tensor(std, dtype=tensor.dtype, device=tensor.device) return tensor.sub(mean).div(std) @@ -171,7 +171,7 @@ def __init__( batch_size: desired batch size. """ super().__init__(*args, **kwargs) - if num_workers and platform.system() == "Windows": + if num_workers and _IS_WINDOWS: # see: https://stackoverflow.com/a/59680818 warn( f"You have requested num_workers={num_workers} on Windows," @@ -185,8 +185,6 @@ def __init__( self.normalize = normalize self.seed = seed self.batch_size = batch_size - self.dataset_train = ... - self.dataset_val = ... 
@property def num_classes(self) -> int: From f7ac57050ff0065ad05535b54ab9501fe1fb7db7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Thu, 18 Aug 2022 20:26:58 +0200 Subject: [PATCH 197/230] Update mising CODEOWNERS for the PL package (#14280) * Update CODEOWNERS * Update .github/CODEOWNERS --- .github/CODEOWNERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 0b4692731bff9..dc3b2187d0fd5 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -26,7 +26,7 @@ /docs/source-app/expertise_levels @williamfalcon @Felonious-Spellfire @RobertLaurella # Packages -/src/pytorch_lightning @carmocca @justusschock +/src/pytorch_lightning @borda @awaelchli @carmocca @justusschock @rohitgr7 @otaj /src/pytorch_lightning/accelerators @williamfalcon @tchaton @SeanNaren @awaelchli @justusschock @kaushikb11 /src/pytorch_lightning/callbacks @williamfalcon @tchaton @carmocca @borda @kaushikb11 /src/pytorch_lightning/core @tchaton @borda @carmocca @justusschock @kaushikb11 From d9c60901705b55bb45149eebf8c0e442d4bec115 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Fri, 19 Aug 2022 00:04:21 +0530 Subject: [PATCH 198/230] Deprecate `on_colab_kaggle` func (#14247) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- src/pytorch_lightning/CHANGELOG.md | 2 ++ src/pytorch_lightning/strategies/utils.py | 4 ++++ tests/tests_pytorch/deprecated_api/test_remove_1-10.py | 6 ++++++ 3 files changed, 12 insertions(+) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 8646b1e1848bb..d93c4ab479b94 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -48,6 +48,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Deprecated the `pl_module` argument in `LightningParallelModule`, `LightningDistributedModule`, `LightningShardedDataParallel`, `LightningBaguaModule` and `LightningDeepSpeedModule` wrapper classes ([#13738](https://github.com/Lightning-AI/lightning/pull/13738)) +- Deprecated the `on_colab_kaggle` function ([#14247](https://github.com/Lightning-AI/lightning/pull/14247)) + ### Removed diff --git a/src/pytorch_lightning/strategies/utils.py b/src/pytorch_lightning/strategies/utils.py index cdae7bf434eca..ec7a1bd6ffb89 100644 --- a/src/pytorch_lightning/strategies/utils.py +++ b/src/pytorch_lightning/strategies/utils.py @@ -16,9 +16,13 @@ import torch from pytorch_lightning.utilities.enums import PrecisionType +from pytorch_lightning.utilities.rank_zero import rank_zero_deprecation def on_colab_kaggle() -> bool: + rank_zero_deprecation( + "The function `on_colab_kaggle` has been deprecated in v1.8.0 and will be removed in v1.10.0." 
+ ) return bool(os.getenv("COLAB_GPU") or os.getenv("KAGGLE_URL_BASE")) diff --git a/tests/tests_pytorch/deprecated_api/test_remove_1-10.py b/tests/tests_pytorch/deprecated_api/test_remove_1-10.py index 186e526313bba..40a4069001505 100644 --- a/tests/tests_pytorch/deprecated_api/test_remove_1-10.py +++ b/tests/tests_pytorch/deprecated_api/test_remove_1-10.py @@ -22,6 +22,7 @@ from pytorch_lightning.strategies.bagua import LightningBaguaModule from pytorch_lightning.strategies.deepspeed import LightningDeepSpeedModule from pytorch_lightning.strategies.ipu import LightningIPUModule +from pytorch_lightning.strategies.utils import on_colab_kaggle from tests_pytorch.helpers.runif import RunIf from tests_pytorch.helpers.utils import no_warning_call @@ -63,3 +64,8 @@ def test_v1_10_deprecated_unwrap_lightning_module(): def test_v1_10_deprecated_unwrap_lightning_module_sharded(): with pytest.deprecated_call(match=r"The function `unwrap_lightning_module_sharded` is deprecated in v1.8.0"): unwrap_lightning_module_sharded(BoringModel()) + + +def test_v1_10_deprecated_on_colab_kaggle_func(): + with pytest.deprecated_call(match="The function `on_colab_kaggle` has been deprecated in v1.8.0"): + on_colab_kaggle() From d71dba372176b5f8b83d1d2ed0f85a82ac6091dc Mon Sep 17 00:00:00 2001 From: Raphael Randschau Date: Thu, 18 Aug 2022 14:12:59 -0700 Subject: [PATCH 199/230] BYOC: fix default types for cluster instance types (#14260) --- src/lightning_app/cli/lightning_cli_create.py | 2 +- tests/tests_app/cli/test_cli.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/lightning_app/cli/lightning_cli_create.py b/src/lightning_app/cli/lightning_cli_create.py index 7e9a6b9d2143b..dec940e89e259 100644 --- a/src/lightning_app/cli/lightning_cli_create.py +++ b/src/lightning_app/cli/lightning_cli_create.py @@ -79,7 +79,7 @@ def create_cluster( region=region, role_arn=role_arn, external_id=external_id, - instance_types=instance_types.split(",") if instance_types is not None else None, + instance_types=instance_types.split(",") if instance_types is not None else [], edit_before_creation=edit_before_creation, cost_savings=not enable_performance, wait=wait, diff --git a/tests/tests_app/cli/test_cli.py b/tests/tests_app/cli/test_cli.py index 48e1a26bb6f2b..428ba0e535328 100644 --- a/tests/tests_app/cli/test_cli.py +++ b/tests/tests_app/cli/test_cli.py @@ -75,8 +75,8 @@ def test_main_lightning_cli_help(): [ (["--instance-types", "t3.xlarge"], ["t3.xlarge"], True), (["--instance-types", "t3.xlarge,t3.2xlarge"], ["t3.xlarge", "t3.2xlarge"], True), - ([], None, True), - (["--enable-performance"], None, False), + ([], [], True), + (["--enable-performance"], [], False), ], ) def test_create_cluster( From a8c6e69b43d12a639c1f4de7abd9fac521e28d21 Mon Sep 17 00:00:00 2001 From: Kaushik B <45285388+kaushikb11@users.noreply.github.com> Date: Fri, 19 Aug 2022 09:40:44 +0530 Subject: [PATCH 200/230] Fix wrong num padding for RichProgressBar (#14296) --- src/pytorch_lightning/CHANGELOG.md | 3 +++ .../callbacks/progress/rich_progress.py | 7 +++---- .../callbacks/progress/test_rich_progress_bar.py | 12 ++++++++++++ 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index d93c4ab479b94..113400bb870aa 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -76,6 +76,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed an `AttributeError` when accessing `LightningModule.logger` and the Trainer has multiple loggers ([#14234](https://github.com/Lightning-AI/lightning/pull/14234)) +- Fixed wrong num padding for `RichProgressBar` ([#14296](https://github.com/Lightning-AI/lightning/pull/14296)) + + ## [1.7.2] - 2022-08-17 ### Added diff --git a/src/pytorch_lightning/callbacks/progress/rich_progress.py b/src/pytorch_lightning/callbacks/progress/rich_progress.py index ac27397640d4c..8ca2cb6671cab 100644 --- a/src/pytorch_lightning/callbacks/progress/rich_progress.py +++ b/src/pytorch_lightning/callbacks/progress/rich_progress.py @@ -451,13 +451,12 @@ def on_predict_batch_end(self, trainer, pl_module, outputs, batch, batch_idx, da def _get_train_description(self, current_epoch: int) -> str: train_description = f"Epoch {current_epoch}" + if self.trainer.max_epochs is not None: + train_description += f"/{self.trainer.max_epochs - 1}" if len(self.validation_description) > len(train_description): # Padding is required to avoid flickering due of uneven lengths of "Epoch X" # and "Validation" Bar description - num_digits = len(str(current_epoch)) - required_padding = (len(self.validation_description) - len(train_description) + 1) - num_digits - for _ in range(required_padding): - train_description += " " + train_description = f"{train_description:{len(self.validation_description)}}" return train_description def _stop_progress(self) -> None: diff --git a/tests/tests_pytorch/callbacks/progress/test_rich_progress_bar.py b/tests/tests_pytorch/callbacks/progress/test_rich_progress_bar.py index f1ccf2a2726a2..1638d618fe95f 100644 --- a/tests/tests_pytorch/callbacks/progress/test_rich_progress_bar.py +++ b/tests/tests_pytorch/callbacks/progress/test_rich_progress_bar.py @@ -400,3 +400,15 @@ def test_step(self, batch, batch_idx): trainer.test(model, verbose=False) assert pbar.calls["test"] == [] + + +@RunIf(rich=True) +def test_rich_progress_bar_padding(): + progress_bar = RichProgressBar() + trainer = Mock() + trainer.max_epochs = 1 + progress_bar._trainer = trainer + + train_description = progress_bar._get_train_description(current_epoch=0) + assert "Epoch 0/0" in train_description + assert len(progress_bar.validation_description) == len(train_description) From bbb406bce9493aeb104adb6aa40b9623201c2a29 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 19 Aug 2022 14:14:16 +0900 Subject: [PATCH 201/230] Update ipython[all] requirement from <=8.1.1 to <8.4.1 in /requirements (#14281) Updates the requirements on [ipython[all]](https://github.com/ipython/ipython) to permit the latest version. - [Release notes](https://github.com/ipython/ipython/releases) - [Commits](https://github.com/ipython/ipython/compare/rel-0.8.4...8.4.0) --- updated-dependencies: - dependency-name: ipython[all] dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements/pytorch/examples.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/pytorch/examples.txt b/requirements/pytorch/examples.txt index 288e3a10889c3..7f6682c974a47 100644 --- a/requirements/pytorch/examples.txt +++ b/requirements/pytorch/examples.txt @@ -3,4 +3,4 @@ torchvision>=0.10.*, <=0.13.0 gym[classic_control]>=0.17.0, <0.24.2 -ipython[all] <=8.1.1 +ipython[all] <8.4.1 From 2dbdb00b03a74ba32fa5e30c3ecc383b5ee767a8 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 19 Aug 2022 05:31:47 +0000 Subject: [PATCH 202/230] Bump actions/setup-python from 2 to 4 (#14287) Bumps [actions/setup-python](https://github.com/actions/setup-python) from 2 to 4. - [Release notes](https://github.com/actions/setup-python/releases) - [Commits](https://github.com/actions/setup-python/compare/v2...v4) --- updated-dependencies: - dependency-name: actions/setup-python dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/ci-app-cloud-e2e-test.yml | 2 +- .github/workflows/ci-pytorch-test-full.yml | 2 +- .github/workflows/ci-pytorch-test-slow.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci-app-cloud-e2e-test.yml b/.github/workflows/ci-app-cloud-e2e-test.yml index b2281389e7358..9a5a10a95cd33 100644 --- a/.github/workflows/ci-app-cloud-e2e-test.yml +++ b/.github/workflows/ci-app-cloud-e2e-test.yml @@ -25,7 +25,7 @@ jobs: steps: - uses: actions/checkout@v2 - name: Set up Python 3.8 - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: "3.8" diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 173e2a44a61f4..17a2c073274f7 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -59,7 +59,7 @@ jobs: - name: Set up Python ${{ matrix.python-version }} if: ${{ (steps.skip.outputs.continue == '1') }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} diff --git a/.github/workflows/ci-pytorch-test-slow.yml b/.github/workflows/ci-pytorch-test-slow.yml index 0bb9916ee302a..1e230770c8922 100644 --- a/.github/workflows/ci-pytorch-test-slow.yml +++ b/.github/workflows/ci-pytorch-test-slow.yml @@ -48,7 +48,7 @@ jobs: echo "::set-output name=continue::1" fi - - uses: actions/setup-python@v2 + - uses: actions/setup-python@v4 if: ${{ (steps.skip.outputs.continue == '1') }} with: python-version: ${{ matrix.python-version }} From 278df1cd209be7e1c5baa515b6401fe5281e273a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 19 Aug 2022 14:37:48 +0900 Subject: [PATCH 203/230] Bump actions/setup-node from 2 to 3 (#14286) Bumps [actions/setup-node](https://github.com/actions/setup-node) from 2 to 3. - [Release notes](https://github.com/actions/setup-node/releases) - [Commits](https://github.com/actions/setup-node/compare/v2...v3) --- updated-dependencies: - dependency-name: actions/setup-node dependency-type: direct:production update-type: version-update:semver-major ... 
Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/ci-app-examples.yml | 2 +- .github/workflows/ci-app-tests.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-app-examples.yml b/.github/workflows/ci-app-examples.yml index 01570f59c2c77..8af5a2fc5a39e 100644 --- a/.github/workflows/ci-app-examples.yml +++ b/.github/workflows/ci-app-examples.yml @@ -66,7 +66,7 @@ jobs: shell: bash - name: Setup Node.js - uses: actions/setup-node@v2 + uses: actions/setup-node@v3 with: node-version: '16' diff --git a/.github/workflows/ci-app-tests.yml b/.github/workflows/ci-app-tests.yml index fe3cc36dc16d3..8d9c538eb665b 100644 --- a/.github/workflows/ci-app-tests.yml +++ b/.github/workflows/ci-app-tests.yml @@ -77,7 +77,7 @@ jobs: # redis-port: 6379 - name: Setup Node.js - uses: actions/setup-node@v2 + uses: actions/setup-node@v3 with: node-version: '16' From c9b3cda0e0fe712aef75d6803af6bebd4485e3bc Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 19 Aug 2022 07:01:54 +0000 Subject: [PATCH 204/230] Bump actions/upload-artifact from 2 to 3 (#14289) Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 2 to 3. - [Release notes](https://github.com/actions/upload-artifact/releases) - [Commits](https://github.com/actions/upload-artifact/compare/v2...v3) --- updated-dependencies: - dependency-name: actions/upload-artifact dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/ci-app-cloud-e2e-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-app-cloud-e2e-test.yml b/.github/workflows/ci-app-cloud-e2e-test.yml index 9a5a10a95cd33..07b253f5d6f8c 100644 --- a/.github/workflows/ci-app-cloud-e2e-test.yml +++ b/.github/workflows/ci-app-cloud-e2e-test.yml @@ -160,7 +160,7 @@ jobs: # Delete the artifacts if successful rm -r ${VIDEO_LOCATION}/${{ matrix.app_name }} - - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v3 if: ${{ always() }} with: From 090bbc8605ed6aca80a509185ff341b5895275db Mon Sep 17 00:00:00 2001 From: Justin Goheen <26209687+JustinGoheen@users.noreply.github.com> Date: Fri, 19 Aug 2022 07:03:43 -0400 Subject: [PATCH 205/230] Fix mypy errors attributed to `pytorch_lightning.core.module.py` (#13603) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli Co-authored-by: Carlos Mocholí --- pyproject.toml | 1 - .../core/mixins/device_dtype_mixin.py | 2 +- src/pytorch_lightning/core/module.py | 105 ++++++++++-------- .../overrides/data_parallel.py | 4 +- src/pytorch_lightning/utilities/types.py | 1 + 5 files changed, 60 insertions(+), 53 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9f7cc28d0b002..45f65b4c444e0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,7 +52,6 @@ module = [ "pytorch_lightning.callbacks.progress.rich_progress", "pytorch_lightning.callbacks.quantization", "pytorch_lightning.core.datamodule", - "pytorch_lightning.core.module", "pytorch_lightning.demos.boring_classes", "pytorch_lightning.demos.mnist_datamodule", "pytorch_lightning.profilers.base", diff --git 
a/src/pytorch_lightning/core/mixins/device_dtype_mixin.py b/src/pytorch_lightning/core/mixins/device_dtype_mixin.py index 2916d8b07cb4e..5086583d8e26f 100644 --- a/src/pytorch_lightning/core/mixins/device_dtype_mixin.py +++ b/src/pytorch_lightning/core/mixins/device_dtype_mixin.py @@ -37,7 +37,7 @@ def dtype(self, new_dtype: Union[str, torch.dtype]) -> None: raise RuntimeError("Cannot set the dtype explicitly. Please use module.to(new_dtype).") @property - def device(self) -> Union[str, torch.device]: + def device(self) -> torch.device: device = self._device # make this more explicit to always include the index diff --git a/src/pytorch_lightning/core/module.py b/src/pytorch_lightning/core/module.py index e02552547d3b8..0926cc52ecd57 100644 --- a/src/pytorch_lightning/core/module.py +++ b/src/pytorch_lightning/core/module.py @@ -22,7 +22,7 @@ import weakref from contextlib import contextmanager from pathlib import Path -from typing import Any, Callable, Dict, List, Mapping, Optional, overload, Sequence, Tuple, Union +from typing import Any, Callable, Dict, Generator, List, Mapping, Optional, overload, Sequence, Tuple, Union import torch from torch import ScriptModule, Tensor @@ -47,12 +47,20 @@ from pytorch_lightning.utilities.imports import _TORCH_GREATER_EQUAL_1_11, _TORCH_GREATER_EQUAL_1_13 from pytorch_lightning.utilities.rank_zero import rank_zero_debug, rank_zero_deprecation, rank_zero_warn from pytorch_lightning.utilities.signature_utils import is_param_in_hook_signature -from pytorch_lightning.utilities.types import _METRIC_COLLECTION, EPOCH_OUTPUT, LRSchedulerTypeUnion, STEP_OUTPUT +from pytorch_lightning.utilities.types import ( + _METRIC_COLLECTION, + EPOCH_OUTPUT, + LRSchedulerPLType, + LRSchedulerTypeUnion, + STEP_OUTPUT, +) from pytorch_lightning.utilities.warnings import WarningCache warning_cache = WarningCache() log = logging.getLogger(__name__) +MODULE_OPTIMIZERS = Union[Optimizer, LightningOptimizer, List[Optimizer], List[LightningOptimizer]] + class LightningModule( DeviceDtypeModuleMixin, @@ -104,7 +112,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: self._current_fx_name: Optional[str] = None self._automatic_optimization: bool = True self._truncated_bptt_steps: int = 0 - self._param_requires_grad_state = {} + self._param_requires_grad_state: Dict[str, bool] = {} self._metric_attributes: Optional[Dict[int, str]] = None self._should_prevent_trainer_and_dataloaders_deepcopy: bool = False # TODO: remove in 1.8 @@ -121,14 +129,10 @@ def optimizers(self, use_pl_optimizer: Literal[False]) -> Union[Optimizer, List[ ... @overload - def optimizers( - self, use_pl_optimizer: bool - ) -> Union[Optimizer, LightningOptimizer, List[Optimizer], List[LightningOptimizer]]: + def optimizers(self, use_pl_optimizer: bool) -> MODULE_OPTIMIZERS: ... - def optimizers( - self, use_pl_optimizer: bool = True - ) -> Union[Optimizer, LightningOptimizer, List[Optimizer], List[LightningOptimizer]]: + def optimizers(self, use_pl_optimizer: bool = True) -> MODULE_OPTIMIZERS: """Returns the optimizer(s) that are being used during training. Useful for manual optimization. Args: @@ -140,7 +144,7 @@ def optimizers( A single optimizer, or a list of optimizers in case multiple ones are present. 
""" if use_pl_optimizer: - opts = list(self.trainer.strategy._lightning_optimizers.values()) + opts: MODULE_OPTIMIZERS = list(self.trainer.strategy._lightning_optimizers.values()) else: opts = self.trainer.optimizers @@ -150,7 +154,7 @@ def optimizers( # multiple opts return opts - def lr_schedulers(self) -> Optional[Union[LRSchedulerTypeUnion, List[LRSchedulerTypeUnion]]]: + def lr_schedulers(self) -> Union[None, List[LRSchedulerPLType], LRSchedulerPLType]: """Returns the learning rate scheduler(s) that are being used during training. Useful for manual optimization. @@ -162,7 +166,7 @@ def lr_schedulers(self) -> Optional[Union[LRSchedulerTypeUnion, List[LRScheduler return None # ignore other keys "interval", "frequency", etc. - lr_schedulers = [config.scheduler for config in self.trainer.lr_scheduler_configs] + lr_schedulers: List[LRSchedulerPLType] = [config.scheduler for config in self.trainer.lr_scheduler_configs] # single scheduler if len(lr_schedulers) == 1: @@ -175,13 +179,13 @@ def lr_schedulers(self) -> Optional[Union[LRSchedulerTypeUnion, List[LRScheduler def trainer(self) -> "pl.Trainer": if not self._running_torchscript and self._trainer is None: raise RuntimeError(f"{self.__class__.__qualname__} is not attached to a `Trainer`.") - return self._trainer + return self._trainer # type: ignore[return-value] @trainer.setter def trainer(self, trainer: Optional["pl.Trainer"]) -> None: for v in self.children(): if isinstance(v, LightningModule): - v.trainer = trainer + v.trainer = trainer # type: ignore[assignment] if trainer is not None and not isinstance(trainer, weakref.ProxyTypes): trainer = weakref.proxy(trainer) self._trainer = trainer @@ -228,7 +232,7 @@ def local_rank(self) -> int: return self.trainer.local_rank if self._trainer else 0 @property - def on_gpu(self): + def on_gpu(self) -> bool: """Returns ``True`` if this model is currently located on a GPU. Useful to set flags around the LightningModule for different CPU vs GPU behavior. @@ -264,7 +268,7 @@ def logger(self) -> Optional[Logger]: # this should match the implementation of `trainer.logger` # we don't reuse it so we can properly set the deprecation stacklevel if self._trainer is None: - return + return None loggers = self.trainer.loggers if len(loggers) == 0: return None @@ -287,15 +291,15 @@ def loggers(self) -> List[Logger]: """Reference to the list of loggers in the Trainer.""" return self.trainer.loggers if self._trainer else [] - def _call_batch_hook(self, hook_name, *args) -> Any: + def _call_batch_hook(self, hook_name: str, *args: Any) -> Any: if self._trainer: datahook_selector = self._trainer._data_connector._datahook_selector obj = datahook_selector.get_instance(hook_name) - trainer_method = ( - self._trainer._call_lightning_module_hook - if isinstance(obj, self.__class__) - else self._trainer._call_lightning_datamodule_hook - ) + if isinstance(obj, self.__class__): + trainer_method = self._trainer._call_lightning_module_hook + else: + trainer_method = self._trainer._call_lightning_datamodule_hook + return trainer_method(hook_name, *args) else: hook = getattr(self, hook_name) @@ -312,7 +316,7 @@ def _apply_batch_transfer_handler( batch = self._call_batch_hook("on_after_batch_transfer", batch, dataloader_idx) return batch - def print(self, *args, **kwargs) -> None: + def print(self, *args: Any, **kwargs: Any) -> None: r""" Prints only from process 0. Use this in any distributed mode to log only once. 
@@ -463,7 +467,7 @@ def log( logger=logger, on_step=on_step, on_epoch=on_epoch, - reduce_fx=reduce_fx, + reduce_fx=reduce_fx, # type: ignore[arg-type] enable_graph=enable_graph, add_dataloader_idx=add_dataloader_idx, batch_size=batch_size, @@ -578,7 +582,9 @@ def log_grad_norm(self, grad_norm_dict): """ self.log_dict(grad_norm_dict, on_step=True, on_epoch=True, prog_bar=False, logger=True) - def all_gather(self, data: Union[Tensor, Dict, List, Tuple], group: Optional[Any] = None, sync_grads: bool = False): + def all_gather( + self, data: Union[Tensor, Dict, List, Tuple], group: Optional[Any] = None, sync_grads: bool = False + ) -> Union[Tensor, Dict, List, Tuple]: r""" Allows users to call ``self.all_gather()`` from the LightningModule, thus making the ``all_gather`` operation accelerator agnostic. ``all_gather`` is a function provided by accelerators to gather a tensor from several @@ -598,7 +604,7 @@ def all_gather(self, data: Union[Tensor, Dict, List, Tuple], group: Optional[Any data = convert_to_tensors(data, device=self.device) return apply_to_collection(data, Tensor, all_gather, group=group, sync_grads=sync_grads) - def forward(self, *args, **kwargs) -> Any: + def forward(self, *args: Any, **kwargs: Any) -> Any: r""" Same as :meth:`torch.nn.Module.forward()`. @@ -611,7 +617,7 @@ def forward(self, *args, **kwargs) -> Any: """ return super().forward(*args, **kwargs) - def training_step(self, *args, **kwargs) -> STEP_OUTPUT: + def training_step(self, *args: Any, **kwargs: Any) -> STEP_OUTPUT: r""" Here you compute and return the training loss and some additional metrics for e.g. the progress bar or logger. @@ -769,7 +775,7 @@ def training_epoch_end(self, training_step_outputs): ... """ - def validation_step(self, *args, **kwargs) -> Optional[STEP_OUTPUT]: + def validation_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: r""" Operates on a single batch of data from the validation set. In this step you'd might generate examples or calculate anything of interest like accuracy. @@ -858,7 +864,7 @@ def validation_step(self, batch, batch_idx, dataloader_idx=0): the model goes back to training mode and gradients are enabled. """ - def validation_step_end(self, *args, **kwargs) -> Optional[STEP_OUTPUT]: + def validation_step_end(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: """Use this when validating with dp because :meth:`validation_step` will operate on only part of the batch. However, this is still optional and only needed for things like softmax or NCE loss. @@ -955,7 +961,7 @@ def validation_epoch_end(self, outputs): self.log("final_metric", final_value) """ - def test_step(self, *args, **kwargs) -> Optional[STEP_OUTPUT]: + def test_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: r""" Operates on a single batch of data from the test set. In this step you'd normally generate examples or calculate anything of interest @@ -1035,7 +1041,7 @@ def test_step(self, batch, batch_idx, dataloader_idx=0): to training mode and gradients are enabled. """ - def test_step_end(self, *args, **kwargs) -> Optional[STEP_OUTPUT]: + def test_step_end(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: """Use this when testing with DP because :meth:`test_step` will operate on only part of the batch. However, this is still optional and only needed for things like softmax or NCE loss. 
@@ -1200,7 +1206,7 @@ def configure_callbacks(self): """ return [] - def configure_optimizers(self): + def configure_optimizers(self) -> Any: r""" Choose what optimizers and learning-rate schedulers to use in your optimization. Normally you'd need one. But in the case of GANs or similar you might have multiple. @@ -1374,7 +1380,7 @@ def configure_optimizers(self): """ rank_zero_warn("`configure_optimizers` must be implemented to be used with the Lightning Trainer") - def manual_backward(self, loss: Tensor, *args, **kwargs) -> None: + def manual_backward(self, loss: Tensor, *args: Any, **kwargs: Any) -> None: """Call this directly from your :meth:`training_step` when doing optimizations manually. By using this, Lightning can ensure that all the proper scaling gets applied when using mixed precision. @@ -1399,7 +1405,7 @@ def training_step(...): self.trainer.strategy.backward(loss, None, None, *args, **kwargs) def backward( - self, loss: Tensor, optimizer: Optional[Optimizer], optimizer_idx: Optional[int], *args, **kwargs + self, loss: Tensor, optimizer: Optional[Optimizer], optimizer_idx: Optional[int], *args: Any, **kwargs: Any ) -> None: """Called to perform backward on the loss returned in :meth:`training_step`. Override this hook with your own implementation if you need to. @@ -1442,7 +1448,7 @@ def toggle_optimizer(self, optimizer: Union[Optimizer, LightningOptimizer], opti # Then iterate over the current optimizer's parameters and set its `requires_grad` # properties accordingly - for group in optimizer.param_groups: + for group in optimizer.param_groups: # type: ignore[union-attr] for param in group["params"]: param.requires_grad = param_requires_grad_state[param] self._param_requires_grad_state = param_requires_grad_state @@ -1469,7 +1475,7 @@ def clip_gradients( optimizer: Optimizer, gradient_clip_val: Optional[Union[int, float]] = None, gradient_clip_algorithm: Optional[str] = None, - ): + ) -> None: """Handles gradient clipping internally. Note: @@ -1523,7 +1529,7 @@ def configure_gradient_clipping( optimizer_idx: int, gradient_clip_val: Optional[Union[int, float]] = None, gradient_clip_algorithm: Optional[str] = None, - ): + ) -> None: """Perform gradient clipping for the optimizer parameters. Called before :meth:`optimizer_step`. Args: @@ -1584,7 +1590,7 @@ def lr_scheduler_step(self, scheduler, optimizer_idx, metric): """ if metric is None: - scheduler.step() + scheduler.step() # type: ignore[call-arg] else: scheduler.step(metric) @@ -1672,7 +1678,7 @@ def optimizer_step( """ optimizer.step(closure=optimizer_closure) - def optimizer_zero_grad(self, epoch: int, batch_idx: int, optimizer: Optimizer, optimizer_idx: int): + def optimizer_zero_grad(self, epoch: int, batch_idx: int, optimizer: Optimizer, optimizer_idx: int) -> None: """Override this method to change the default behaviour of ``optimizer.zero_grad()``. 
Args: @@ -1741,12 +1747,11 @@ def tbptt_split_batch(self, batch, split_size): for t in range(0, time_dims[0], split_size): batch_split = [] for i, x in enumerate(batch): + split_x: Union[Tensor, List[Tensor]] if isinstance(x, Tensor): split_x = x[:, t : t + split_size] - elif isinstance(x, collections.abc.Sequence): - split_x = [None] * len(x) - for batch_idx in range(len(x)): - split_x[batch_idx] = x[batch_idx][t : t + split_size] + elif isinstance(x, collections.Sequence): + split_x = [x[batch_idx][t : t + split_size] for batch_idx in range(len(x))] batch_split.append(split_x) @@ -1782,7 +1787,7 @@ def unfreeze(self) -> None: self.train() - def _verify_is_manual_optimization(self, fn_name): + def _verify_is_manual_optimization(self, fn_name: str) -> None: if self.automatic_optimization: raise MisconfigurationException( f"to use {fn_name}, please disable automatic optimization:" @@ -1790,7 +1795,7 @@ def _verify_is_manual_optimization(self, fn_name): ) @torch.no_grad() - def to_onnx(self, file_path: Union[str, Path], input_sample: Optional[Any] = None, **kwargs): + def to_onnx(self, file_path: Union[str, Path], input_sample: Optional[Any] = None, **kwargs: Any) -> None: """Saves the model in ONNX format. Args: @@ -1829,7 +1834,7 @@ def to_onnx(self, file_path: Union[str, Path], input_sample: Optional[Any] = Non if not _TORCH_GREATER_EQUAL_1_10 and "example_outputs" not in kwargs: self.eval() - if isinstance(input_sample, Tuple): + if isinstance(input_sample, tuple): kwargs["example_outputs"] = self(*input_sample) else: kwargs["example_outputs"] = self(input_sample) @@ -1843,7 +1848,7 @@ def to_torchscript( file_path: Optional[Union[str, Path]] = None, method: Optional[str] = "script", example_inputs: Optional[Any] = None, - **kwargs, + **kwargs: Any, ) -> Union[ScriptModule, Dict[str, ScriptModule]]: """By default compiles the whole model to a :class:`~torch.jit.ScriptModule`. If you want to use tracing, please provided the argument ``method='trace'`` and make sure that either the `example_inputs` argument is @@ -1953,7 +1958,7 @@ def use_amp(self, use_amp: bool) -> None: self._use_amp = use_amp @contextmanager - def _prevent_trainer_and_dataloaders_deepcopy(self) -> None: + def _prevent_trainer_and_dataloaders_deepcopy(self) -> Generator[None, None, None]: self._should_prevent_trainer_and_dataloaders_deepcopy = True yield self._should_prevent_trainer_and_dataloaders_deepcopy = False @@ -1988,4 +1993,6 @@ def _register_sharded_tensor_state_dict_hooks_if_available(self) -> None: self._register_load_state_dict_pre_hook(pre_load_state_dict_hook, True) else: # We need to make sure the self inside the method is a weakref proxy - self.__class__._register_load_state_dict_pre_hook(weakref.proxy(self), pre_load_state_dict_hook, True) + self.__class__._register_load_state_dict_pre_hook( + weakref.proxy(self), pre_load_state_dict_hook, True # type: ignore[arg-type] + ) diff --git a/src/pytorch_lightning/overrides/data_parallel.py b/src/pytorch_lightning/overrides/data_parallel.py index 98d23cee391bc..b296d1d8697f4 100644 --- a/src/pytorch_lightning/overrides/data_parallel.py +++ b/src/pytorch_lightning/overrides/data_parallel.py @@ -13,7 +13,7 @@ # limitations under the License. 
import numbers import warnings -from typing import Any, cast, Optional, Union +from typing import Any, Optional, Union import torch from torch import Tensor @@ -77,7 +77,7 @@ def forward(self, *inputs: Any, **kwargs: Any) -> Any: output = super().forward(*inputs, **kwargs) def output_transform(data: Any) -> Any: - device = cast(torch.device, self.lightning_module.device) + device = self.lightning_module.device data = python_scalar_to_tensor(data, device) data = unsqueeze_scalar_tensor(data) return data diff --git a/src/pytorch_lightning/utilities/types.py b/src/pytorch_lightning/utilities/types.py index 18e2db6feb6c6..9f2db6422612f 100644 --- a/src/pytorch_lightning/utilities/types.py +++ b/src/pytorch_lightning/utilities/types.py @@ -168,6 +168,7 @@ def no_sync(self) -> Generator: LRSchedulerTypeTuple = (torch.optim.lr_scheduler._LRScheduler, torch.optim.lr_scheduler.ReduceLROnPlateau) LRSchedulerTypeUnion = Union[torch.optim.lr_scheduler._LRScheduler, torch.optim.lr_scheduler.ReduceLROnPlateau] LRSchedulerType = Union[Type[torch.optim.lr_scheduler._LRScheduler], Type[torch.optim.lr_scheduler.ReduceLROnPlateau]] +LRSchedulerPLType = Union[_LRScheduler, ReduceLROnPlateau] @dataclass From 8662ab25d4e32196bbb4489100ca46c3fa885248 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Fri, 19 Aug 2022 16:05:23 +0200 Subject: [PATCH 206/230] Adjust mergify's number of reviewer rules (#14293) --- .github/mergify.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/mergify.yml b/.github/mergify.yml index 096e05e804b4e..eb69860666c06 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -62,8 +62,8 @@ pull_request_rules: - -conflict # skip if conflict - -draft # filter-out GH draft PRs - label="ready" - - "#approved-reviews-by<3" # number of review approvals - - "#review-requested<3" # number of requested reviews + - "#approved-reviews-by<2" # number of review approvals + - "#review-requested<2" # number of requested reviews actions: request_reviews: teams: From 0b1a29b5eba423cec440ccc2761244f0d9efda4c Mon Sep 17 00:00:00 2001 From: Alec Merdler Date: Fri, 19 Aug 2022 12:29:44 -0400 Subject: [PATCH 207/230] Remove references to local app admin view (#14306) The local app '/admin' route is being removed because it does not provide any value, so we need to remove references from it in the CLI. 
--- src/lightning_app/cli/lightning_cli.py | 4 ++-- src/lightning_app/core/constants.py | 1 - tests/tests_app/core/test_lightning_api.py | 1 - 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/lightning_app/cli/lightning_cli.py b/src/lightning_app/cli/lightning_cli.py index 81d2a773b4619..876a663913a73 100644 --- a/src/lightning_app/cli/lightning_cli.py +++ b/src/lightning_app/cli/lightning_cli.py @@ -16,7 +16,7 @@ from lightning_app.cli.lightning_cli_create import create from lightning_app.cli.lightning_cli_delete import delete from lightning_app.cli.lightning_cli_list import get_list -from lightning_app.core.constants import get_lightning_cloud_url, LOCAL_LAUNCH_ADMIN_VIEW +from lightning_app.core.constants import get_lightning_cloud_url from lightning_app.runners.runtime import dispatch from lightning_app.runners.runtime_type import RuntimeType from lightning_app.utilities.app_logs import _app_logs_reader @@ -38,7 +38,7 @@ def get_app_url(runtime_type: RuntimeType, *args) -> str: lightning_app = args[0] return f"{get_lightning_cloud_url()}/me/apps/{lightning_app.id}" else: - return "http://127.0.0.1:7501/admin" if LOCAL_LAUNCH_ADMIN_VIEW else "http://127.0.0.1:7501/view" + return "http://127.0.0.1:7501/view" def main(): diff --git a/src/lightning_app/core/constants.py b/src/lightning_app/core/constants.py index 84a0da49c21cf..74a8dc17f141f 100644 --- a/src/lightning_app/core/constants.py +++ b/src/lightning_app/core/constants.py @@ -23,7 +23,6 @@ USER_ID = os.getenv("USER_ID", "1234") FRONTEND_DIR = os.path.join(os.path.dirname(lightning_app.__file__), "ui") PACKAGE_LIGHTNING = os.getenv("PACKAGE_LIGHTNING", None) -LOCAL_LAUNCH_ADMIN_VIEW = bool(int(os.getenv("LOCAL_LAUNCH_ADMIN_VIEW", "0"))) CLOUD_UPLOAD_WARNING = int(os.getenv("CLOUD_UPLOAD_WARNING", "2")) DISABLE_DEPENDENCY_CACHE = bool(int(os.getenv("DISABLE_DEPENDENCY_CACHE", "0"))) # Project under which the resources need to run in cloud. 
If this env is not set, diff --git a/tests/tests_app/core/test_lightning_api.py b/tests/tests_app/core/test_lightning_api.py index 1b2bf2fb52fd9..104ff198b864c 100644 --- a/tests/tests_app/core/test_lightning_api.py +++ b/tests/tests_app/core/test_lightning_api.py @@ -337,7 +337,6 @@ async def test_health_endpoint_failure(): ("/", 200), ("/asdf", 200), ("/view/component_a", 200), - ("/admin", 200), ), ) @pytest.mark.anyio From 0ca3b5aa1b16667cc2d006c3833f4953b5706e72 Mon Sep 17 00:00:00 2001 From: Krishna Kalyan Date: Sat, 20 Aug 2022 00:39:16 +0100 Subject: [PATCH 208/230] Fix mypy errors attributed to `pytorch_lightning.callbacks.quantization` (#13782) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: otaj <6065855+otaj@users.noreply.github.com> Co-authored-by: Adrian Wälchli --- pyproject.toml | 1 - .../callbacks/quantization.py | 48 +++++++++++-------- .../connectors/checkpoint_connector.py | 2 +- 3 files changed, 29 insertions(+), 22 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 45f65b4c444e0..14e49ffb24b8e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,7 +50,6 @@ warn_no_return = "False" # mypy --no-error-summary 2>&1 | tr ':' ' ' | awk '{print $1}' | sort | uniq | sed 's/\.py//g; s|src/||g; s|\/|\.|g' | xargs -I {} echo '"{}",' module = [ "pytorch_lightning.callbacks.progress.rich_progress", - "pytorch_lightning.callbacks.quantization", "pytorch_lightning.core.datamodule", "pytorch_lightning.demos.boring_classes", "pytorch_lightning.demos.mnist_datamodule", diff --git a/src/pytorch_lightning/callbacks/quantization.py b/src/pytorch_lightning/callbacks/quantization.py index af983ef101b0b..d89bed0394105 100644 --- a/src/pytorch_lightning/callbacks/quantization.py +++ b/src/pytorch_lightning/callbacks/quantization.py @@ -41,25 +41,28 @@ def wrap_qat_forward_context( - quant_cb, model: "pl.LightningModule", func: Callable, trigger_condition: Optional[Union[Callable, int]] = None + quant_cb: "QuantizationAwareTraining", + model: "pl.LightningModule", + func: Callable, + trigger_condition: Optional[Union[Callable, int]] = None, ) -> Callable: """Decorator to wrap forward path as it is needed to quantize inputs and dequantize outputs for in/out compatibility Moreover this version has the (de)quantization conditional as it may not be needed for the training all the time.""" # todo: consider using registering hook before/after forward @functools.wraps(func) - def wrapper(data) -> Any: - _is_func_true = isinstance(trigger_condition, Callable) and trigger_condition(model.trainer) + def wrapper(data: Any) -> Any: + _is_func_true = callable(trigger_condition) and trigger_condition(model.trainer) _is_count_true = isinstance(trigger_condition, int) and quant_cb._forward_calls < trigger_condition _quant_run = trigger_condition is None or _is_func_true or _is_count_true # apply custom trigger if _quant_run: quant_cb._forward_calls += 1 - data = model.quant(data) + data = model.quant(data) # type: ignore[operator] data = func(data) # apply custom trigger if _quant_run: - data = model.dequant(data) + data = model.dequant(data) # type: ignore[operator] return data return wrapper @@ -70,10 +73,10 @@ def wrap_quantize_forward_context(model: "pl.LightningModule", func: Callable) - compatibility.""" # todo: consider using registering hook before/after forward @functools.wraps(func) - def wrapper(data) -> Any: - data = model.quant(data) + def wrapper(data: Any) -> Any: + data = model.quant(data) # type: ignore[operator] data = 
func(data) - data = model.dequant(data) + data = model.dequant(data) # type: ignore[operator] return data return wrapper @@ -181,7 +184,9 @@ def __init__( ) self._observer_type = observer_type - if collect_quantization is not None and not isinstance(collect_quantization, (int, Callable)): + if collect_quantization is not None and not ( + isinstance(collect_quantization, int) or callable(collect_quantization) + ): raise MisconfigurationException( f'Unsupported `collect_quantization` "{collect_quantization}", allowed are `int` or `Callable`.' ) @@ -200,8 +205,8 @@ def __init__( self._observer_disabled_stages = set(self.OBSERVER_STAGES) - observer_enabled_stages self._forward_calls = 0 - self._fake_quant_to_initial_state_dict = {} - self._last_fake_quant_to_observer_enabled = {} + self._fake_quant_to_initial_state_dict: Dict[FakeQuantizeBase, Dict[str, Any]] = {} + self._last_fake_quant_to_observer_enabled: Dict[FakeQuantizeBase, Tensor] = {} self._module_prepared = False def _check_feasible_fuse(self, model: "pl.LightningModule") -> bool: @@ -227,7 +232,7 @@ def _restore_last_observer_enabled(self) -> None: for fake_quant, observer_enabled in self._last_fake_quant_to_observer_enabled.items(): fake_quant.observer_enabled.copy_(observer_enabled) - def _prepare_model(self, model: torch.nn.Module) -> None: + def _prepare_model(self, model: "pl.LightningModule") -> None: if self._module_prepared: return # QuantStub converts tensors from floating point to quantized @@ -237,7 +242,7 @@ def _prepare_model(self, model: torch.nn.Module) -> None: # manually specify where tensors will be converted from quantized # to floating point in the quantized model self.__module_forward = model.forward - model.forward = wrap_qat_forward_context( + model.forward = wrap_qat_forward_context( # type: ignore [assignment] quant_cb=self, model=model, func=model.forward, trigger_condition=self._collect_quantization ) @@ -247,7 +252,7 @@ def _prepare_model(self, model: torch.nn.Module) -> None: if self._observer_type == "histogram": model.qconfig = torch.quantization.get_default_qconfig(self._qconfig) elif self._observer_type == "average": - extra_kwargs = {} + extra_kwargs: Dict[str, Optional[int]] = {} if _TORCH_GREATER_EQUAL_1_12: extra_kwargs["version"] = 0 # version=None corresponds to using FakeQuantize rather than @@ -258,7 +263,7 @@ def _prepare_model(self, model: torch.nn.Module) -> None: model.qconfig = torch.quantization.get_default_qat_qconfig(self._qconfig, **extra_kwargs) elif isinstance(self._qconfig, QConfig): - model.qconfig = self._qconfig + model.qconfig = self._qconfig # type: ignore [assignment] if self._check_feasible_fuse(model): fuse_modules(model, self._modules_to_fuse, inplace=True) @@ -273,12 +278,12 @@ def _prepare_model(self, model: torch.nn.Module) -> None: } self._module_prepared = True - def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule"): + def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: self._prepare_model(pl_module) def on_fit_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: if not self._convert_on_fit_end: - pl_module.forward = self.__module_forward + pl_module.forward = self.__module_forward # type: ignore [assignment] return pl_module.eval() # Convert the observed model to a quantized model. 
This does several things: @@ -288,9 +293,12 @@ def on_fit_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> torch.quantization.convert(pl_module, inplace=True) # check we shall preserve wrapper if self._input_compatible: - pl_module.forward = wrap_quantize_forward_context(model=pl_module, func=self.__module_forward) + pl_module.forward = wrap_quantize_forward_context( # type: ignore [assignment] + model=pl_module, + func=self.__module_forward, + ) else: - pl_module.forward = self.__module_forward + pl_module.forward = self.__module_forward # type: ignore [assignment] def on_train_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: if "train" in self._observer_disabled_stages: @@ -336,7 +344,7 @@ def state_dict(self) -> Dict[str, Any]: keys = {"_qconfig", "_observer_type", "_collect_quantization", "_modules_to_fuse", "_input_compatible"} return {n: getattr(self, n) for n in keys} - def _load_before_model(self, model: torch.nn.Module, state_dict: Dict[str, Any]) -> None: + def _load_before_model(self, model: "pl.LightningModule", state_dict: Dict[str, Any]) -> None: """Special hook that gets called by the CheckpointConnector *before* the model gets loaded. This hook replaces the :meth:`on_load_checkpoint` and :meth:`load_state_dict` callback methods which get called diff --git a/src/pytorch_lightning/trainer/connectors/checkpoint_connector.py b/src/pytorch_lightning/trainer/connectors/checkpoint_connector.py index 22f61c845360d..e1dccd11a0d0a 100644 --- a/src/pytorch_lightning/trainer/connectors/checkpoint_connector.py +++ b/src/pytorch_lightning/trainer/connectors/checkpoint_connector.py @@ -245,7 +245,7 @@ def _restore_quantization_callbacks(self) -> None: if state: # The Quantization callbacks have a special method that must be called before restoring the weights # of the model - callback._load_before_model(self.trainer.model, deepcopy(state)) + callback._load_before_model(self.trainer.lightning_module, deepcopy(state)) def restore_callbacks(self) -> None: """Restores all callbacks from the pre-loaded checkpoint.""" From 6e96fc2936abe2dc6522563573fde5f42aaf46c1 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 20 Aug 2022 10:17:40 -0400 Subject: [PATCH 209/230] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 6f075f5fd42b6..36e34736c1d1e 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,5 @@ -### \*\* NEWS: PyTorch Lightning has been renamed Lightning! In addition to building models, you can now build research workflows and production pipelines\*\* +### \*\* NEWS: PyTorch Lightning has been renamed Lightning! In addition to building models, you can now build lightning apps that glue together everything around the models, without the pain of infrastructure, cost management, scaling and everything else. +\*\*
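A minimal usage sketch of the QuantizationAwareTraining callback whose typing is tightened in the patch above; this snippet is illustrative only and not part of the patch series, and it relies on the callback's defaults (the "fbgemm" qconfig and conversion of the model when fitting ends):

    # Illustrative only; assumes pytorch_lightning with the quantization extras available.
    from pytorch_lightning import Trainer
    from pytorch_lightning.callbacks import QuantizationAwareTraining
    from pytorch_lightning.demos.boring_classes import BoringModel

    model = BoringModel()
    trainer = Trainer(max_epochs=1, callbacks=[QuantizationAwareTraining()])
    trainer.fit(model)  # the observed model is converted to a quantized one at fit end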
From 238646508f83a828ab27c87dd8124ae58cb4ae66 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 20 Aug 2022 10:18:37 -0400 Subject: [PATCH 210/230] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 36e34736c1d1e..6ff71ac085ff0 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@
-**Build high-performance (PyTorch) models, research workflows, ML production pipelines.** +**Build high-performance (PyTorch) models and lightning apps that glue together everything around the models, without the pain of infrastructure, cost management, scaling and everything else.** ______________________________________________________________________ From a73fef4fa2b53f22a9e19204add7db3029a1d78c Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 20 Aug 2022 10:19:04 -0400 Subject: [PATCH 211/230] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6ff71ac085ff0..ffaee4e53a8e8 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@
-**Build high-performance (PyTorch) models and lightning apps that glue together everything around the models, without the pain of infrastructure, cost management, scaling and everything else.** +**Build PyTorch models and lightning apps that glue together everything around the models, without the pain of infrastructure, cost management, scaling and everything else.** ______________________________________________________________________ From aa83255c2cbd28c06bec6586362bf9ce2a3950c5 Mon Sep 17 00:00:00 2001 From: William Falcon Date: Sat, 20 Aug 2022 14:03:25 -0400 Subject: [PATCH 212/230] Update README.md --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index ffaee4e53a8e8..534a598b59446 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,4 @@ -### \*\* NEWS: PyTorch Lightning has been renamed Lightning! In addition to building models, you can now build lightning apps that glue together everything around the models, without the pain of infrastructure, cost management, scaling and everything else. -\*\* +### \*\* NEWS: PyTorch Lightning has been renamed Lightning! In addition to building models, you can now build lightning apps that glue together everything around the models, without the pain of infrastructure, cost management, scaling and everything else.\*\*
From db1835a82c17ee685a10e2d065056740421b1fde Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Sun, 21 Aug 2022 23:55:03 +0530 Subject: [PATCH 213/230] Fix an issue to avoid the impact of sanity check on `reload_dataloaders_every_n_epochs` for validation (#13964) --- src/pytorch_lightning/CHANGELOG.md | 3 + .../loops/dataloader/prediction_loop.py | 3 +- .../loops/epoch/prediction_epoch_loop.py | 3 +- src/pytorch_lightning/loops/fit_loop.py | 3 +- .../trainer/connectors/data_connector.py | 10 +- src/pytorch_lightning/trainer/trainer.py | 21 +-- .../tests_pytorch/trainer/test_dataloaders.py | 125 ++++++++---------- 7 files changed, 83 insertions(+), 85 deletions(-) diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 113400bb870aa..3214aa59764c7 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -108,6 +108,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed not preserving set attributes on `DataLoader` and `BatchSampler` when instantiated inside `*_dataloader` hooks ([#14212](https://github.com/Lightning-AI/lightning/pull/14212)) +- Fixed an issue to avoid the impact of sanity check on `reload_dataloaders_every_n_epochs` for validation ([#13964](https://github.com/Lightning-AI/lightning/pull/13964)) + + ## [1.7.1] - 2022-08-09 ### Fixed diff --git a/src/pytorch_lightning/loops/dataloader/prediction_loop.py b/src/pytorch_lightning/loops/dataloader/prediction_loop.py index f026ae5c7c813..2faf66a3e0c3e 100644 --- a/src/pytorch_lightning/loops/dataloader/prediction_loop.py +++ b/src/pytorch_lightning/loops/dataloader/prediction_loop.py @@ -60,7 +60,8 @@ def max_batches(self) -> List[int]: @property def dataloaders(self) -> Sequence[DataLoader]: """Returns all prediction dataloaders.""" - return self.trainer.predict_dataloaders + dataloaders = self.trainer.predict_dataloaders + return [] if dataloaders is None else dataloaders @property def skip(self) -> bool: diff --git a/src/pytorch_lightning/loops/epoch/prediction_epoch_loop.py b/src/pytorch_lightning/loops/epoch/prediction_epoch_loop.py index 2c481522f9eaa..64f4523d914f3 100644 --- a/src/pytorch_lightning/loops/epoch/prediction_epoch_loop.py +++ b/src/pytorch_lightning/loops/epoch/prediction_epoch_loop.py @@ -163,8 +163,9 @@ def _get_batch_indices(self, dataloader_idx: int) -> List[List[int]]: """Returns a reference to the seen batch indices if the dataloader has a batch sampler wrapped by our :class:`~pytorch_lightning.overrides.distributed.IndexBatchSamplerWrapper`.""" # the batch_sampler is not be defined in case of CombinedDataLoaders + assert self.trainer.predict_dataloaders batch_sampler = getattr( - self.trainer.predict_dataloaders[dataloader_idx], # type: ignore[has-type] + self.trainer.predict_dataloaders[dataloader_idx], "batch_sampler", None, ) diff --git a/src/pytorch_lightning/loops/fit_loop.py b/src/pytorch_lightning/loops/fit_loop.py index be45a6e3b094e..21273e70a5365 100644 --- a/src/pytorch_lightning/loops/fit_loop.py +++ b/src/pytorch_lightning/loops/fit_loop.py @@ -209,7 +209,8 @@ def on_run_start(self) -> None: # type: ignore[override] self.trainer.reset_train_dataloader(self.trainer.lightning_module) # reload the evaluation dataloaders too for proper display in the progress bar - self.epoch_loop.val_loop._reload_evaluation_dataloaders() + if self.epoch_loop._should_check_val_epoch(): + self.epoch_loop.val_loop._reload_evaluation_dataloaders() data_fetcher_cls = _select_data_fetcher(self.trainer) 
self._data_fetcher = data_fetcher_cls(prefetch_batches=self.prefetch_batches) diff --git a/src/pytorch_lightning/trainer/connectors/data_connector.py b/src/pytorch_lightning/trainer/connectors/data_connector.py index e20eac2ffae57..dae4e4a3527df 100644 --- a/src/pytorch_lightning/trainer/connectors/data_connector.py +++ b/src/pytorch_lightning/trainer/connectors/data_connector.py @@ -61,13 +61,19 @@ def __init__(self, trainer: "pl.Trainer", multiple_trainloader_mode: str = "max_ def _should_reload_train_dl(self) -> bool: """Check if train dataloader should be reloaded.""" n_epochs = self.trainer.reload_dataloaders_every_n_epochs - return n_epochs and (self.trainer.current_epoch - self.trainer._last_train_dl_reload_epoch >= n_epochs) + return n_epochs and ( + self.trainer._last_train_dl_reload_epoch is None + or self.trainer.current_epoch - self.trainer._last_train_dl_reload_epoch >= n_epochs + ) @property def _should_reload_val_dl(self) -> bool: """Check if validation dataloader should be reloaded.""" n_epochs = self.trainer.reload_dataloaders_every_n_epochs - return n_epochs and (self.trainer.current_epoch - self.trainer._last_val_dl_reload_epoch >= n_epochs) + return n_epochs and ( + self.trainer._last_val_dl_reload_epoch is None + or self.trainer.current_epoch - self.trainer._last_val_dl_reload_epoch >= n_epochs + ) def on_trainer_init( self, diff --git a/src/pytorch_lightning/trainer/trainer.py b/src/pytorch_lightning/trainer/trainer.py index 5983324f2f62d..8bee0ac6dfb7f 100644 --- a/src/pytorch_lightning/trainer/trainer.py +++ b/src/pytorch_lightning/trainer/trainer.py @@ -628,12 +628,12 @@ def _setup_on_init(self) -> None: self.num_sanity_val_batches = [] self.num_test_batches = [] self.num_val_batches = [] + self.num_predict_batches = [] self.test_dataloaders = None self.val_dataloaders = None - self._last_train_dl_reload_epoch = float("-inf") - self._last_val_dl_reload_epoch = float("-inf") - - self.num_predict_batches = [] + self.predict_dataloaders = None + self._last_train_dl_reload_epoch = None + self._last_val_dl_reload_epoch: Optional[int] = None def _call_and_handle_interrupt(self, trainer_fn: Callable, *args: Any, **kwargs: Any) -> Any: r""" @@ -715,8 +715,6 @@ def _fit_impl( self.state.fn = TrainerFn.FITTING self.state.status = TrainerStatus.RUNNING self.training = True - self._last_train_dl_reload_epoch = float("-inf") - self._last_val_dl_reload_epoch = float("-inf") # if a datamodule comes in as the second arg, then fix it for the user if isinstance(train_dataloaders, LightningDataModule): @@ -1930,13 +1928,18 @@ def reset_val_dataloader(self, model: Optional["pl.LightningModule"] = None) -> has_step = is_overridden("validation_step", pl_module) enable_validation = self.limit_val_batches > 0 if source.is_defined() and has_step and enable_validation: + # store epoch of dataloader reset for reload_dataloaders_every_n_epochs + # it should not reload again if it has already reloaded during sanity_check + if self.state.fn == TrainerFn.FITTING and ( + (self.sanity_checking and self.fit_loop.epoch_loop._should_check_val_epoch()) + or not self.sanity_checking + ): + self._last_val_dl_reload_epoch = self.current_epoch + self.num_val_batches, self.val_dataloaders = self._data_connector._reset_eval_dataloader( RunningStage.VALIDATING, model=pl_module ) - # store epoch of dataloader reset for reload_dataloaders_every_n_epochs - self._last_val_dl_reload_epoch = self.current_epoch - def reset_test_dataloader(self, model: Optional["pl.LightningModule"] = None) -> None: """Resets the 
test dataloader and determines the number of batches. diff --git a/tests/tests_pytorch/trainer/test_dataloaders.py b/tests/tests_pytorch/trainer/test_dataloaders.py index 34504392dc0c1..317a35af3d1ab 100644 --- a/tests/tests_pytorch/trainer/test_dataloaders.py +++ b/tests/tests_pytorch/trainer/test_dataloaders.py @@ -950,74 +950,47 @@ def test_dataloaders_load_only_once_no_sanity_check(tmpdir): assert tracker.mock_calls == expected_sequence -@pytest.mark.parametrize("n", [1, 2]) -def test_dataloaders_load_every_n_epochs(tmpdir, n): - train_reload_epochs, val_reload_epochs = [], [] - - class TestModel(BoringModel): - def train_dataloader(self): - train_reload_epochs.append(self.current_epoch) - return super().train_dataloader() - - def val_dataloader(self): - val_reload_epochs.append(self.current_epoch) - return super().val_dataloader() - - model = TestModel() - - trainer = Trainer( - default_root_dir=tmpdir, - limit_train_batches=0.3, - limit_val_batches=0.3, - reload_dataloaders_every_n_epochs=n, - max_epochs=5, - ) - - tracker = Mock() - model.train_dataloader = Mock(wraps=model.train_dataloader) - model.val_dataloader = Mock(wraps=model.val_dataloader) - model.test_dataloader = Mock(wraps=model.test_dataloader) - - tracker.attach_mock(model.train_dataloader, "train_dataloader") - tracker.attach_mock(model.val_dataloader, "val_dataloader") - tracker.attach_mock(model.test_dataloader, "test_dataloader") - - trainer.fit(model) - trainer.test(model) - - # Verify the sequence - expected_sequence = [call.val_dataloader(), call.train_dataloader()] # Sanity check first - if n == 1: - expected_sequence += [call.train_dataloader(), call.val_dataloader()] * 4 - elif n == 2: - expected_sequence += [call.train_dataloader(), call.val_dataloader()] * 2 - expected_sequence += [call.test_dataloader()] - - assert tracker.mock_calls == expected_sequence - - # Verify epoch of reloads - if n == 1: - assert train_reload_epochs == [0, 1, 2, 3, 4] - assert val_reload_epochs == [0, 1, 2, 3, 4] - elif n == 2: - assert train_reload_epochs == [0, 2, 4] - assert val_reload_epochs == [0, 2, 4] - - @pytest.mark.parametrize( - "n, train_reload_epochs_expect, val_reload_epochs_expect", + ( + "num_sanity_val_steps, check_val_every_n_epoch, reload_dataloaders_every_n_epochs," + " train_reload_epochs_expect,val_reload_epochs_expect,val_step_epochs_expect" + ), [ - # Sanity check at epoch 0 creates a validation dataloader, but validation is - # checked (and in this case reloaded) every n epochs starting from epoch n-1 - (3, [0, 2, 4, 6, 8], [0, 2, 5, 8]), - (5, [0, 2, 4, 6, 8], [0, 4, 9]), + # general case where sanity check reloads the dataloaders for validation on current_epoch=0 + (0, 1, 1, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), + (1, 1, 1, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [1, 2, 3, 4, 5, 6, 7, 8, 9], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), + # case where check_val_every_n_epoch < reload_dataloaders_every_n_epochs so expected val_reload_epoch + # and val_step_epoch will be different + (0, 1, 2, [0, 2, 4, 6, 8], [0, 2, 4, 6, 8], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), + (1, 1, 2, [0, 2, 4, 6, 8], [2, 4, 6, 8], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), + (0, 3, 4, [0, 4, 8], [2, 8], [2, 5, 8]), + (1, 3, 4, [0, 4, 8], [2, 8], [2, 5, 8]), + # case where check_val_every_n_epoch > reload_dataloaders_every_n_epochs so expected val_reload_epoch + # and val_step_epoch will be same + (0, 2, 1, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], [1, 3, 5, 7, 9], [1, 3, 5, 7, 9]), + (1, 2, 1, [0, 1, 2, 3, 4, 5, 6, 7, 
8, 9], [1, 3, 5, 7, 9], [1, 3, 5, 7, 9]), + (0, 3, 2, [0, 2, 4, 6, 8], [2, 5, 8], [2, 5, 8]), + (1, 3, 2, [0, 2, 4, 6, 8], [2, 5, 8], [2, 5, 8]), + (0, 5, 2, [0, 2, 4, 6, 8], [4, 9], [4, 9]), + (1, 5, 2, [0, 2, 4, 6, 8], [4, 9], [4, 9]), + # case where check_val_every_n_epoch = reload_dataloaders_every_n_epochs so expected val_reload_epoch + # and val_step_epoch will be same + (0, 2, 2, [0, 2, 4, 6, 8], [1, 3, 5, 7, 9], [1, 3, 5, 7, 9]), + (1, 2, 2, [0, 2, 4, 6, 8], [1, 3, 5, 7, 9], [1, 3, 5, 7, 9]), ], ) def test_dataloaders_load_every_n_epochs_infrequent_val( - tmpdir, n, train_reload_epochs_expect, val_reload_epochs_expect + tmpdir, + num_sanity_val_steps, + check_val_every_n_epoch, + reload_dataloaders_every_n_epochs, + train_reload_epochs_expect, + val_reload_epochs_expect, + val_step_epochs_expect, ): """Test dataloader reload behavior when infrequently checking validation set (via check_val_every_n_epoch)""" - train_reload_epochs, val_reload_epochs = [], [] + sanity_val_check_epochs, train_reload_epochs, val_reload_epochs = [], [], [] + sanity_val_step_epochs, val_step_epochs = [], [] class TestModel(BoringModel): def train_dataloader(self): @@ -1025,29 +998,39 @@ def train_dataloader(self): return super().train_dataloader() def val_dataloader(self): - val_reload_epochs.append(self.current_epoch) + if self.trainer.sanity_checking: + sanity_val_check_epochs.append(self.current_epoch) + else: + val_reload_epochs.append(self.current_epoch) return super().val_dataloader() + def validation_step(self, *args, **kwargs): + if self.trainer.sanity_checking: + sanity_val_step_epochs.append(self.current_epoch) + else: + val_step_epochs.append(self.current_epoch) + + return super().validation_step(*args, **kwargs) + model = TestModel() trainer = Trainer( default_root_dir=tmpdir, - limit_train_batches=0.3, - limit_val_batches=0.3, - check_val_every_n_epoch=n, - reload_dataloaders_every_n_epochs=2, + limit_train_batches=1, + limit_val_batches=1, + check_val_every_n_epoch=check_val_every_n_epoch, + reload_dataloaders_every_n_epochs=reload_dataloaders_every_n_epochs, max_epochs=10, + num_sanity_val_steps=num_sanity_val_steps, ) - model.test_dataloader = Mock(wraps=model.test_dataloader) - trainer.fit(model) - trainer.test(model) # Verify epoch of reloads + sanity_val_check_epochs_expect = [0] if num_sanity_val_steps else [] + assert sanity_val_check_epochs == sanity_val_step_epochs == sanity_val_check_epochs_expect assert train_reload_epochs == train_reload_epochs_expect assert val_reload_epochs == val_reload_epochs_expect - - model.test_dataloader.assert_called_once() + assert val_step_epochs == val_step_epochs_expect def test_dataloaders_load_every_n_epochs_frequent_val(tmpdir): From 52a4457ae3f4aef81ce81d6e19cca14855c64c05 Mon Sep 17 00:00:00 2001 From: Sherin Thomas Date: Mon, 22 Aug 2022 04:56:52 +0530 Subject: [PATCH 214/230] Pinning starsessions to 1.x (#14333) The recent release of starsessions 2.0 has broken lightning app as some of the arguments are removed. This PR also fixes a bug in our setup tools that prevents our internal # strict parameter being considered. 
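For context, the "# strict" comment marks requirement pins that must not be relaxed when the packaging helpers unfreeze upper version bounds, and the setup_tools change below restores that marker on the rewritten line. A standalone sketch of the rule, using an illustrative helper name that is not part of this patch:

    import re

    def _unfreeze_requirement(req: str, comment: str) -> str:
        # Drop upper-bound pins unless the line is marked "# strict".
        if "<" in req and "strict" not in comment:
            req = re.sub(r",? *<=? *[\d\.\*]+", "", req).strip()
        # Keep the strict marker visible on the rewritten requirement.
        if "strict" in comment:
            req += "  # strict"
        return req

    print(_unfreeze_requirement("fsspec>=2022.01.0, <=2022.7.1", ""))      # fsspec>=2022.01.0
    print(_unfreeze_requirement("starsessions>=1.2.1, <2.0", "# strict"))  # starsessions>=1.2.1, <2.0  # strict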
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Jirka Borovec --- .actions/setup_tools.py | 5 +++++ requirements/app/base.txt | 2 +- requirements/app/cloud.txt | 1 - 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.actions/setup_tools.py b/.actions/setup_tools.py index a76e81246798c..0b84cec001e6d 100644 --- a/.actions/setup_tools.py +++ b/.actions/setup_tools.py @@ -67,6 +67,11 @@ def load_requirements( # remove version restrictions unless they are strict if unfreeze and "<" in req and "strict" not in comment: req = re.sub(r",? *<=? *[\d\.\*]+", "", req).strip() + + # adding strict back to the comment + if "strict" in comment: + req += " # strict" + reqs.append(req) return reqs diff --git a/requirements/app/base.txt b/requirements/app/base.txt index fcde2f18a300a..4a0ee3a90a95e 100644 --- a/requirements/app/base.txt +++ b/requirements/app/base.txt @@ -1,7 +1,7 @@ lightning-cloud==0.5.3 packaging deepdiff>=5.7.0, <=5.8.1 -starsessions +starsessions>=1.2.1, <2.0 # strict fsspec>=2022.01.0, <=2022.7.1 s3fs>=2022.1.0, <=2022.7.1 croniter # for now until we found something more robust. diff --git a/requirements/app/cloud.txt b/requirements/app/cloud.txt index 6644a56a2894b..21611fedccdc7 100644 --- a/requirements/app/cloud.txt +++ b/requirements/app/cloud.txt @@ -1,4 +1,3 @@ -starsessions redis>=4.0.0, <=4.2.4 docker>=5.0.0, <=5.0.3 # setuptools==59.5.0 From 9cf9bc582002a9fc3e0d3b890dc1aff9734cc870 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 22 Aug 2022 10:45:05 +0000 Subject: [PATCH 215/230] Bump tj-actions/changed-files from 24 to 28 (#14337) Bumps [tj-actions/changed-files](https://github.com/tj-actions/changed-files) from 24 to 28. - [Release notes](https://github.com/tj-actions/changed-files/releases) - [Changelog](https://github.com/tj-actions/changed-files/blob/main/HISTORY.md) - [Commits](https://github.com/tj-actions/changed-files/compare/v24...v28) --- updated-dependencies: - dependency-name: tj-actions/changed-files dependency-type: direct:production update-type: version-update:semver-major ... Signed-off-by: dependabot[bot] Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/ci-pr-gatekeeper.yml | 2 +- .github/workflows/ci-pytorch-test-conda.yml | 2 +- .github/workflows/ci-pytorch-test-full.yml | 2 +- .github/workflows/ci-pytorch-test-slow.yml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci-pr-gatekeeper.yml b/.github/workflows/ci-pr-gatekeeper.yml index 1e808c3397128..93f55689165e5 100644 --- a/.github/workflows/ci-pr-gatekeeper.yml +++ b/.github/workflows/ci-pr-gatekeeper.yml @@ -20,7 +20,7 @@ jobs: fetch-depth: "2" # To retrieve the preceding commit. 
- name: Get changed files using defaults id: changed-files - uses: tj-actions/changed-files@v24 + uses: tj-actions/changed-files@v28 - name: Determine changes id: touched run: | diff --git a/.github/workflows/ci-pytorch-test-conda.yml b/.github/workflows/ci-pytorch-test-conda.yml index a0ec35973b579..2a63189caa019 100644 --- a/.github/workflows/ci-pytorch-test-conda.yml +++ b/.github/workflows/ci-pytorch-test-conda.yml @@ -37,7 +37,7 @@ jobs: - name: Get changed files id: changed-files - uses: tj-actions/changed-files@v23.1 + uses: tj-actions/changed-files@v28 - name: Decide if the test should be skipped id: skip diff --git a/.github/workflows/ci-pytorch-test-full.yml b/.github/workflows/ci-pytorch-test-full.yml index 17a2c073274f7..6f246e62e35fb 100644 --- a/.github/workflows/ci-pytorch-test-full.yml +++ b/.github/workflows/ci-pytorch-test-full.yml @@ -39,7 +39,7 @@ jobs: - name: Get changed files id: changed-files - uses: tj-actions/changed-files@v23.1 + uses: tj-actions/changed-files@v28 - name: Decide if the test should be skipped id: skip diff --git a/.github/workflows/ci-pytorch-test-slow.yml b/.github/workflows/ci-pytorch-test-slow.yml index 1e230770c8922..194a0bd910010 100644 --- a/.github/workflows/ci-pytorch-test-slow.yml +++ b/.github/workflows/ci-pytorch-test-slow.yml @@ -30,7 +30,7 @@ jobs: - name: Get changed files id: changed-files - uses: tj-actions/changed-files@v23.1 + uses: tj-actions/changed-files@v28 - name: Decide if the test should be skipped id: skip From 7a617ec90e1566c763be8ac7a200af1e4025412c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Mon, 22 Aug 2022 15:19:53 +0200 Subject: [PATCH 216/230] Add back support for logging in the gradient clipping hooks (#14298) * Add back support for logging in the gradient clipping hooks * Docs and CHANGELOG * Fix tests --- docs/source-pytorch/visualize/logging_advanced.rst | 2 +- src/pytorch_lightning/CHANGELOG.md | 3 +++ .../trainer/connectors/logger_connector/fx_validator.py | 9 +++++++-- .../trainer/logging_/test_logger_connector.py | 9 +-------- .../tests_pytorch/trainer/logging_/test_loop_logging.py | 2 ++ 5 files changed, 14 insertions(+), 11 deletions(-) diff --git a/docs/source-pytorch/visualize/logging_advanced.rst b/docs/source-pytorch/visualize/logging_advanced.rst index d820fb6c1fa71..788deffdb3fea 100644 --- a/docs/source-pytorch/visualize/logging_advanced.rst +++ b/docs/source-pytorch/visualize/logging_advanced.rst @@ -355,7 +355,7 @@ In LightningModule * - Method - on_step - on_epoch - * - on_after_backward, on_before_backward, on_before_optimizer_step, on_before_zero_grad, training_step, training_step_end + * - on_after_backward, on_before_backward, on_before_optimizer_step, optimizer_step, configure_gradient_clipping, on_before_zero_grad, training_step, training_step_end - True - False * - training_epoch_end, test_epoch_end, test_step, test_step_end, validation_epoch_end, validation_step, validation_step_end diff --git a/src/pytorch_lightning/CHANGELOG.md b/src/pytorch_lightning/CHANGELOG.md index 3214aa59764c7..07c34bbc0e579 100644 --- a/src/pytorch_lightning/CHANGELOG.md +++ b/src/pytorch_lightning/CHANGELOG.md @@ -76,6 +76,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
- Fixed an `AttributeError` when accessing `LightningModule.logger` and the Trainer has multiple loggers ([#14234](https://github.com/Lightning-AI/lightning/pull/14234)) +- Added back support for `log`ging in the `configure_gradient_clipping` hook after unintended removal in v1.7.2 ([#14298](https://github.com/Lightning-AI/lightning/issues/14298)) + + - Fixed wrong num padding for `RichProgressBar` ([#14296](https://github.com/Lightning-AI/lightning/pull/14296)) diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py b/src/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py index 56ad53ef4ba04..87ff7428103b5 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/fx_validator.py @@ -44,8 +44,13 @@ class _LogOptions(TypedDict): allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False ), "lr_scheduler_step": None, - "configure_gradient_clipping": None, - "clip_gradients": None, + # should match `optimizer_step` + "configure_gradient_clipping": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False + ), + "clip_gradients": _LogOptions( + allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False + ), "on_before_zero_grad": _LogOptions( allowed_on_step=(False, True), allowed_on_epoch=(False, True), default_on_step=True, default_on_epoch=False ), diff --git a/tests/tests_pytorch/trainer/logging_/test_logger_connector.py b/tests/tests_pytorch/trainer/logging_/test_logger_connector.py index c2be22c61244b..1c346ac1e8bc5 100644 --- a/tests/tests_pytorch/trainer/logging_/test_logger_connector.py +++ b/tests/tests_pytorch/trainer/logging_/test_logger_connector.py @@ -183,12 +183,7 @@ class HookedModel(BoringModel): def __init__(self, not_supported): super().__init__() pl_module_hooks = get_members(LightningModule) - pl_module_hooks.difference_update( - { - "log", - "log_dict", - } - ) + pl_module_hooks.difference_update({"log", "log_dict"}) # remove `nn.Module` hooks module_hooks = get_members(torch.nn.Module) pl_module_hooks.difference_update(module_hooks) @@ -236,8 +231,6 @@ def test_fx_validator_integration(tmpdir): "on_validation_model_eval": "You can't", "on_validation_model_train": "You can't", "lr_scheduler_step": "You can't", - "configure_gradient_clipping": "You can't", - "clip_gradients": "You can't", "on_save_checkpoint": "You can't", "on_load_checkpoint": "You can't", "on_exception": "You can't", diff --git a/tests/tests_pytorch/trainer/logging_/test_loop_logging.py b/tests/tests_pytorch/trainer/logging_/test_loop_logging.py index c37087253e94b..66c7bdcd25cf3 100644 --- a/tests/tests_pytorch/trainer/logging_/test_loop_logging.py +++ b/tests/tests_pytorch/trainer/logging_/test_loop_logging.py @@ -54,6 +54,8 @@ def _make_assertion(model, hooks, result_mock, on_step, on_epoch, extra_kwargs): "on_after_backward", "on_before_optimizer_step", "optimizer_step", + "configure_gradient_clipping", + "clip_gradients", "on_before_zero_grad", "optimizer_zero_grad", "training_step", From 4c47619f38331e4a2c13c0562e5c46500cfd3d70 Mon Sep 17 00:00:00 2001 From: Adam Bobowski <100693297+adam-lightning@users.noreply.github.com> Date: Mon, 22 Aug 2022 22:15:36 +0200 Subject: [PATCH 217/230] [App] Add cloud platform exception (#13928) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Co-authored-by: Adrian Wälchli --- src/lightning_app/core/work.py | 6 +++++- src/lightning_app/utilities/exceptions.py | 8 ++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/src/lightning_app/core/work.py b/src/lightning_app/core/work.py index e7c800c0d15fa..99d8c3611a09d 100644 --- a/src/lightning_app/core/work.py +++ b/src/lightning_app/core/work.py @@ -518,7 +518,11 @@ def _cleanup_calls(calls: Dict[str, Any]): @abc.abstractmethod def run(self, *args, **kwargs): - """Override to add your own logic.""" + """Override to add your own logic. + + Raises: + LightningPlatformException: If resource exceeds platform quotas or other constraints. + """ pass def on_exception(self, exception: BaseException): diff --git a/src/lightning_app/utilities/exceptions.py b/src/lightning_app/utilities/exceptions.py index c83a621e225f7..93bf5b7f319e8 100644 --- a/src/lightning_app/utilities/exceptions.py +++ b/src/lightning_app/utilities/exceptions.py @@ -41,6 +41,14 @@ class LightningWorkException(Exception): """Exception used to inform users of misuse with LightningWork.""" +class LightningPlatformException(Exception): # pragma: no cover + """Exception used to inform users of issues related to platform the LightningApp is running on. + + It gets raised by the Lightning Launcher on the platform side when the app is running in the cloud, and is useful + when framework or user code needs to catch exceptions specific to the platform, e.g., when resources exceed quotas. + """ + + class LightningAppStateException(Exception): """Exception to inform users of app state errors.""" From 4586d118298cea2fe080197f79cb21dd8d324cbc Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 23 Aug 2022 17:50:30 +0200 Subject: [PATCH 218/230] adding CI for e2e on Azure (#14282) * start CI * wip * matrix * name * wip * prune * rm * ls * cache * dir * ls * name * cleaning * clone * git * git * if * private * . * local_id * clean * var * group check * ci --- .azure/app-cloud-e2e.yml | 140 +++++++++++++++ .github/checkgroup.yml | 14 +- .github/workflows/ci-app-cloud-e2e-test.yml | 187 -------------------- 3 files changed, 142 insertions(+), 199 deletions(-) create mode 100644 .azure/app-cloud-e2e.yml delete mode 100644 .github/workflows/ci-app-cloud-e2e-test.yml diff --git a/.azure/app-cloud-e2e.yml b/.azure/app-cloud-e2e.yml new file mode 100644 index 0000000000000..d5aeee88c020a --- /dev/null +++ b/.azure/app-cloud-e2e.yml @@ -0,0 +1,140 @@ +# Python package +# Create and test a Python package on multiple Python versions. 
+# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more: +# https://docs.microsoft.com/azure/devops/pipelines/languages/python + +trigger: + tags: + include: + - '*' + branches: + include: + - "master" + - "release/*" + - "refs/tags/*" + +pr: + - "master" + - "release/*" + +variables: + # variables are automatically exported as environment variables so this will override pip's default cache dir + - name: pip_cache_dir + value: $(Pipeline.Workspace)/.pip + +jobs: + - job: App_cloud_e2e_testing + pool: + vmImage: 'ubuntu-latest' + timeoutInMinutes: "30" + cancelTimeoutInMinutes: "2" + strategy: + matrix: + 'App: v0_app': + name: "v0_app" + 'App: boring_app': + name: "boring_app" + 'App: template_streamlit_ui': + name: "template_streamlit_ui" +# 'App: template_react_ui': # TODO: clarify visibility private/public +# name: "template_react_ui" + 'App: template_jupyterlab': # TODO: clarify where these files lives + name: "template_jupyterlab" + 'App: idle_timeout': + name: "idle_timeout" + 'App: collect_failures': + name: "collect_failures" + 'App: custom_work_dependencies': + name: "custom_work_dependencies" + 'App: drive': + name: "drive" + 'App: payload': + name: "payload" + 'App: commands_and_api': + name: "commands_and_api" + workspace: + clean: all + steps: + - bash: | + python --version + pip --version + displayName: 'Info' + + # TODO: parse the PR number + - bash: | + ID=$(date +%s) + echo "##vso[task.setvariable variable=local_id]$ID" + + - task: Cache@2 + inputs: + key: 'pip | "$(name)" | requirements/app/base.txt' + restoreKeys: | + pip | "$(Agent.OS)" + path: $(pip_cache_dir) + displayName: Cache pip + + - bash: python -m pip install -r requirements/app/devel.txt --quiet --find-links ${TORCH_URL} + env: + TORCH_URL: https://download.pytorch.org/whl/cpu/torch_stable.html + displayName: 'Install dependencies' + + - bash: | + python -m pip install playwright + python -m playwright install --with-deps + displayName: 'Install Playwright system dependencies' + + - bash: pip install -e . 
+ displayName: 'Install lightning' + + - bash: | + git clone https://github.com/Lightning-AI/LAI-lightning-template-jupyterlab-App examples/app_template_jupyterlab + cp examples/app_template_jupyterlab/tests/test_template_jupyterlab.py tests/tests_app_examples/test_template_jupyterlab.py + condition: eq(variables['name'], 'template_jupyterlab') + displayName: 'Clone Template Jupyter Lab Repo' + + - bash: git clone https://github.com/Lightning-AI/lightning-template-react examples/app_template_react_ui + condition: eq(variables['name'], 'template_react_ui') + displayName: 'Clone Template React UI Repo' + + - bash: | + mkdir -p ${VIDEO_LOCATION} + ls -l examples/${TEST_APP_NAME} + ls -l tests/tests_app_examples + python -m pytest tests/tests_app_examples/test_${TEST_APP_NAME}.py::test_${TEST_APP_NAME}_example_cloud --timeout=900 --capture=no -v --color=yes + env: + HEADLESS: '1' + PACKAGE_LIGHTNING: '1' + CLOUD: '1' + VIDEO_LOCATION: '$(Build.ArtifactStagingDirectory)/videos' + PR_NUMBER: $(local_id) + TEST_APP_NAME: $(name) + HAR_LOCATION: './artifacts/hars' + SLOW_MO: '50' + LAI_USER: $(LAI_USER) + LAI_PASS: $(LAI_PASS) + LIGHTNING_USER_ID: $(LIGHTNING_USER_ID) + LIGHTNING_API_KEY: $(LIGHTNING_API_KEY) + LIGHTNING_USERNAME: $(LIGHTNING_USERNAME) + LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL) + displayName: 'Run the tests' + + - publish: '$(Build.ArtifactStagingDirectory)/videos' + displayName: 'Publish videos' + artifact: $(name) + + - bash: | + time python -c "from lightning.app import testing; testing.delete_cloud_lightning_apps()" + env: + LAI_USER: $(LAI_USER) + LAI_PASS: $(LAI_PASS) + LIGHTNING_USER_ID: $(LIGHTNING_USER_ID) + LIGHTNING_API_KEY: $(LIGHTNING_API_KEY) + LIGHTNING_USERNAME: $(LIGHTNING_USERNAME) + LIGHTNING_CLOUD_URL: $(LIGHTNING_CLOUD_URL) + PR_NUMBER: $(local_id) + TEST_APP_NAME: $(name) + GRID_USER_ID: $(LIGHTNING_USER_ID) # TODO: clarify the meaning + GRID_USER_KEY: $(LIGHTNING_API_KEY) # TODO: clarify the meaning + GRID_URL: $(LIGHTNING_CLOUD_URL) + _GRID_USERNAME: $(LIGHTNING_USERNAME) + displayName: 'Clean Previous Apps' diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index c2654eddd7ca1..6389fdf71ee70 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -108,7 +108,7 @@ subprojects: - id: "lightning_app" paths: - - ".github/workflows/ci-app*.yml" + - ".azure/app-cloud-e2e.yml" - "requirements/app/**" - "src/lightning_app/**" - "tests/tests_app/**" @@ -117,17 +117,7 @@ subprojects: # the examples are used in the app CI - "examples/app_*" checks: - - "Cloud Test (boring_app)" - - "Cloud Test (collect_failures)" - - "Cloud Test (commands_and_api)" - - "Cloud Test (custom_work_dependencies)" - - "Cloud Test (drive)" - - "Cloud Test (idle_timeout)" - - "Cloud Test (payload)" - - "Cloud Test (template_jupyterlab)" - - "Cloud Test (template_react_ui)" - - "Cloud Test (template_streamlit_ui)" - - "Cloud Test (v0_app)" + - "App.cloud-e2e" - "doctest (app)" - "make-docs (app)" - "pytest (macOS-11, 3.8, latest)" diff --git a/.github/workflows/ci-app-cloud-e2e-test.yml b/.github/workflows/ci-app-cloud-e2e-test.yml deleted file mode 100644 index 07b253f5d6f8c..0000000000000 --- a/.github/workflows/ci-app-cloud-e2e-test.yml +++ /dev/null @@ -1,187 +0,0 @@ -name: cloud-testing - -# Used to run the e2e tests on lightning.ai - -on: # Trigger the workflow on push or pull request, but only for the master branch - push: - branches: [master, "release/*"] - pull_request: - branches: [master, "release/*"] - -concurrency: - group: ${{ github.workflow }}-${{ 
github.ref }}-${{ github.head_ref }} - cancel-in-progress: ${{ github.ref != 'refs/heads/master' }} - -jobs: - # This is job should once only once per PR to detect file changes so run required jobs. - # see .github/file-filters.yml to define file filters and run the jobs based on the output of each filter. - # More info: https://github.com/marketplace/actions/paths-changes-filter - - changes: - runs-on: ubuntu-latest - # Set job outputs to the values from filter step - outputs: - app_examples: ${{ steps.filter.outputs.app_examples }} - steps: - - uses: actions/checkout@v2 - - name: Set up Python 3.8 - uses: actions/setup-python@v4 - with: - python-version: "3.8" - - - uses: dorny/paths-filter@v2 - id: filter - with: - filters: .github/file-filters.yml - - cloud-test: - name: Cloud Test - needs: changes - if: ${{ needs.changes.outputs.app_examples == 'true' }} - runs-on: ubuntu-20.04 - strategy: - fail-fast: false - matrix: - app_name: - - v0_app - - boring_app -# - quick_start # TODO: fix this - - template_streamlit_ui - - template_react_ui - - template_jupyterlab - - idle_timeout - - collect_failures - - custom_work_dependencies - - drive - - payload - - commands_and_api - timeout-minutes: 35 - steps: - - uses: actions/checkout@v2 - - name: Set up Python 3.8 - uses: actions/setup-python@v4 - with: - python-version: "3.8" - - - name: Get PR ID - id: PR - run: | - if [ -z ${{github.event.number}} ]; then - echo "::set-output name=ID::$(date +%s)" - else - echo "::set-output name=ID::${{github.event.number}}" - fi - -# TODO: Enable cache -# - name: Cache virtualenv -# id: cache-venv -# uses: actions/cache@v3 -# with: -# path: ./.venv/ -# key: ${{ runner.os }}-pip-${{ matrix.app_name }}-${{ hashFiles('requirements/app/base.txt', 'requirements/app/*.txt', 'src/lightning_app/__version__.py') }} -# restore-keys: ${{ runner.os }}-venv-${{ matrix.app_name }}- - - - name: Install dependencies - shell: bash - run: | - pip --version - python -m pip install -r requirements/app/devel.txt --no-cache --quiet --find-links https://download.pytorch.org/whl/cpu/torch_stable.html -# if: steps.cache-venv.outputs.cache-hit != 'true' # TODO: Enable cache - - - name: Cache Playwright dependencies - id: playwright-cache - uses: actions/cache@v3 - with: - path: ~/.cache/ms-playwright - key: ${{ runner.os }}-playwright-${{ matrix.app_name }}-${{ hashFiles('requirements/app/base.txt', 'requirements/app/*.txt', 'src/lightning_app/__version__.py') }} - restore-keys: ${{ runner.os }}-playwright-${{ matrix.app_name }}- - - - name: Install Playwright system dependencies - shell: bash - run: | - python -m pip install playwright - python -m playwright install --with-deps -# if: steps.playwright-cache.outputs.cache-hit != 'true' # TODO: Enable cache - - - name: Install lightning - run: | - pip install -e . 
- shell: bash - - - name: Lightning Install quick-start - if: ${{ (matrix.app_name == 'quick_start') }} - shell: bash - run: | - python -m lightning install app lightning/quick-start -y - - - name: Clone Template React UI Repo - uses: actions/checkout@v3 - with: - repository: Lightning-AI/lightning-template-react - token: ${{ secrets.PAT_GHOST }} - ref: 'master' - path: examples/app_template_react_ui - - - name: Clone Template Jupyter Lab Repo - uses: actions/checkout@v3 - with: - repository: Lightning-AI/lightning-template-jupyterlab - token: ${{ secrets.PAT_GHOST }} - ref: 'master' - path: examples/app_template_jupyterlab - - - name: Copy Template Jupyter Lab Repo tests - shell: bash - run: cp examples/app_template_jupyterlab/tests/test_template_jupyterlab.py tests/tests_app_examples/test_template_jupyterlab.py - - - name: List pip dependency - shell: bash - run: | - pip list - - - name: Run the tests - env: - LAI_USER: ${{ secrets.LAI_USER }} - LAI_PASS: ${{ secrets.LAI_PASS }} - LIGHTNING_USER_ID: ${{ secrets.LIGHTNING_USER_ID }} - LIGHTNING_API_KEY: ${{ secrets.LIGHTNING_API_KEY }} - LIGHTNING_USERNAME: ${{ secrets.LIGHTNING_USERNAME }} - LIGHTNING_CLOUD_URL: ${{ secrets.LIGHTNING_CLOUD_URL }} - CLOUD: "1" - VIDEO_LOCATION: ./artifacts/videos - PR_NUMBER: ${{ steps.PR.outputs.ID }} - TEST_APP_NAME: ${{ matrix.app_name }} - HAR_LOCATION: ./artifacts/hars - SLOW_MO: 50 - shell: bash - run: | - mkdir -p ${VIDEO_LOCATION} - HEADLESS=1 PACKAGE_LIGHTNING=1 python -m pytest tests/tests_app_examples/test_${{ matrix.app_name }}.py::test_${{ matrix.app_name }}_example_cloud --timeout=900 --capture=no -v --color=yes - # Delete the artifacts if successful - rm -r ${VIDEO_LOCATION}/${{ matrix.app_name }} - - - uses: actions/upload-artifact@v3 - - if: ${{ always() }} - with: - name: test-artifacts - path: ./artifacts/videos - - - name: Clean Previous Apps - if: ${{ always() }} - env: - LAI_USER: ${{ secrets.LAI_USER }} - LAI_PASS: ${{ secrets.LAI_PASS }} - LIGHTNING_USER_ID: ${{ secrets.LIGHTNING_USER_ID }} - LIGHTNING_API_KEY: ${{ secrets.LIGHTNING_API_KEY }} - LIGHTNING_USERNAME: ${{ secrets.LIGHTNING_USERNAME }} - LIGHTNING_CLOUD_URL: ${{ secrets.LIGHTNING_CLOUD_URL }} - PR_NUMBER: ${{ steps.PR.outputs.ID }} - TEST_APP_NAME: ${{ matrix.app_name }} - GRID_USER_ID: ${{ secrets.LIGHTNING_USER_ID }} - GRID_USER_KEY: ${{ secrets.LIGHTNING_API_KEY }} - GRID_URL: ${{ secrets.LIGHTNING_CLOUD_URL }} - _GRID_USERNAME: ${{ secrets.LIGHTNING_USERNAME }} - shell: bash - run: | - time python -c "from lightning.app import testing; testing.delete_cloud_lightning_apps()" From 99ba95a38e5468c6fd31aa4274b301c161be4dc0 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 23 Aug 2022 17:52:58 +0200 Subject: [PATCH 219/230] fix imports of collections.abc for py3.10 (#14345) fix collections.abc for py3.10 Co-authored-by: Sherin Thomas --- src/lightning_app/utilities/apply_func.py | 3 +-- .../trainer/connectors/logger_connector/result.py | 3 +-- src/pytorch_lightning/trainer/supporters.py | 3 +-- src/pytorch_lightning/utilities/apply_func.py | 3 +-- src/pytorch_lightning/utilities/fetching.py | 3 +-- tests/tests_pytorch/overrides/test_distributed.py | 2 +- tests/tests_pytorch/utilities/test_auto_restart.py | 3 +-- 7 files changed, 7 insertions(+), 13 deletions(-) diff --git a/src/lightning_app/utilities/apply_func.py b/src/lightning_app/utilities/apply_func.py index 8fdcf25d7ba20..e8ec148e6c517 100644 --- a/src/lightning_app/utilities/apply_func.py +++ b/src/lightning_app/utilities/apply_func.py @@ -15,9 +15,8 @@ 
import dataclasses from collections import defaultdict, OrderedDict -from collections.abc import Mapping, Sequence from copy import deepcopy -from typing import Any, Callable, Optional, Tuple, Union +from typing import Any, Callable, Mapping, Optional, Sequence, Tuple, Union from lightning_app.utilities.exceptions import MisconfigurationException diff --git a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py index a28599b5f20be..7fb56c21c2edf 100644 --- a/src/pytorch_lightning/trainer/connectors/logger_connector/result.py +++ b/src/pytorch_lightning/trainer/connectors/logger_connector/result.py @@ -11,10 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from collections.abc import Generator from dataclasses import asdict, dataclass, replace from functools import partial, wraps -from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, cast, Dict, Generator, List, Optional, Tuple, Union import torch from torch import Tensor diff --git a/src/pytorch_lightning/trainer/supporters.py b/src/pytorch_lightning/trainer/supporters.py index 101af14fe2d64..63213dcbed4b7 100644 --- a/src/pytorch_lightning/trainer/supporters.py +++ b/src/pytorch_lightning/trainer/supporters.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from collections.abc import Iterable, Iterator, Mapping, Sequence from dataclasses import asdict, dataclass, field -from typing import Any, Callable, Dict, List, Optional, Union +from typing import Any, Callable, Dict, Iterable, Iterator, List, Mapping, Optional, Sequence, Union import torch from torch.utils.data import Dataset diff --git a/src/pytorch_lightning/utilities/apply_func.py b/src/pytorch_lightning/utilities/apply_func.py index 8729520ee9d96..9f85ea52661c7 100644 --- a/src/pytorch_lightning/utilities/apply_func.py +++ b/src/pytorch_lightning/utilities/apply_func.py @@ -17,10 +17,9 @@ import operator from abc import ABC from collections import defaultdict, OrderedDict -from collections.abc import Mapping, Sequence from copy import copy, deepcopy from functools import partial -from typing import Any, Callable, List, Optional, Tuple, Union +from typing import Any, Callable, List, Mapping, Optional, Sequence, Tuple, Union import numpy as np import torch diff --git a/src/pytorch_lightning/utilities/fetching.py b/src/pytorch_lightning/utilities/fetching.py index 4869b72134c25..fa7e395fbb82b 100644 --- a/src/pytorch_lightning/utilities/fetching.py +++ b/src/pytorch_lightning/utilities/fetching.py @@ -13,9 +13,8 @@ # limitations under the License. from abc import ABC, abstractmethod -from collections.abc import Iterable, Iterator from copy import deepcopy -from typing import Any, Callable, List, Optional, Sized, Tuple +from typing import Any, Callable, Iterable, Iterator, List, Optional, Sized, Tuple import torch from torch.utils.data.dataloader import DataLoader diff --git a/tests/tests_pytorch/overrides/test_distributed.py b/tests/tests_pytorch/overrides/test_distributed.py index e425859fe34df..77c3eb40bfb10 100644 --- a/tests/tests_pytorch/overrides/test_distributed.py +++ b/tests/tests_pytorch/overrides/test_distributed.py @@ -11,7 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. -from collections.abc import Iterable +from typing import Iterable import pytest from torch.utils.data import BatchSampler, SequentialSampler diff --git a/tests/tests_pytorch/utilities/test_auto_restart.py b/tests/tests_pytorch/utilities/test_auto_restart.py index 8a888ce09c90a..a3bf115313bd0 100644 --- a/tests/tests_pytorch/utilities/test_auto_restart.py +++ b/tests/tests_pytorch/utilities/test_auto_restart.py @@ -17,11 +17,10 @@ import random import random as python_random from collections import defaultdict -from collections.abc import Iterable from contextlib import suppress from copy import deepcopy from dataclasses import asdict -from typing import Iterator, List, Optional +from typing import Iterable, Iterator, List, Optional from unittest import mock from unittest.mock import ANY From e027ecf9d8ed17bca9965766248fcd952e765a91 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 23 Aug 2022 17:53:13 +0200 Subject: [PATCH 220/230] chlog after App `0.5.6` & `0.5.7` (#14352) * chlog after App 0.5.6 & 0.5.7 * . --- src/lightning_app/CHANGELOG.md | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index de2416b4208a9..6b32fde4cc60b 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -4,7 +4,7 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). -## [0.6.0] - 2022-MM-DD +## [0.6.0] - 2022-08-DD ### Added @@ -51,10 +51,27 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Fixed -- Resolved a bug where the install command was not installing the latest version of an app/component by default ([#14181](https://github.com/Lightning-AI/lightning/pull/14181)) +- Unification of app template: moved `app.py` to root dir for `lightning init app ` template ([#13853](https://github.com/Lightning-AI/lightning/pull/13853)) -- Unification of app template: moved `app.py` to root dir for `lightning init app ` template ([#13853](https://github.com/Lightning-AI/lightning/pull/13853)) +## [0.5.7] - 2022-08-22 + +### Changed + +- Release LAI docs as stable ([#14250](https://github.com/Lightning-AI/lightning/pull/14250)) +- Compatibility for Python 3.10 + +### Fixed + +- Pinning starsessions to 1.x ([#14333](https://github.com/Lightning-AI/lightning/pull/14333)) +- Parsed local package versions ([#13933](https://github.com/Lightning-AI/lightning/pull/13933)) + + +## [0.5.6] - 2022-08-16 + +### Fixed + +- Resolved a bug where the `install` command was not installing the latest version of an app/component by default ([#14181](https://github.com/Lightning-AI/lightning/pull/14181)) ## [0.5.5] - 2022-08-9 From bb634310e765e5e988220c7c8f421fb09b9dd0f0 Mon Sep 17 00:00:00 2001 From: otaj <6065855+otaj@users.noreply.github.com> Date: Tue, 23 Aug 2022 16:10:52 +0000 Subject: [PATCH 221/230] [CI] Bump CUDA in Docker images to 11.6.1 (#14348) * bump cuda in docker images to 11.6.1 * PUSH TO HUB. REVERT THIS! * conda forge for 11.6 * cuda 11.5 * revert conda changes * 11.6 back again * 11.6 back again, all of them * maybe all passes now * maybe all passes now * final push * Revert "PUSH TO HUB. REVERT THIS!" This reverts commit 602bfce224cf22e24421448887844937e0aff9f0. 
* Apply suggestions from code review Co-authored-by: Jirka Borovec --- .azure/gpu-benchmark.yml | 2 +- .azure/gpu-tests.yml | 2 +- .github/checkgroup.yml | 12 ++++++------ .github/workflows/ci-pytorch-dockers.yml | 13 +++++++------ .github/workflows/release-docker.yml | 2 +- dockers/README.md | 4 ++-- dockers/base-conda/Dockerfile | 12 +++++++----- dockers/base-cuda/Dockerfile | 5 +++-- 8 files changed, 28 insertions(+), 24 deletions(-) diff --git a/.azure/gpu-benchmark.yml b/.azure/gpu-benchmark.yml index 0de590f2c54a6..968186fbd275d 100644 --- a/.azure/gpu-benchmark.yml +++ b/.azure/gpu-benchmark.yml @@ -28,7 +28,7 @@ jobs: cancelTimeoutInMinutes: "2" pool: azure-jirka-spot container: - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.3.1" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1" options: "--runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all --shm-size=32g" workspace: clean: all diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index 683212cd55d4b..d3fb42d33d278 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -26,7 +26,7 @@ jobs: strategy: matrix: 'PyTorch - stable': - image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.3.1" + image: "pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1" # how long to run the job before automatically cancelling timeoutInMinutes: "80" # how much time to give 'run always even if cancelled tasks' before stopping them diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 6389fdf71ee70..4b67bc076b32d 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -81,21 +81,21 @@ subprojects: - ".github/workflows/*docker*.yml" - "setup.py" checks: - - "build-conda (3.8, 1.10)" - - "build-conda (3.8, 1.9)" - - "build-conda (3.9, 1.11)" - - "build-conda (3.9, 1.12)" + - "build-conda (3.8, 1.9, 11.1.1)" + - "build-conda (3.8, 1.10.1, 11.1.1)" + - "build-conda (3.9, 1.11, 11.3.1)" + - "build-conda (3.9, 1.12, 11.3.1)" - "build-cuda (3.8, 1.9, 11.1.1)" - "build-cuda (3.9, 1.10, 11.3.1)" - "build-cuda (3.9, 1.11, 11.3.1)" - - "build-cuda (3.9, 1.12, 11.3.1)" + - "build-cuda (3.9, 1.12, 11.6.1)" - "build-cuda (3.9, 1.9, 11.1.1)" - "build-hpu (1.5.0, 1.11.0)" - "build-ipu (3.9, 1.9)" - "build-NGC" - "build-pl (3.9, 1.10, 11.3.1)" - "build-pl (3.9, 1.11, 11.3.1)" - - "build-pl (3.9, 1.12, 11.3.1)" + - "build-pl (3.9, 1.12, 11.6.1)" - "build-pl (3.9, 1.9, 11.1.1)" - "build-xla (3.7, 1.12)" diff --git a/.github/workflows/ci-pytorch-dockers.yml b/.github/workflows/ci-pytorch-dockers.yml index a05dbbb5bc8ef..6cb28885e79ef 100644 --- a/.github/workflows/ci-pytorch-dockers.yml +++ b/.github/workflows/ci-pytorch-dockers.yml @@ -36,7 +36,7 @@ jobs: - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} - - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"} steps: - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 @@ -96,7 +96,7 @@ jobs: - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} - - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"} + - {python_version: 
"3.9", pytorch_version: "1.12", cuda_version: "11.6.1"} # Used in Lightning-AI/tutorials - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"} steps: @@ -133,10 +133,10 @@ jobs: fail-fast: false matrix: include: - - {python_version: "3.8", pytorch_version: "1.9"} - - {python_version: "3.8", pytorch_version: "1.10"} - - {python_version: "3.9", pytorch_version: "1.11"} - - {python_version: "3.9", pytorch_version: "1.12"} + - {python_version: "3.8", pytorch_version: "1.9", cuda_version: "11.1.1"} + - {python_version: "3.8", pytorch_version: "1.10.1", cuda_version: "11.1.1"} + - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"} steps: - uses: actions/checkout@v3 - uses: docker/setup-buildx-action@v2 @@ -150,6 +150,7 @@ jobs: build-args: | PYTHON_VERSION=${{ matrix.python_version }} PYTORCH_VERSION=${{ matrix.pytorch_version }} + CUDA_VERSION=${{ matrix.cuda_version }} file: dockers/base-conda/Dockerfile push: ${{ env.PUSH_TO_HUB }} tags: pytorchlightning/pytorch_lightning:base-conda-py${{ matrix.python_version }}-torch${{ matrix.pytorch_version }} diff --git a/.github/workflows/release-docker.yml b/.github/workflows/release-docker.yml index 6901a24204683..2de330ea5ca75 100644 --- a/.github/workflows/release-docker.yml +++ b/.github/workflows/release-docker.yml @@ -19,7 +19,7 @@ jobs: - {python_version: "3.9", pytorch_version: "1.9", cuda_version: "11.1.1"} - {python_version: "3.9", pytorch_version: "1.10", cuda_version: "11.3.1"} - {python_version: "3.9", pytorch_version: "1.11", cuda_version: "11.3.1"} - - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.3.1"} + - {python_version: "3.9", pytorch_version: "1.12", cuda_version: "11.6.1"} steps: - name: Checkout uses: actions/checkout@v2 diff --git a/dockers/README.md b/dockers/README.md index b1ff9826b6c1f..4b203437ff8ab 100644 --- a/dockers/README.md +++ b/dockers/README.md @@ -11,7 +11,7 @@ git clone https://github.com/Lightning-AI/lightning.git docker image build -t pytorch-lightning:latest -f dockers/base-cuda/Dockerfile . # build with specific arguments -docker image build -t pytorch-lightning:base-cuda-py3.9-torch1.11-cuda11.3.1 -f dockers/base-cuda/Dockerfile --build-arg PYTHON_VERSION=3.9 --build-arg PYTORCH_VERSION=1.11 --build-arg CUDA_VERSION=11.3.1 . +docker image build -t pytorch-lightning:base-cuda-py3.9-torch1.12-cuda11.6.1 -f dockers/base-cuda/Dockerfile --build-arg PYTHON_VERSION=3.9 --build-arg PYTORCH_VERSION=1.12 --build-arg CUDA_VERSION=11.6.1 . ``` To run your docker use @@ -45,7 +45,7 @@ sudo systemctl restart docker and later run the docker image with `--gpus all`. For example, ``` -docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.11-cuda11.3.1 +docker run --rm -it --gpus all pytorchlightning/pytorch_lightning:base-cuda-py3.9-torch1.12-cuda11.6.1 ``` ## Run Jupyter server diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index 0a7c8884974c0..03d2fb547ba6d 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -42,16 +42,17 @@ RUN \ curl \ unzip \ ca-certificates \ - libopenmpi-dev \ - && \ + libopenmpi-dev +RUN \ # Install conda and python. # NOTE new Conda does not forward the exit status... 
https://github.com/conda/conda/issues/8385 curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-py38_${CONDA_VERSION}-Linux-x86_64.sh && \ chmod +x ~/miniconda.sh && \ ~/miniconda.sh -b && \ - rm ~/miniconda.sh && \ + rm ~/miniconda.sh +RUN \ # Cleaning apt-get autoremove -y && \ apt-get clean && \ @@ -73,9 +74,10 @@ COPY environment.yml environment.yml # conda init RUN \ conda update -n base -c defaults conda && \ + CUDA_VERSION_MM=$(python -c "print('.'.join('$CUDA_VERSION'.split('.')[:2]))") && \ conda create -y --name $CONDA_ENV \ - python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION} \ - -c nvidia -c pytorch -c pytorch-test -c pytorch-nightly && \ + python=${PYTHON_VERSION} pytorch=${PYTORCH_VERSION} torchvision torchtext cudatoolkit=${CUDA_VERSION_MM} \ + -c nvidia -c pytorch -c pytorch-test && \ conda init bash && \ # NOTE: this requires that the channel is presented in the yaml before packages \ printf "import re;\nfname = 'environment.yml';\nreq = open(fname).read();\nfor n in ['python', 'pytorch', 'torchtext', 'torchvision']:\n req = re.sub(rf'- {n}[>=]+', f'# - {n}=', req);\nopen(fname, 'w').write(req)" > prune.py && \ diff --git a/dockers/base-cuda/Dockerfile b/dockers/base-cuda/Dockerfile index 01372574e4618..be613f3b6415f 100644 --- a/dockers/base-cuda/Dockerfile +++ b/dockers/base-cuda/Dockerfile @@ -140,8 +140,9 @@ RUN \ RUN \ # install Bagua CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \ - pip install "bagua-cuda$CUDA_VERSION_MM==0.9.0" && \ - python -c "import bagua_core; bagua_core.install_deps()" && \ + CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [115,113,111,102] if $CUDA_VERSION_MM >= ver][0])") && \ + pip install "bagua-cuda$CUDA_VERSION_BAGUA==0.9.0" && \ + if [[ "$CUDA_VERSION_MM" = "$CUDA_VERSION_BAGUA" ]]; then python -c "import bagua_core; bagua_core.install_deps()"; fi && \ python -c "import bagua; print(bagua.__version__)" COPY requirements/pytorch/check-avail-extras.py check-avail-extras.py From 2265bd85c915d563acd400c001e4bd5087758957 Mon Sep 17 00:00:00 2001 From: Laverne Henderson Date: Tue, 23 Aug 2022 10:06:46 -0700 Subject: [PATCH 222/230] Bring back access app state (#14258) * Recreated the access_app_state file * Update the site's TOC to include the file * Update code sample file path * Minor formatting update Co-authored-by: Jirka Borovec Co-authored-by: Sherin Thomas --- docs/source-app/index.rst | 1 + .../access_app_state/access_app_state.rst | 59 +++++++++++++++++++ 2 files changed, 60 insertions(+) create mode 100644 docs/source-app/workflows/access_app_state/access_app_state.rst diff --git a/docs/source-app/index.rst b/docs/source-app/index.rst index 10a65db660d7f..6c701ffd574d9 100644 --- a/docs/source-app/index.rst +++ b/docs/source-app/index.rst @@ -216,6 +216,7 @@ Keep Learning :maxdepth: 1 :caption: How to... + Access the App State Add a web user interface (UI) Add a web link Arrange app tabs diff --git a/docs/source-app/workflows/access_app_state/access_app_state.rst b/docs/source-app/workflows/access_app_state/access_app_state.rst new file mode 100644 index 0000000000000..1e233281fb25c --- /dev/null +++ b/docs/source-app/workflows/access_app_state/access_app_state.rst @@ -0,0 +1,59 @@ +.. _access_app_state: + +################ +Access App State +################ + +**Audience:** Users who want to know how the App State can be accessed. + +**Level:** Basic + +********************** +What is the App State? 
+**********************
+
+In Lightning, each component is stateful and its state is composed of all attributes defined within its **__init__** method.
+
+The **App State** is the collection of all the components' states forming the App.
+
+************************************
+What is special about the App State?
+************************************
+
+The **App State** is always up-to-date, even when running an App in the cloud on multiple machines.
+This means that every time an attribute is modified in a Work, that information is automatically
+broadcast to the Flow. With this mechanism, any Component can **react** to any other
+Component's **state changes** through the Flow, and complex systems can be easily implemented.
+Lightning requires a state-driven mindset when implementing the Flow.
+
+***************************************
+When do I need to access the App State?
+***************************************
+
+As a user, you are interacting with your component attributes, so most likely,
+you won't need to access the Component's state directly, but it can be helpful to
+understand how the state works under the hood.
+
+For example, here we define a **Flow** component and a **Work** component, where the Work increments a counter indefinitely and the Flow prints its state, which contains the Work.
+
+You can easily check the state of your entire App as follows:
+
+.. literalinclude:: ../../code_samples/quickstart/app_01.py
+
+Run the App with:
+
+.. code-block:: bash
+
+    lightning run app docs/quickstart/app_01.py
+
+And here's the output you get when running the App using the **Lightning CLI**:
+
+.. code-block:: console
+
+    INFO: Your app has started. View it in your browser: http://127.0.0.1:7501/view
+    State: {'works': {'w': {'vars': {'counter': 1}}}}
+    State: {'works': {'w': {'vars': {'counter': 2}}}}
+    State: {'works': {'w': {'vars': {'counter': 3}}}}
+    State: {'works': {'w': {'vars': {'counter': 3}}}}
+    State: {'works': {'w': {'vars': {'counter': 4}}}}
+    ...
From 8ff2e01025e45577bb9121645cb2ee90c311141b Mon Sep 17 00:00:00 2001
From: Laverne Henderson
Date: Tue, 23 Aug 2022 10:31:18 -0700
Subject: [PATCH 223/230] Update for M1 Mac installations (#14350)

* Update for M1 Mac installations

* Apply suggestions from code review

* Update PL installation

* Update based on feedback

Co-authored-by: Jirka Borovec
---
 docs/source-app/installation.rst             |  2 +-
 docs/source-app/installation_mac.rst         | 22 +++++++++++++++++++
 docs/source-pytorch/starter/installation.rst |  2 ++
 .../starter/installation_mac.rst             | 22 +++++++++++++++++++
 4 files changed, 47 insertions(+), 1 deletion(-)
 create mode 100644 docs/source-app/installation_mac.rst
 create mode 100644 docs/source-pytorch/starter/installation_mac.rst

diff --git a/docs/source-app/installation.rst b/docs/source-app/installation.rst
index 1828e30cb49c4..340fd86da802a 100644
--- a/docs/source-app/installation.rst
+++ b/docs/source-app/installation.rst
@@ -13,7 +13,7 @@ Don't know what this is? Follow our `beginner guide here `
 
 * Python 3.8.x or later (3.8.x, 3.9.x, 3.10.x)
 
-Or read the `Windows installation article `_.
+Or read the `Apple Silicon Macs installation article `_ or the `Windows installation article `_.
---- diff --git a/docs/source-app/installation_mac.rst b/docs/source-app/installation_mac.rst new file mode 100644 index 0000000000000..180a3a88936ff --- /dev/null +++ b/docs/source-app/installation_mac.rst @@ -0,0 +1,22 @@ +:orphan: + +################################## +Installation on Apple Silicon Macs +################################## + +Apple Silicon (M1, M2, M3) Mac environments need a bit of tweaking before you install. + +---- + +**************** +Install with pip +**************** + +Install the ``lightning`` package + + .. code:: bash + + export GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1 + export GRPC_PYTHON_BUILD_SYSTEM_ZLIB=1 + + pip install lightning diff --git a/docs/source-pytorch/starter/installation.rst b/docs/source-pytorch/starter/installation.rst index c5f7760945fce..6c9fa47c724a5 100644 --- a/docs/source-pytorch/starter/installation.rst +++ b/docs/source-pytorch/starter/installation.rst @@ -19,6 +19,8 @@ Now you can install using `pip `_ u pip install pytorch-lightning +Or read the `Apple Silicon Macs installation article `_. + -------------- ****************** diff --git a/docs/source-pytorch/starter/installation_mac.rst b/docs/source-pytorch/starter/installation_mac.rst new file mode 100644 index 0000000000000..180a3a88936ff --- /dev/null +++ b/docs/source-pytorch/starter/installation_mac.rst @@ -0,0 +1,22 @@ +:orphan: + +################################## +Installation on Apple Silicon Macs +################################## + +Apple Silicon (M1, M2, M3) Mac environments need a bit of tweaking before you install. + +---- + +**************** +Install with pip +**************** + +Install the ``lightning`` package + + .. code:: bash + + export GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1 + export GRPC_PYTHON_BUILD_SYSTEM_ZLIB=1 + + pip install lightning From 9040b1ed2c49c613f4eaa400be1052a9a842f110 Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 23 Aug 2022 19:31:36 +0200 Subject: [PATCH 224/230] CI: enable testing of react e2e (#14364) enable testing of react e2e --- .azure/app-cloud-e2e.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.azure/app-cloud-e2e.yml b/.azure/app-cloud-e2e.yml index d5aeee88c020a..c7262c2bc5c61 100644 --- a/.azure/app-cloud-e2e.yml +++ b/.azure/app-cloud-e2e.yml @@ -36,8 +36,8 @@ jobs: name: "boring_app" 'App: template_streamlit_ui': name: "template_streamlit_ui" -# 'App: template_react_ui': # TODO: clarify visibility private/public -# name: "template_react_ui" + 'App: template_react_ui': + name: "template_react_ui" 'App: template_jupyterlab': # TODO: clarify where these files lives name: "template_jupyterlab" 'App: idle_timeout': From 5661458bf475a2f4ff895a78fe79b4c8c466d1c9 Mon Sep 17 00:00:00 2001 From: Rohit Gupta Date: Wed, 24 Aug 2022 00:58:53 +0530 Subject: [PATCH 225/230] Fix make test (#14273) --- Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index aae9d77bb0594..c434fc19c99f6 100644 --- a/Makefile +++ b/Makefile @@ -4,6 +4,8 @@ export SLURM_LOCALID=0 # assume you have installed need packages export SPHINX_MOCK_REQUIREMENTS=1 +# install only Lightning Trainer packages +export PACKAGE_NAME=pytorch clean: # clean all temp runs @@ -31,7 +33,7 @@ test: clean pip install -e . 
-r requirements/pytorch/devel.txt pip install -r requirements/pytorch/strategies.txt # run tests with coverage - python -m coverage run --source pytorch_lightning -m pytest pytorch_lightning tests -v + python -m coverage run --source src/pytorch_lightning -m pytest src/pytorch_lightning tests/tests_pytorch -v python -m coverage report docs: clean From 2182d755c47266c3f3cd74d8a457947dd313e030 Mon Sep 17 00:00:00 2001 From: Dmitry Frolov Date: Tue, 23 Aug 2022 15:48:22 -0400 Subject: [PATCH 226/230] [CLI] Adding opportunity to see basic cluster logs (#14334) * pinning starsessions * pinning starsessions * adding strict back to requirements.txt * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Duplicated * Basic implementation * Basic implementation * Basic implementation * Basic implementation * Common things moved to log helpers file * Decomposing logs reader classes for reusing * Setting colors for log levels * Manifest trimming * Changes added to CHANGELOG * Prettifications * Prettifications * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Logs function name change * Logs function name change * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * attempt to fix the pydanitc import * Tests + command name fixes * Extending tests * Adding limit argument * Unmerging CI fix * Unmerging CI fix * Adding fields for errors * Adding log level fixed field width * Adding absent typing + exeptions raising * Adding socket error logging * Addressing comments on cluster list function return value * Addressing comments on adding e2e tests * Adding version range for arrow package in reqs * New unit tests * arrow time parsing callback modified + unit tests * helpers updated * helpers updated * helpers updated * One more test * CMD test fix * CMD test fix * CMD test fix * CMD test fix * CMD test fix * LightningClient mocking * Flaky test removed Co-authored-by: hhsecond Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Jirka Borovec --- requirements/app/base.txt | 1 + src/lightning_app/CHANGELOG.md | 3 + src/lightning_app/cli/cmd_clusters.py | 8 +- src/lightning_app/cli/lightning_cli.py | 92 +++++++++++++++ src/lightning_app/utilities/app_logs.py | 28 +---- src/lightning_app/utilities/cli_helpers.py | 14 +++ src/lightning_app/utilities/cluster_logs.py | 108 ++++++++++++++++++ src/lightning_app/utilities/log_helpers.py | 34 ++++++ .../utilities/logs_socket_api.py | 84 +++++++++++++- tests/tests_app/cli/test_cli.py | 10 ++ tests/tests_app/utilities/test_cli_helpers.py | 36 +++++- tests/tests_app/utilities/test_log_helpers.py | 23 ++++ .../utilities/test_logs_socket_api.py | 32 ++++++ tests/tests_clusters/test_cluster_logs.py | 103 +++++++++++++++++ 14 files changed, 546 insertions(+), 30 deletions(-) create mode 100644 src/lightning_app/utilities/cluster_logs.py create mode 100644 src/lightning_app/utilities/log_helpers.py create mode 100644 tests/tests_app/utilities/test_log_helpers.py create mode 100644 tests/tests_app/utilities/test_logs_socket_api.py create mode 100644 tests/tests_clusters/test_cluster_logs.py diff --git a/requirements/app/base.txt b/requirements/app/base.txt index 4a0ee3a90a95e..d40fe4ef8a85e 100644 --- a/requirements/app/base.txt +++ b/requirements/app/base.txt @@ -6,3 +6,4 @@ fsspec>=2022.01.0, <=2022.7.1 s3fs>=2022.1.0, <=2022.7.1 croniter # for now until we found 
something more robust. traitlets<5.2.0 # Traitlets 5.2.X fails: https://github.com/ipython/traitlets/issues/741 +arrow>=1.2.0, <=1.2.2 diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index 6b32fde4cc60b..cebbd1212d56a 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -14,6 +14,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Add support for Lightning AI BYOC cluster management ([#13835](https://github.com/Lightning-AI/lightning/pull/13835)) +- Add support to see Lightning AI BYOC cluster logs ([#14334](https://github.com/Lightning-AI/lightning/pull/14334)) + + - Add support to run Lightning apps on Lightning AI BYOC clusters ([#13894](https://github.com/Lightning-AI/lightning/pull/13894)) diff --git a/src/lightning_app/cli/cmd_clusters.py b/src/lightning_app/cli/cmd_clusters.py index 7acdc9b63022d..371aaa0f76415 100644 --- a/src/lightning_app/cli/cmd_clusters.py +++ b/src/lightning_app/cli/cmd_clusters.py @@ -93,10 +93,14 @@ def create( click.echo(f"${resp.id} cluster is ${resp.status.phase}") - def list(self): + def get_clusters(self): resp = self.api_client.cluster_service_list_clusters(phase_not_in=[V1ClusterState.DELETED]) + return ClusterList(resp.clusters) + + def list(self): + clusters = self.get_clusters() console = Console() - console.print(ClusterList(resp.clusters).as_table()) + console.print(clusters.as_table()) def delete(self, cluster_id: str = None, force: bool = False, wait: bool = False): if force: diff --git a/src/lightning_app/cli/lightning_cli.py b/src/lightning_app/cli/lightning_cli.py index 876a663913a73..d590cbc667f8a 100644 --- a/src/lightning_app/cli/lightning_cli.py +++ b/src/lightning_app/cli/lightning_cli.py @@ -5,6 +5,7 @@ from pathlib import Path from typing import List, Tuple, Union +import arrow import click import requests import rich @@ -13,6 +14,7 @@ from lightning_app import __version__ as ver from lightning_app.cli import cmd_init, cmd_install, cmd_pl_init, cmd_react_ui_init +from lightning_app.cli.cmd_clusters import AWSClusterManager from lightning_app.cli.lightning_cli_create import create from lightning_app.cli.lightning_cli_delete import delete from lightning_app.cli.lightning_cli_list import get_list @@ -21,10 +23,12 @@ from lightning_app.runners.runtime_type import RuntimeType from lightning_app.utilities.app_logs import _app_logs_reader from lightning_app.utilities.cli_helpers import ( + _arrow_time_callback, _format_input_env_variables, _retrieve_application_url_and_available_commands, ) from lightning_app.utilities.cloud import _get_project +from lightning_app.utilities.cluster_logs import _cluster_logs_reader from lightning_app.utilities.enum import OpenAPITags from lightning_app.utilities.install_components import register_all_external_components from lightning_app.utilities.login import Auth @@ -141,6 +145,94 @@ def logs(app_name: str, components: List[str], follow: bool) -> None: rich.print(f"[{color}]{log_event.component_name}[/{color}] {date} {log_event.message}") +@show.group() +def cluster(): + """Groups cluster commands inside show.""" + pass + + +@cluster.command(name="logs") +@click.argument("cluster_name", required=True) +@click.option( + "--from", + "from_time", + default="24 hours ago", + help="The starting timestamp to query cluster logs from. Human-readable (e.g. '48 hours ago') or ISO 8601 " + "(e.g. 
'2022-08-23 12:34') formats.", + callback=_arrow_time_callback, +) +@click.option( + "--to", + "to_time", + default="0 seconds ago", + callback=_arrow_time_callback, + help="The end timestamp / relative time increment to query logs for. This is ignored when following logs (with " + "-f/--follow). The same format as --from option has.", +) +@click.option("--limit", default=1000, help="The max number of log lines returned.") +@click.option("-f", "--follow", required=False, is_flag=True, help="Wait for new logs, to exit use CTRL+C.") +def cluster_logs(cluster_name: str, to_time: arrow.Arrow, from_time: arrow.Arrow, limit: int, follow: bool) -> None: + """Show cluster logs. + + Example uses: + + Print cluster logs: + + $ lightning show cluster logs my-cluster + + + Print cluster logs and wait for new logs: + + $ lightning show cluster logs my-cluster --follow + + + Print cluster logs, from 48 hours ago to now: + + $ lightning show cluster logs my-cluster --from "48 hours ago" + + + Print cluster logs, 10 most recent lines: + + $ lightning show cluster logs my-cluster --limit 10 + """ + + client = LightningClient() + cluster_manager = AWSClusterManager() + existing_cluster_list = cluster_manager.get_clusters() + + clusters = {cluster.name: cluster.id for cluster in existing_cluster_list.clusters} + + if not clusters: + raise click.ClickException("You don't have any clusters.") + + if not cluster_name: + raise click.ClickException( + f"You have not specified any clusters. Please select one of available: [{', '.join(clusters.keys())}]" + ) + + if cluster_name not in clusters: + raise click.ClickException( + f"The cluster '{cluster_name}' does not exist." + f" Please select one of the following: [{', '.join(clusters.keys())}]" + ) + + log_reader = _cluster_logs_reader( + client=client, + cluster_id=clusters[cluster_name], + start=from_time.int_timestamp, + end=to_time.int_timestamp, + limit=limit, + follow=follow, + ) + + colors = {"error": "red", "warn": "yellow", "info": "green"} + + for log_event in log_reader: + date = log_event.timestamp.strftime("%m/%d/%Y %H:%M:%S") + color = colors.get(log_event.labels.level, "green") + rich.print(f"[{color}]{log_event.labels.level:5}[/{color}] {date} {log_event.message.rstrip()}") + + @_main.command() def login(): """Log in to your lightning.ai account.""" diff --git a/src/lightning_app/utilities/app_logs.py b/src/lightning_app/utilities/app_logs.py index 536fbaae05093..30533902cdeb8 100644 --- a/src/lightning_app/utilities/app_logs.py +++ b/src/lightning_app/utilities/app_logs.py @@ -1,15 +1,14 @@ import json import queue -import sys from dataclasses import dataclass -from datetime import datetime, timedelta -from json import JSONDecodeError +from datetime import timedelta from threading import Thread from typing import Callable, Iterator, List, Optional import dateutil.parser from websocket import WebSocketApp +from lightning_app.utilities.log_helpers import _error_callback, _OrderedLogEntry from lightning_app.utilities.logs_socket_api import _LightningLogsSocketAPI from lightning_app.utilities.network import LightningClient @@ -27,18 +26,10 @@ class _LogEventLabels: @dataclass -class _LogEvent: - message: str - timestamp: datetime +class _LogEvent(_OrderedLogEntry): component_name: str labels: _LogEventLabels - def __ge__(self, other: "_LogEvent") -> bool: - return self.timestamp >= other.timestamp - - def __gt__(self, other: "_LogEvent") -> bool: - return self.timestamp > other.timestamp - def _push_log_events_to_read_queue_callback(component_name: 
str, read_queue: queue.PriorityQueue): """Pushes _LogEvents from websocket to read_queue. @@ -65,17 +56,6 @@ def callback(ws_app: WebSocketApp, msg: str): return callback -def _error_callback(ws_app: WebSocketApp, error: Exception): - errors = { - KeyError: "Malformed log message, missing key", - JSONDecodeError: "Malformed log message", - TypeError: "Malformed log format", - ValueError: "Malformed date format", - } - print(f"Error while reading logs ({errors.get(type(error), 'Unknown')})", file=sys.stderr) - ws_app.close() - - def _app_logs_reader( client: LightningClient, project_id: str, @@ -127,7 +107,7 @@ def _app_logs_reader( pass except KeyboardInterrupt: - # User pressed CTRL+C to exit, we sould respect that + # User pressed CTRL+C to exit, we should respect that pass finally: diff --git a/src/lightning_app/utilities/cli_helpers.py b/src/lightning_app/utilities/cli_helpers.py index 6000114c3d4d6..068024b783bd5 100644 --- a/src/lightning_app/utilities/cli_helpers.py +++ b/src/lightning_app/utilities/cli_helpers.py @@ -1,6 +1,8 @@ import re from typing import Dict, Optional +import arrow +import click import requests from lightning_app.core.constants import APP_SERVER_PORT @@ -119,3 +121,15 @@ def _retrieve_application_url_and_available_commands(app_id_or_name_or_url: Opti raise Exception(f"The server didn't process the request properly. Found {resp.json()}") return lightningapp.status.url, _extract_command_from_openapi(resp.json()) return None, None + + +def _arrow_time_callback( + _ctx: "click.core.Context", _param: "click.core.Option", value: str, arw_now=arrow.utcnow() +) -> arrow.Arrow: + try: + return arw_now.dehumanize(value) + except ValueError: + try: + return arrow.get(value) + except (ValueError, TypeError): + raise click.ClickException(f"cannot parse time {value}") diff --git a/src/lightning_app/utilities/cluster_logs.py b/src/lightning_app/utilities/cluster_logs.py new file mode 100644 index 0000000000000..4a9bf1ba9148b --- /dev/null +++ b/src/lightning_app/utilities/cluster_logs.py @@ -0,0 +1,108 @@ +import json +import queue +from dataclasses import dataclass +from threading import Thread +from typing import Callable, Iterator, Optional + +import arrow +import dateutil.parser +from websocket import WebSocketApp + +from lightning_app.utilities.log_helpers import _error_callback, _OrderedLogEntry +from lightning_app.utilities.logs_socket_api import _ClusterLogsSocketAPI +from lightning_app.utilities.network import LightningClient + + +@dataclass +class _ClusterLogEventLabels: + cluster_id: str + grid_url: str + hostname: str + level: str + logger: str + path: Optional[str] = None + workspace: Optional[str] = None + identifier: Optional[str] = None + issuer: Optional[str] = None + error: Optional[str] = None + errorVerbose: Optional[str] = None + + +@dataclass +class _ClusterLogEvent(_OrderedLogEntry): + labels: _ClusterLogEventLabels + + +def _push_log_events_to_read_queue_callback(read_queue: queue.PriorityQueue): + """Pushes _LogEvents from websocket to read_queue. + + Returns callback function used with `on_message_callback` of websocket.WebSocketApp. 
+ """ + + def callback(ws_app: WebSocketApp, msg: str): + # We strongly trust that the contract on API will hold atm :D + event_dict = json.loads(msg) + labels = _ClusterLogEventLabels(**event_dict["labels"]) + + if "message" in event_dict: + message = event_dict["message"] + timestamp = dateutil.parser.isoparse(event_dict["timestamp"]) + event = _ClusterLogEvent( + message=message, + timestamp=timestamp, + labels=labels, + ) + read_queue.put(event) + + return callback + + +def _cluster_logs_reader( + client: LightningClient, + cluster_id: str, + start: arrow.Arrow, + end: arrow.Arrow, + limit: int, + follow: bool, + on_error_callback: Optional[Callable] = None, +) -> Iterator[_ClusterLogEvent]: + + logs_api_client = _ClusterLogsSocketAPI(client.api_client) + read_queue = queue.PriorityQueue() + + # We will use a socket inside a thread to read logs, + # to follow our typical reading pattern + log_socket = logs_api_client.create_cluster_logs_socket( + cluster_id=cluster_id, + start=start, + end=end, + limit=limit, + on_message_callback=_push_log_events_to_read_queue_callback(read_queue), + on_error_callback=on_error_callback or _error_callback, + ) + + log_thread = Thread(target=log_socket.run_forever) + + # Establish connection and begin pushing logs to the print queue + log_thread.start() + + # Print logs from queue when log event is available + try: + while True: + log_event = read_queue.get(timeout=None if follow else 1.0) + yield log_event + + except queue.Empty: + # Empty is raised by queue.get if timeout is reached. Follow = False case. + pass + + except KeyboardInterrupt: + # User pressed CTRL+C to exit, we should respect that + pass + + finally: + # Close connection - it will cause run_forever() to finish -> thread as finishes as well + log_socket.close() + + # The socket was closed, we can just wait for thread to finish. + log_thread.join() diff --git a/src/lightning_app/utilities/log_helpers.py b/src/lightning_app/utilities/log_helpers.py new file mode 100644 index 0000000000000..5938c443ae031 --- /dev/null +++ b/src/lightning_app/utilities/log_helpers.py @@ -0,0 +1,34 @@ +import logging +from dataclasses import dataclass +from datetime import datetime +from json import JSONDecodeError + +from websocket import WebSocketApp + +logger = logging.getLogger(__name__) + + +# This is a superclass to inherit log entry classes from it: +# it implements magic methods to sort logs by timestamps. +@dataclass +class _OrderedLogEntry: + message: str + timestamp: datetime + + def __ge__(self, other: "_OrderedLogEntry") -> bool: + return self.timestamp >= other.timestamp + + def __gt__(self, other: "_OrderedLogEntry") -> bool: + return self.timestamp > other.timestamp + + +# A general error callback for log reading, prints most common types of possible errors. 
+def _error_callback(ws_app: WebSocketApp, error: Exception): + errors = { + KeyError: "Malformed log message, missing key", + JSONDecodeError: "Malformed log message", + TypeError: "Malformed log format", + ValueError: "Malformed date format", + } + logger.error(f"⚡ Error while reading logs ({errors.get(type(error), 'Unknown')}), {error}") + ws_app.close() diff --git a/src/lightning_app/utilities/logs_socket_api.py b/src/lightning_app/utilities/logs_socket_api.py index 0ab9a5c24f3e5..28569a4879134 100644 --- a/src/lightning_app/utilities/logs_socket_api.py +++ b/src/lightning_app/utilities/logs_socket_api.py @@ -7,7 +7,9 @@ from lightning_app.utilities.login import Auth -class _LightningLogsSocketAPI: +# This class joins common things for reading logs, +# initialization and getting API token +class _LogsSocketAPI: def __init__(self, api_client: ApiClient): self.api_client = api_client self._auth = Auth() @@ -23,8 +25,10 @@ def _get_api_token(self) -> str: ) return token_resp.token + +class _LightningLogsSocketAPI(_LogsSocketAPI): @staticmethod - def _socket_url(host: str, project_id: str, app_id: str, token: str, component: str) -> str: + def _app_logs_socket_url(host: str, project_id: str, app_id: str, token: str, component: str) -> str: return ( f"wss://{host}/v1/projects/{project_id}/appinstances/{app_id}/logs?" f"token={token}&component={component}&follow=true" @@ -84,7 +88,7 @@ def print_log_msg(ws_app, msg): """ _token = self._get_api_token() clean_ws_host = urlparse(self.api_client.configuration.host).netloc - socket_url = self._socket_url( + socket_url = self._app_logs_socket_url( host=clean_ws_host, project_id=project_id, app_id=app_id, @@ -93,3 +97,77 @@ def print_log_msg(ws_app, msg): ) return WebSocketApp(socket_url, on_message=on_message_callback, on_error=on_error_callback) + + +class _ClusterLogsSocketAPI(_LogsSocketAPI): + @staticmethod + def _cluster_logs_socket_url(host: str, cluster_id: str, start: int, end: int, limit: int, token: str) -> str: + return ( + f"wss://{host}/v1/core/clusters/{cluster_id}/logs?" + f"start={start}&end={end}&token={token}&limit={limit}" + f"&follow=true" + ) + + def create_cluster_logs_socket( + self, + cluster_id: str, + start: int, # unix timestamp + end: int, # unix timestamp + limit: int, + on_message_callback: Callable[[WebSocketApp, str], None], + on_error_callback: Optional[Callable[[Exception, str], None]] = None, + ) -> WebSocketApp: + """Creates and returns WebSocketApp to listen to cluster logs. + + .. code-block:: python + # Synchronous reading, run_forever() is blocking + + + def print_log_msg(ws_app, msg): + print(msg) + + + logs_socket = client.create_cluster_logs_socket("cluster_id", 1661100000, 1661101000, print_log_msg) + logs_socket.run_forever() + + .. code-block:: python + # Asynchronous reading (with Threads) + + + def print_log_msg(ws_app, msg): + print(msg) + + + logs_socket = client.create_cluster_logs_socket("cluster_id", 1661100000, 1661101000, print_log_msg) + + logs_thread = Thread(target=cluster_logs_socket.run_forever) + + logs_thread.start() + # ....... + + + logs_socket.close() + + Arguments: + cluster_id: Project ID. + start: Starting timestamp to query cluster logs from. + end: Ending timestamp to query cluster logs to. + limit: A maximal number of log lines to get. + on_message_callback: Callback object which is called when received data. + on_error_callback: Callback object which is called when we get error. 
+ + Returns: + WebSocketApp of the wanted socket + """ + _token = self._get_api_token() + clean_ws_host = urlparse(self.api_client.configuration.host).netloc + socket_url = self._cluster_logs_socket_url( + host=clean_ws_host, + cluster_id=cluster_id, + token=_token, + start=start, + limit=limit, + end=end, + ) + + return WebSocketApp(socket_url, on_message=on_message_callback, on_error=on_error_callback) diff --git a/tests/tests_app/cli/test_cli.py b/tests/tests_app/cli/test_cli.py index 428ba0e535328..1edc9d384cf30 100644 --- a/tests/tests_app/cli/test_cli.py +++ b/tests/tests_app/cli/test_cli.py @@ -57,6 +57,7 @@ def test_main_lightning_cli_help(): assert "list " in res assert "delete " in res assert "create " in res + assert "show " in res res = os.popen("python -m lightning run --help").read() assert "app " in res @@ -67,6 +68,15 @@ def test_main_lightning_cli_help(): assert "work" not in res assert "frontend" not in res + # inspect show group + res = os.popen("python -m lightning show --help").read() + assert "logs " in res + assert "cluster " in res + + # inspect show cluster group + res = os.popen("python -m lightning show cluster --help").read() + assert "logs " in res + @mock.patch("lightning_cloud.login.Auth.authenticate", MagicMock()) @mock.patch("lightning_app.cli.cmd_clusters.AWSClusterManager.create") diff --git a/tests/tests_app/utilities/test_cli_helpers.py b/tests/tests_app/utilities/test_cli_helpers.py index 575802da7b43d..4711ffeddbfde 100644 --- a/tests/tests_app/utilities/test_cli_helpers.py +++ b/tests/tests_app/utilities/test_cli_helpers.py @@ -1,6 +1,9 @@ +from unittest.mock import Mock + +import arrow import pytest -from lightning_app.utilities.cli_helpers import _format_input_env_variables +from lightning_app.utilities.cli_helpers import _arrow_time_callback, _format_input_env_variables def test_format_input_env_variables(): @@ -28,3 +31,34 @@ def test_format_input_env_variables(): _format_input_env_variables(("*FOO#=bar",)) assert _format_input_env_variables(("FOO=bar", "BLA=bloz")) == {"FOO": "bar", "BLA": "bloz"} + + +def test_arrow_time_callback(): + # Check ISO 8601 variations + assert _arrow_time_callback(Mock(), Mock(), "2022.08.23") == arrow.Arrow(2022, 8, 23) + + assert _arrow_time_callback(Mock(), Mock(), "2022.08.23 12:34") == arrow.Arrow(2022, 8, 23, 12, 34) + + assert _arrow_time_callback(Mock(), Mock(), "2022-08-23 12:34") == arrow.Arrow(2022, 8, 23, 12, 34) + + assert _arrow_time_callback(Mock(), Mock(), "2022-08-23 12:34:00.000") == arrow.Arrow(2022, 8, 23, 12, 34) + + # Just check humanized format is parsed + assert type(_arrow_time_callback(Mock(), Mock(), "48 hours ago")) == arrow.Arrow + + assert type(_arrow_time_callback(Mock(), Mock(), "60 minutes ago")) == arrow.Arrow + + assert type(_arrow_time_callback(Mock(), Mock(), "120 seconds ago")) == arrow.Arrow + + # Check raising errors + with pytest.raises(Exception, match="cannot parse time Mon"): + _arrow_time_callback(Mock(), Mock(), "Mon") + + with pytest.raises(Exception, match="cannot parse time Mon Sep 08 16:41:45 2022"): + _arrow_time_callback(Mock(), Mock(), "Mon Sep 08 16:41:45 2022") + + with pytest.raises(Exception, match="cannot parse time 2022.125.12"): + _arrow_time_callback(Mock(), Mock(), "2022.125.12") + + with pytest.raises(Exception, match="cannot parse time 1 time unit ago"): + _arrow_time_callback(Mock(), Mock(), "1 time unit ago") diff --git a/tests/tests_app/utilities/test_log_helpers.py b/tests/tests_app/utilities/test_log_helpers.py new file mode 100644 index 
0000000000000..df580a467ac33 --- /dev/null +++ b/tests/tests_app/utilities/test_log_helpers.py @@ -0,0 +1,23 @@ +from unittest import mock, TestCase + +from lightning_app.utilities.log_helpers import _error_callback + + +class TestErrorCallback(TestCase): + def test_known_error(self): + websocket = mock.Mock() + with self.assertLogs("lightning_app.utilities.log_helpers") as captured: + _error_callback(websocket, ValueError()) + # check that there is only one log message + self.assertEqual(len(captured.records), 1) + # and it contains the error message expected + self.assertIn("Error while reading logs (Malformed date format)", captured.records[0].getMessage()) + + def test_unknown_error(self): + websocket = mock.Mock() + with self.assertLogs("lightning_app.utilities.log_helpers") as captured: + _error_callback(websocket, IOError()) + # check that there is only one log message + self.assertEqual(len(captured.records), 1) + # and it contains the error message expected + self.assertIn("Error while reading logs (Unknown)", captured.records[0].getMessage()) diff --git a/tests/tests_app/utilities/test_logs_socket_api.py b/tests/tests_app/utilities/test_logs_socket_api.py new file mode 100644 index 0000000000000..9fc6e2ee4086b --- /dev/null +++ b/tests/tests_app/utilities/test_logs_socket_api.py @@ -0,0 +1,32 @@ +from unittest import mock + +from lightning_app.utilities.logs_socket_api import _ClusterLogsSocketAPI + + +def test_cluster_logs_socket_api(): + websocket_url = _ClusterLogsSocketAPI._cluster_logs_socket_url( + "example.org", "my-cluster", 1661100000, 1661101000, 10, "TOKEN" + ) + + assert ( + websocket_url == "wss://example.org/v1/core/clusters/my-cluster/logs?start=1661100000&end=1661101000" + "&token=TOKEN&limit=10&follow=true" + ) + + api_client = mock.Mock() + api_client.configuration.host = "https://example.com" + api_client.call_api.return_value.token = "TOKEN" + cluster_logs_api = _ClusterLogsSocketAPI(api_client) + + def on_message_func(): + return None + + web_socket_app = cluster_logs_api.create_cluster_logs_socket( + "my-cluster", 1661100000, 1661101000, 10, on_message_func + ) + + assert ( + web_socket_app.url == "wss://example.com/v1/core/clusters/my-cluster/logs?start=1661100000&end=1661101000" + "&token=TOKEN&limit=10&follow=true" + ) + assert web_socket_app.on_message == on_message_func diff --git a/tests/tests_clusters/test_cluster_logs.py b/tests/tests_clusters/test_cluster_logs.py new file mode 100644 index 0000000000000..4855288dcf1ea --- /dev/null +++ b/tests/tests_clusters/test_cluster_logs.py @@ -0,0 +1,103 @@ +import os +import random +import string + +import pytest + +from src.lightning_app.testing.testing import run_cli + + +@pytest.mark.cloud +@pytest.mark.skipif( + os.environ.get("LIGHTNING_BYOC_CLUSTER_NAME") is None, + reason="missing LIGHTNING_BYOC_CLUSTER_NAME environment variable", +) +def test_byoc_cluster_logs() -> None: + # Check a typical retrieving case + cluster_name = os.environ.get("LIGHTNING_BYOC_CLUSTER_NAME") + with run_cli( + [ + "show", + "cluster", + "logs", + cluster_name, + ] + ) as (stdout, stderr): + assert "info" in stdout, f"stdout: {stdout}\nstderr: {stderr}" + + # Check a retrieving case with a small number of lines limit + cluster_name = os.environ.get("LIGHTNING_BYOC_CLUSTER_NAME") + with run_cli( + [ + "show", + "cluster", + "logs", + cluster_name, + "--limit", + 10, + ] + ) as (stdout, stderr): + assert "info" in stdout, f"stdout: {stdout}\nstderr: {stderr}" + + # Time expanding doesn't break retrieving + with run_cli( + [ + 
"show", + "cluster", + "logs", + cluster_name, + "--limit", + 10, + "--from", + "48 hours ago", + ] + ) as (stdout, stderr): + assert "info" in stdout, f"stdout: {stdout}\nstderr: {stderr}" + + # Time expanding doesn't break retrieving + with run_cli( + [ + "show", + "cluster", + "logs", + cluster_name, + "--limit", + 10, + "--from", + "48 hours ago", + ] + ) as (stdout, stderr): + assert "info" in stdout, f"stdout: {stdout}\nstderr: {stderr}" + + # Try non-existing cluster + letters = string.ascii_letters + cluster_name = "".join(random.choice(letters) for i in range(10)) + with run_cli( + [ + "show", + "cluster", + "logs", + cluster_name, + ] + ) as (stdout, stderr): + assert "does not exist" in stdout, f"stdout: {stdout}\nstderr: {stderr}" + + +@pytest.mark.cloud +@pytest.mark.skipif( + os.environ.get("LIGHTNING_CLOUD_CLUSTER_NAME") is None, + reason="missing LIGHTNING_CLOUD_CLUSTER_NAME environment variable", +) +def test_lighting_cloud_logs() -> None: + # Check a retrieving case from lightning-cloud + # We shouldn't show lighting-cloud logs, therefore we expect to see an error here + cluster_name = os.environ.get("LIGHTNING_CLOUD_CLUSTER_NAME" "" "") + with run_cli( + [ + "show", + "cluster", + "logs", + cluster_name, + ] + ) as (stdout, stderr): + assert "Error while reading logs" in stdout, f"stdout: {stdout}\nstderr: {stderr}" From d8ba94b325967fc52231365fe92db76a1d63dc2b Mon Sep 17 00:00:00 2001 From: Jirka Borovec Date: Tue, 23 Aug 2022 23:43:27 +0200 Subject: [PATCH 227/230] release App 0.6.0 RC (#14370) * release App 0.6.0 RC * req --- requirements.txt | 1 + src/lightning_app/CHANGELOG.md | 26 -------------------------- src/lightning_app/__version__.py | 2 +- 3 files changed, 2 insertions(+), 27 deletions(-) diff --git a/requirements.txt b/requirements.txt index c832dedeff2c0..762f6611eda00 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ # the default package dependencies +-r ./requirements/app/base.txt -r ./requirements/pytorch/base.txt diff --git a/src/lightning_app/CHANGELOG.md b/src/lightning_app/CHANGELOG.md index cebbd1212d56a..1e74509e23ad9 100644 --- a/src/lightning_app/CHANGELOG.md +++ b/src/lightning_app/CHANGELOG.md @@ -9,49 +9,23 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Added - Add support for `Lightning App Commands` through the `configure_commands` hook on the Lightning Flow and the `ClientCommand` ([#13602](https://github.com/Lightning-AI/lightning/pull/13602)) - - - Add support for Lightning AI BYOC cluster management ([#13835](https://github.com/Lightning-AI/lightning/pull/13835)) - - - Add support to see Lightning AI BYOC cluster logs ([#14334](https://github.com/Lightning-AI/lightning/pull/14334)) - - - Add support to run Lightning apps on Lightning AI BYOC clusters ([#13894](https://github.com/Lightning-AI/lightning/pull/13894)) - - - Add support for listing Lightning AI apps ([#13987](https://github.com/Lightning-AI/lightning/pull/13987)) - - - Adds `LightningTrainingComponent`. 
`LightningTrainingComponent` orchestrates multi-node training in the cloud ([#13830](https://github.com/Lightning-AI/lightning/pull/13830)) - - - Add support for printing application logs using CLI `lightning show logs [components]` ([#13634](https://github.com/Lightning-AI/lightning/pull/13634)) - - - Add support for `Lightning API` through the `configure_api` hook on the Lightning Flow and the `Post`, `Get`, `Delete`, `Put` HttpMethods ([#13945](https://github.com/Lightning-AI/lightning/pull/13945)) - - - Added a warning when `configure_layout` returns URLs configured with http instead of https ([#14233](https://github.com/Lightning-AI/lightning/pull/14233)) - ### Changed - Default values and parameter names for Lightning AI BYOC cluster management ([#14132](https://github.com/Lightning-AI/lightning/pull/14132)) - ### Changed -- - - - Run the flow only if the state has changed from the previous execution ([#14076](https://github.com/Lightning-AI/lightning/pull/14076)) -### Deprecated - -- - - ### Fixed - Unification of app template: moved `app.py` to root dir for `lightning init app ` template ([#13853](https://github.com/Lightning-AI/lightning/pull/13853)) diff --git a/src/lightning_app/__version__.py b/src/lightning_app/__version__.py index 9059a6aeafd77..af4963b4c66b2 100644 --- a/src/lightning_app/__version__.py +++ b/src/lightning_app/__version__.py @@ -1 +1 @@ -version = "0.6.0dev" +version = "0.6.0rc0" From 2157a3b84d50c7e5c5310063a52aeed2efed0a0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlos=20Mochol=C3=AD?= Date: Wed, 24 Aug 2022 01:07:05 +0200 Subject: [PATCH 228/230] Add a required job checker as an action (1/2) (#14363) Co-authored-by: Jirka Borovec --- .github/checkgroup.yml | 41 ++++++++++++++++-------- .github/workflows/probot-check-group.yml | 15 +++++++++ 2 files changed, 43 insertions(+), 13 deletions(-) create mode 100644 .github/workflows/probot-check-group.yml diff --git a/.github/checkgroup.yml b/.github/checkgroup.yml index 4b67bc076b32d..ace786f8ac40a 100644 --- a/.github/checkgroup.yml +++ b/.github/checkgroup.yml @@ -1,4 +1,7 @@ custom_service_name: "Lightning CI required checker" +# For security reasons, configuration is only loaded from the repository's default branch, +# changes made in pull requests from different branches or forks are ignored. This means that changes to this file +# will only be used after they are merged. 
subprojects: - id: "CI: CircleCI" paths: @@ -6,15 +9,6 @@ subprojects: checks: - "test-on-tpus" - - id: "CI: Azure" - paths: - - ".azure/**" - checks: - - "pytorch-lightning (GPUs)" - - "pytorch-lightning (GPUs) (testing PyTorch - stable)" - - "pytorch-lightning (HPUs)" - - "pytorch-lightning (IPUs)" - - id: "pytorch_lightning" paths: # all examples don't need to be added because they aren't used in CI, but these are @@ -52,7 +46,6 @@ subprojects: - "mypy" - "PR Gatekeeper (pytorch)" - "pytorch-lightning (GPUs)" - - "pytorch-lightning (GPUs) (testing PyTorch - stable)" - "pytorch-lightning (HPUs)" - "pytorch-lightning (IPUs)" - "slow (macOS-11, 3.7, 1.11)" @@ -60,6 +53,25 @@ subprojects: - "slow (windows-2022, 3.7, 1.11)" - "test-on-tpus" + - id: "pytorch_lightning: Azure GPU" + paths: + - ".azure/gpu-tests.yml" + - "tests/tests_pytorch/run_standalone_*.sh" + checks: + - "pytorch-lightning (GPUs)" + + - id: "pytorch_lightning: Azure HPU" + paths: + - ".azure/hpu-tests.yml" + checks: + - "pytorch-lightning (HPUs)" + + - id: "pytorch_lightning: Azure IPU" + paths: + - ".azure/ipu-tests.yml" + checks: + - "pytorch-lightning (IPUs)" + - id: "pytorch_lightning: Docs" paths: - "docs/source-pytorch/**" @@ -73,7 +85,6 @@ subprojects: - id: "pytorch_lightning: Docker" paths: - "dockers/**" - - "!dockers/README.md" - "requirements.txt" - "requirements/*.txt" - "requirements/pytorch/*" @@ -108,12 +119,10 @@ subprojects: - id: "lightning_app" paths: - - ".azure/app-cloud-e2e.yml" - "requirements/app/**" - "src/lightning_app/**" - "tests/tests_app/**" - "tests/tests_app_examples/**" - - "tests/tests_clusters/**" # the examples are used in the app CI - "examples/app_*" checks: @@ -127,6 +136,12 @@ subprojects: - "pytest (windows-2022, 3.8, latest)" - "pytest (windows-2022, 3.8, oldest)" + - id: "lightning_app: Azure" + paths: + - ".azure/app-cloud-e2e.yml" + checks: + - "App.cloud-e2e" + - id: "lightning_app: Docs" paths: - "docs/source-app/**" diff --git a/.github/workflows/probot-check-group.yml b/.github/workflows/probot-check-group.yml new file mode 100644 index 0000000000000..1b37e19c819b6 --- /dev/null +++ b/.github/workflows/probot-check-group.yml @@ -0,0 +1,15 @@ +name: Probot + +on: + check_run: {} + pull_request: {} + issue_comment: {types: [created]} + +jobs: + required-jobs: + runs-on: ubuntu-latest + if: github.event_name != 'issue_comment' || contains(github.event.comment.body, '@probot pls') + steps: + - uses: carmocca/probot@v2 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} From 0bd5703b8135fb6541c50c490ad39d30544ce67b Mon Sep 17 00:00:00 2001 From: otaj <6065855+otaj@users.noreply.github.com> Date: Wed, 24 Aug 2022 08:59:49 +0000 Subject: [PATCH 229/230] [CI] Trick Bagua into installing appropriate wheel in GPU tests (#14380) Bagua trick needs to be replicated on everywhere applicable --- .azure/gpu-tests.yml | 3 ++- dockers/base-conda/Dockerfile | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.azure/gpu-tests.yml b/.azure/gpu-tests.yml index d3fb42d33d278..f19c5bafc7814 100644 --- a/.azure/gpu-tests.yml +++ b/.azure/gpu-tests.yml @@ -73,7 +73,8 @@ jobs: python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'horovod' not in line] ; open(fname, 'w').writelines(lines)" python -c "fname = 'requirements/pytorch/strategies.txt' ; lines = [line for line in open(fname).readlines() if 'bagua' not in line] ; open(fname, 'w').writelines(lines)" CUDA_VERSION_MM=$(python -c "import torch ; print(''.join(map(str, 
torch.version.cuda.split('.')[:2])))") - pip install "bagua-cuda$CUDA_VERSION_MM>=0.9.0" + CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [115,113,111,102] if $CUDA_VERSION_MM >= ver][0])") + pip install "bagua-cuda$CUDA_VERSION_BAGUA>=0.9.0" pip install -e .[strategies] pip install -U deepspeed # TODO: remove when docker images are upgraded pip install --requirement requirements/pytorch/devel.txt diff --git a/dockers/base-conda/Dockerfile b/dockers/base-conda/Dockerfile index 03d2fb547ba6d..d6bfeee90d561 100644 --- a/dockers/base-conda/Dockerfile +++ b/dockers/base-conda/Dockerfile @@ -141,8 +141,9 @@ RUN \ RUN \ # install Bagua CUDA_VERSION_MM=$(python -c "print(''.join('$CUDA_VERSION'.split('.')[:2]))") && \ - pip install "bagua-cuda$CUDA_VERSION_MM==0.9.0" && \ - python -c "import bagua_core; bagua_core.install_deps()" && \ + CUDA_VERSION_BAGUA=$(python -c "print([ver for ver in [115,113,111,102] if $CUDA_VERSION_MM >= ver][0])") && \ + pip install "bagua-cuda$CUDA_VERSION_BAGUA==0.9.0" && \ + if [[ "$CUDA_VERSION_MM" = "$CUDA_VERSION_BAGUA" ]]; then python -c "import bagua_core; bagua_core.install_deps()"; fi && \ python -c "import bagua; print(bagua.__version__)" RUN \ From 34f98836fb452674f43f66babb2325ec17e8e192 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adrian=20W=C3=A4lchli?= Date: Wed, 24 Aug 2022 15:24:24 +0200 Subject: [PATCH 230/230] Fix silent TPU CI failures (#14034) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Carlos Mocholí --- dockers/tpu-tests/tpu_test_cases.jsonnet | 1 + 1 file changed, 1 insertion(+) diff --git a/dockers/tpu-tests/tpu_test_cases.jsonnet b/dockers/tpu-tests/tpu_test_cases.jsonnet index 48536817920c9..98056674cdc0b 100644 --- a/dockers/tpu-tests/tpu_test_cases.jsonnet +++ b/dockers/tpu-tests/tpu_test_cases.jsonnet @@ -37,6 +37,7 @@ local tputests = base.BaseTest { export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}" export PL_RUN_TPU_TESTS=1 cd tests/tests_pytorch + set -e coverage run --source=pytorch_lightning -m pytest -vv --durations=0 ./ echo "\n||| Running standalone tests |||\n" bash run_standalone_tests.sh
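
Why the one-line `set -e` addition above fixes the silent TPU failures, as a brief illustrative sketch rather than part of the patch: a shell script normally exits with the status of its last command, so a pytest failure in the middle of this block was swallowed whenever the final step succeeded, and the CI job kept reporting green. With `set -e`, the first command that exits non-zero aborts the script, and that failing status is what CI sees. The commands below are copied from the patched script; only the comments are added:

    cd tests/tests_pytorch
    # Abort on the first failing command instead of only reporting
    # the exit status of the last one.
    set -e
    # If this pytest run fails, the script now stops here with a
    # non-zero exit code and the CI job is marked as failed.
    coverage run --source=pytorch_lightning -m pytest -vv --durations=0 ./
    echo "\n||| Running standalone tests |||\n"
    bash run_standalone_tests.sh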